def test_crawl_mock_pages_all_products_no_repetitions(self):
    mock_params = {
        '/produto_inicial/p': ('Pagina Inicial', 'Produto Inicial',
                               'produto_1/p', 'produto_2/p', 'produto_3/p'),
        '/produto_1/p': ('Pagina Produto 1', 'Produto 1',
                         'produto_4/p', 'produto_5/p', 'produto_6/p'),
        '/produto_2/p': ('Pagina Produto 2', 'Produto 2',
                         'produto_7/p', 'produto_8/p', 'produto_9/p'),
        '/produto_3/p': ('Pagina Produto 3', 'Produto 3',
                         'produto_10/p', 'produto_11/p', 'produto_12/p'),
    }
    with patch('crawler.get_page_contents', MockPageGenerator(mock_params)):
        crawler.main(['-d', '1', '-o', 'teste.csv', '/produto_inicial/p'])
    expected = [
        ['Produto Inicial', 'Pagina Inicial',
         'http://www.epocacosmeticos.com.br/produto_inicial/p'],
        ['Produto 1', 'Pagina Produto 1',
         'http://www.epocacosmeticos.com.br/produto_1/p'],
        ['Produto 2', 'Pagina Produto 2',
         'http://www.epocacosmeticos.com.br/produto_2/p'],
        ['Produto 3', 'Pagina Produto 3',
         'http://www.epocacosmeticos.com.br/produto_3/p'],
    ]
    self.assertEqual(expected, self.load_result_csv())
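# The tests in this group patch 'crawler.get_page_contents' with a
# MockPageGenerator, whose definition is not included in this excerpt.
# A minimal sketch, assuming each mock_params value is
# (page title, product name, *relative links) and that the crawler consumes
# raw HTML text; the markup below is illustrative, not the site's real layout:
class MockPageGenerator:
    """Callable test double that returns canned HTML for a known path."""

    def __init__(self, mock_params):
        self.mock_params = mock_params

    def __call__(self, path):
        title, product_name, *links = self.mock_params[path]
        anchors = ''.join(
            '<a href="http://www.epocacosmeticos.com.br/{0}"></a>'.format(link)
            for link in links)
        return ('<html><head><title>{0}</title></head>'
                '<body><div class="productName">{1}</div>{2}</body></html>'
                .format(title, product_name, anchors))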
def test_crawl_mock_pages_mixed_with_repetitions(self):
    mock_params = {
        '/pagina_inicial': ('Pagina Inicial', 'Produto Inicial',
                            'produto_1/p', 'pagina_2', 'produto_3/p'),
        '/produto_1/p': ('Pagina Produto 1', 'Produto 1',
                         'produto_3/p', 'pagina_5', 'pagina_6'),
        '/pagina_2': ('Pagina 2', 'Página 2',
                      'produto_1/p', 'pagina_5', 'produto_3/p'),
        '/produto_3/p': ('Pagina Produto 3', 'Produto 3',
                         'produto_1/p', 'pagina_5', 'pagina_6'),
        '/pagina_5': ('Pagina 5', 'Página 2',
                      'produto_1/p', 'pagina_5', 'produto_3/p'),
        '/pagina_6': ('Pagina 6', 'Página 2',
                      'produto_1/p', 'pagina_5', 'produto_3/p'),
    }
    with patch('crawler.get_page_contents', MockPageGenerator(mock_params)):
        crawler.main(['-d', '2', '-o', 'teste.csv', '/pagina_inicial'])
    expected = [
        ['Produto 1', 'Pagina Produto 1',
         'http://www.epocacosmeticos.com.br/produto_1/p'],
        ['Produto 3', 'Pagina Produto 3',
         'http://www.epocacosmeticos.com.br/produto_3/p'],
    ]
    self.assertEqual(expected, self.load_result_csv())
def test_crawl_lady_million(self):
    url = '/lady-million-eau-my-gold-eau-de-toilette-paco-rabanne-perfume-feminino/p'
    crawler.main(['-d', '0', '-o', 'teste.csv', url])
    expected = [[
        'Lady Million Eau my Gold Eau de Toilette Paco Rabanne - Perfume Feminino',
        'Perfume Lady Million Eau my Gold EDT Paco Rabanne Feminino - Época Cosméticos',
        'http://www.epocacosmeticos.com.br' + url
    ]]
    self.assertEqual(expected, self.load_result_csv())
def test_crawl_hypnose(self):
    url = '/hypnose-eau-de-toilette-lancome-perfume-feminino/p'
    crawler.main(['-d', '0', '-o', 'teste.csv', url])
    expected = [[
        'Hypnôse Eau de Toilette Lancôme - Perfume Feminino - 30ml',
        'Hypnôse Lancôme - Perfume Feminino - Época Cosméticos',
        'http://www.epocacosmeticos.com.br' + url
    ]]
    self.assertEqual(expected, self.load_result_csv())
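# All of these tests compare against self.load_result_csv(), a helper that is
# not shown in this excerpt. A minimal sketch of how it might look on the
# TestCase class, assuming the crawler writes plain comma-separated rows to
# the file passed via '-o':
import csv

def load_result_csv(self, path='teste.csv'):
    # Read the output file back as a list of [name, title, url] rows.
    with open(path, newline='', encoding='utf-8') as f:
        return list(csv.reader(f))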
def main():
    args = parse_options()
    if args.crawl or args.serve:
        import crawler
        crawler.main()
    if args.build or args.serve:
        import builder
        builder.main()
def main():
    if len(sys.argv) != 2:
        usage()
        sys.exit(1)
    if sys.argv[1] == 'serve':
        server.main()
    elif sys.argv[1] == 'crawl':
        crawler.main()
    else:
        usage()
        sys.exit(1)
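# usage() is referenced above but not defined in this snippet. A minimal
# sketch, assuming it only prints the accepted sub-commands:
def usage():
    # Print a one-line help message to standard error.
    print('usage: {} serve|crawl'.format(sys.argv[0]), file=sys.stderr)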
def test_crawler():
    """Run the crawler in test mode and validate the pickled repo list."""
    crawler.main(test=True)
    with open('repos_test.pickle', 'rb') as f:
        list_of_repos = pickle.load(f)
    assert len(list_of_repos) > 0, 'No repo saved in pickle file!'
    assert list_of_repos[0]['repo_url'] != '', 'Repo URL empty!'
    assert list_of_repos[0]['repo_name'] != '', 'Repo Name empty!'
    assert list_of_repos[0]['stars'] != '', 'Repo has no stars!'
    assert list_of_repos[0]['forks'] != '', 'Repo has no forks!'
    # Teardown
    os.remove('repos_test.pickle')
def test_crawl_mock_pages_no_product(self):
    mock_params = {
        '/pagina_inicial': ('Pagina Inicial', 'Produto Inicial',
                            'pagina_1', 'pagina_2', 'pagina_3'),
        '/pagina_1': ('Pagina Produto 1', 'Produto 1',
                      'pagina_4', 'pagina_5', 'pagina_6'),
        '/pagina_2': ('Pagina Produto 2', 'Produto 2',
                      'pagina_7', 'pagina_8', 'pagina_9'),
        '/pagina_3': ('Pagina Produto 3', 'Produto 3',
                      'pagina_10', 'pagina_11', 'pagina_12'),
    }
    with patch('crawler.get_page_contents', MockPageGenerator(mock_params)):
        crawler.main(['-d', '1', '-o', 'teste.csv', '/pagina_inicial'])
    expected = []
    self.assertEqual(expected, self.load_result_csv())
def main():
    # Positional arguments: gazetteer, dataset, annotated entities, vocabulary.
    gazetteer = sys.argv[1]
    dataset = sys.argv[2]
    annotatedEntities = sys.argv[3]
    vocabularyFile = sys.argv[4]
    crawler.main(gazetteer, dataset, annotatedEntities, vocabularyFile)


if __name__ == '__main__':
    main()
def main(url, settings):
    page = crawler.main(url, actions)[0]
    # with open('./html.txt', 'r') as f:
    #     page = f.read()
    scraper = scraping(page, settings)
    scraper.setStatus()
    scraper.displaySettings()
def main():
    for fn in map(str, sys.argv[1:]):
        read_and_save(fn)
    # if os.path.isfile("Output.txt"):
    #     uniqlines = set(open("Output.txt").readlines())
    #     fill = open("Output.txt", 'w').writelines(set(uniqlines))
    # Solution without messing with order:
    # lines_seen = set()  # holds lines already seen
    # outfile = open('Output.txt', "w")
    # for line in open('Output.txt', "r"):
    #     if line not in lines_seen:  # not a duplicate
    #         outfile.write(line)
    #         lines_seen.add(line)
    # outfile.close()
    crawler.main()
def scanner():
    form = Scanner()
    if form.validate_on_submit():
        flash('Scanning URL="%s"' % form.seed_url.data)
        o = urlparse(form.seed_url.data)
        if o.scheme == 'http' or o.scheme == 'https':
            flash('Valid URL !')
            obj = main(form.seed_url.data)
            # XSS_Module(form.seed_url.data, obj)
            SQL_Module(form.seed_url.data, obj)
        else:
            flash('Invalid URL!')
    return render_template('scanner.html', title='Scanner', form=form)
def scanner():
    form = Scanner()
    if form.validate_on_submit():
        flash('Seed URL="%s"' % form.seed_url.data)
        o = urlparse(form.seed_url.data)
        if o.scheme == 'http' or o.scheme == 'https':
            option = form.example1.data
            obj = main(form.seed_url.data)
            flash("Total # urls found: " + str(len(obj.getUrlList())))
            if len(option) == 2:
                SQL_Module(obj)
                XSS_Module(obj)
            elif len(option) == 1:
                if option[0] == 'XSS':
                    XSS_Module(obj)
                elif option[0] == 'SQL':
                    SQL_Module(obj)
        else:
            flash('Invalid URL!')
    return render_template('scanner.html', title='Scanner', form=form)
def crawl():
    global a, b, t
    a, b, t = crawler.main()
    return jsonify(rows=b, time=t)
def test_crawl_doubled_id_page(self):
    url = '/mascara-reestruturadora-monoi-e-argan-nick-vick-mascara-para-cabelos-quimicamente-tratados/p'
    crawler.main(['-d', '2', '-o', 'teste.csv', url])
    self.assertLessEqual(1800, len(self.load_result_csv()))
def test_crawl_malformed_url(self):
    url = '/cabelos/coloracao/tintura-para-cabelos/Sem Amônia'
    crawler.main(['-d', '0', '-o', 'teste.csv', url])
    self.assertEqual(0, len(self.load_result_csv()))
temp_pd = pandas.read_csv("pid.csv")
n = temp_pd.shape[0]
while True:
    time_start = time.time()
    # Build the proxy pool
    ippool = buildip.buildippool()
    # ippool = [{}]  # for testing only
    print('************************* Start crawling Weibo for %s places *********************' % str(n))
    # Create the processes
    for i in range(n):
        crawler.main(i, ippool, yag, emailname)
    time_end = time.time()
    print(' time cost ', time_end - time_start, 's')
    print('*********************** Rest three hours before crawling again ********************')
    conn = sqlite3.connect('weibo.sqlite')
    weibo_pd = pandas.read_sql_query("SELECT * FROM weibo", conn)
    wb_detail = weibo_pd['place'].value_counts().to_dict()
    wb_m = weibo_pd.shape[0]
    pic_pd = pandas.read_sql_query("SELECT * FROM pic", conn)
    pic_m = pic_pd.shape[0]
    conn.close()
    yag.send(
        to=[emailname],
        subject='All Done',
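# The loop above assumes 'yag' and 'emailname' are defined earlier. A minimal
# sketch of that setup, assuming the yagmail package sends the notification
# mail (the address is a placeholder):
import yagmail

emailname = 'you@example.com'  # hypothetical notification address
yag = yagmail.SMTP(emailname)  # password resolved via keyring or config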
# coding:utf-8
# version: python 3.7
# author: Ivy
import crawler

############## User settings #########################
cookie = ''  # your own cookie
mid = ''     # the weibo id to crawl
type = ''    # 'repost' or 'comment'
######################################################

result = crawler.main(mid, type, cookie)
if result == 0:
    print('Wrong value for type; fix it and run again!')
else:
    print('Crawl finished')
def setup_window():
    global window
    # Main window
    window.title('spidy Web Crawler - by rivermont')
    window.iconbitmap('{0}\\media\\favicon.ico'.format(CRAWLER_DIR))

    overwrite = BooleanVar()
    raise_errors = BooleanVar()
    save_pages = BooleanVar()
    zip_files_ = BooleanVar()
    save_words = BooleanVar()
    # todo_file = StringVar()
    # done_file = StringVar()
    # bad_file = StringVar()
    # word_file = StringVar()
    save_count = IntVar()
    max_new_errors = IntVar()
    max_http_errors = IntVar()
    max_known_errors = IntVar()
    max_new_mimes = IntVar()
    # custom_headers = StringVar()

    # Frame to fill main window
    main_frame = ttk.Frame(window, padding='4')
    main_frame.grid(column=0, row=0, sticky=(N, W, E, S))
    main_frame.columnconfigure(0, weight=1)
    main_frame.rowconfigure(0, weight=1)

    # Container to hold variable settings
    setting_box = ttk.Frame(main_frame, padding='4', borderwidth=1, relief='solid')
    setting_box.grid(column=0, row=0, sticky=(N, S, W))
    setting_box.columnconfigure(0, weight=1)
    setting_box.rowconfigure(0, weight=1)

    # Container for things on the right side of the main window
    right_bar = ttk.Frame(main_frame, padding='4', borderwidth=1, relief='solid')
    right_bar.grid(column=1, row=0, sticky=(N, S, E))
    right_bar.columnconfigure(2, weight=1)
    right_bar.rowconfigure(0, weight=1)

    # Container for controlling the crawler
    control_box = ttk.Frame(right_bar, padding='4', borderwidth=1, relief='solid')
    control_box.grid(column=1, row=0, sticky=(N, E, W))
    control_box.columnconfigure(1, weight=1)
    control_box.rowconfigure(0, weight=1)

    # Container for the status elements
    status_box = ttk.Frame(right_bar, padding='4', borderwidth=1, relief='solid')
    status_box.grid(column=0, row=1, sticky=(E, W))
    status_box.columnconfigure(0, weight=1)
    status_box.rowconfigure(1, weight=1)

    # Container for the console log
    console_box = ttk.Frame(right_bar, padding='4', borderwidth=1, relief='solid')
    console_box.grid(column=0, row=2)
    console_box.columnconfigure(0, weight=1)
    console_box.rowconfigure(2, weight=1)

    # Button to pause the crawler
    pause_button = ttk.Button(control_box, padding='4', text='Pause')
    pause_button.grid(column=0, row=0, sticky=(N, S, W))
    pause_button.columnconfigure(0, weight=1)
    pause_button.rowconfigure(0, weight=1)

    # Button to start the crawler.
    # Pass the function itself, not its result; 'command=main()' would start
    # the crawler immediately while the window is being built.
    go_button = ttk.Button(control_box, command=main, padding='4', text='Go')
    go_button.grid(column=1, row=0, sticky=(N, S))
    go_button.columnconfigure(1, weight=1)
    go_button.rowconfigure(0, weight=1)

    # Button to stop the crawler
    stop_button = ttk.Button(control_box, padding='4', text='Stop')
    stop_button.grid(column=2, row=0, sticky=(N, S, E))
    stop_button.columnconfigure(2, weight=1)
    stop_button.rowconfigure(0, weight=1)

    # Title for crawler setting area
    ttk.Label(setting_box, text='Crawler Settings').grid(column=0, row=0, columnspan=4, sticky=(N, S))

    # Option to set Overwrite
    overwrite_check = ttk.Checkbutton(setting_box, text='Overwrite', variable=overwrite)
    overwrite_check.grid(column=0, row=1, columnspan=2, sticky=W)
    overwrite_check.columnconfigure(0, weight=1)
    overwrite_check.rowconfigure(1, weight=1)

    # Option to set RaiseErrors
    raise_errors_check = ttk.Checkbutton(setting_box, text='Raise Errors', variable=raise_errors)
    raise_errors_check.grid(column=0, row=2, columnspan=2, sticky=W)
    raise_errors_check.columnconfigure(0, weight=1)
    raise_errors_check.rowconfigure(2, weight=1)

    # Option to set SavePages
    save_pages_check = ttk.Checkbutton(setting_box, text='Save Pages', variable=save_pages)
    save_pages_check.grid(column=0, row=3, columnspan=2, sticky=W)
    save_pages_check.columnconfigure(0, weight=1)
    save_pages_check.rowconfigure(3, weight=1)

    # Option to set ZipFiles
    zip_files_check = ttk.Checkbutton(setting_box, text='Zip Files', variable=zip_files_)
    zip_files_check.grid(column=0, row=4, columnspan=2, sticky=W)
    zip_files_check.columnconfigure(0, weight=1)
    zip_files_check.rowconfigure(4, weight=1)

    # Option to set SaveWords
    save_words_check = ttk.Checkbutton(setting_box, text='Save Words', variable=save_words)
    save_words_check.grid(column=0, row=5, columnspan=2, sticky=W)
    save_words_check.columnconfigure(0, weight=1)
    save_words_check.rowconfigure(5, weight=1)

    # Field to enter number for SaveCount
    ttk.Label(setting_box, text='Save Count').grid(column=0, row=6, columnspan=2, sticky=W)
    save_count_entry = ttk.Entry(setting_box, width=5, textvariable=save_count)
    save_count_entry.grid(column=0, row=7, sticky=(E, W))
    save_count_entry.columnconfigure(0, weight=1)
    save_count_entry.rowconfigure(7, weight=1)

    # Field to enter custom headers
    ttk.Label(setting_box, text='Custom Headers').grid(column=0, row=8, columnspan=2, sticky=W)
    custom_headers_entry = Text(setting_box, height=3, width=16)
    custom_headers_entry.grid(column=0, row=9, columnspan=2, sticky=W)
    custom_headers_entry.columnconfigure(0, weight=1)
    custom_headers_entry.rowconfigure(9, weight=1)

    # Field to enter custom starting links
    ttk.Label(setting_box, text='Start Links').grid(column=0, row=10, columnspan=2, sticky=W)
    custom_start_links = Text(setting_box, height=2, width=16)
    custom_start_links.grid(column=0, row=11, columnspan=2, sticky=W)
    custom_start_links.columnconfigure(0, weight=1)
    custom_start_links.rowconfigure(11, weight=1)

    # Button to select todo file
    get_todo_file_button = ttk.Button(setting_box, text='...', command=get_file)
    get_todo_file_button.grid(column=2, row=1, sticky=W)
    get_todo_file_button.columnconfigure(1, weight=1)
    get_todo_file_button.rowconfigure(2, weight=1)
    ttk.Label(setting_box, text='TODO File').grid(column=3, row=1, sticky=W)

    # Button to select done file
    get_done_file_button = ttk.Button(setting_box, text='...', command=get_file)
    get_done_file_button.grid(column=2, row=2, sticky=W)
    get_done_file_button.columnconfigure(2, weight=1)
    get_done_file_button.rowconfigure(2, weight=1)
    ttk.Label(setting_box, text='Done File').grid(column=3, row=2, sticky=W)

    # Button to select bad link file
    get_bad_file_button = ttk.Button(setting_box, text='...', command=get_file)
    get_bad_file_button.grid(column=2, row=3, sticky=W)
    get_bad_file_button.columnconfigure(2, weight=1)
    get_bad_file_button.rowconfigure(3, weight=1)
    ttk.Label(setting_box, text='Bad Link File').grid(column=3, row=3, sticky=W)

    # Button to select word file
    get_word_file_button = ttk.Button(setting_box, text='...', command=get_file)
    get_word_file_button.grid(column=2, row=4, sticky=W)
    get_word_file_button.columnconfigure(2, weight=1)
    get_word_file_button.rowconfigure(4, weight=1)
    ttk.Label(setting_box, text='Word File').grid(column=3, row=4, sticky=W)

    # Field to set MaxNewErrors
    max_new_error_entry = ttk.Entry(setting_box, width=4, textvariable=max_new_errors)
    max_new_error_entry.grid(column=2, row=5, sticky=(E, W))
    max_new_error_entry.columnconfigure(2, weight=1)
    max_new_error_entry.rowconfigure(5, weight=1)
    ttk.Label(setting_box, text='Max New Errors').grid(column=3, row=5, sticky=W)

    # Field to set MaxHTTPErrors
    max_http_error_entry = ttk.Entry(setting_box, width=4, textvariable=max_http_errors)
    max_http_error_entry.grid(column=2, row=6, sticky=(E, W))
    max_http_error_entry.columnconfigure(2, weight=1)
    max_http_error_entry.rowconfigure(6, weight=1)
    ttk.Label(setting_box, text='Max HTTP Errors').grid(column=3, row=6, sticky=W)

    # Field to set MaxKnownErrors
    max_known_errors_entry = ttk.Entry(setting_box, width=4, textvariable=max_known_errors)
    max_known_errors_entry.grid(column=2, row=7, sticky=(E, W))
    max_known_errors_entry.columnconfigure(2, weight=1)
    max_known_errors_entry.rowconfigure(7, weight=1)
    ttk.Label(setting_box, text='Max Known Errors').grid(column=3, row=7, sticky=W)

    # Field to set MaxNewMIMEs
    max_new_mimes_entry = ttk.Entry(setting_box, width=4, textvariable=max_new_mimes)
    max_new_mimes_entry.grid(column=2, row=8, sticky=(E, W))
    max_new_mimes_entry.columnconfigure(2, weight=1)
    max_new_mimes_entry.rowconfigure(8, weight=1)
    ttk.Label(setting_box, text='Max New MIMEs').grid(column=3, row=8, sticky=W)
def GET():
    main(e.get(), e2.get())
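# GET() reads two Tkinter entry widgets, 'e' and 'e2', that are created
# elsewhere. A minimal sketch of the assumed surrounding setup (widget names
# and labels are hypothetical):
from tkinter import Tk, Entry, Button

root = Tk()
e = Entry(root)    # e.g. the start URL
e2 = Entry(root)   # e.g. the crawl depth
e.pack()
e2.pack()
Button(root, text='Go', command=GET).pack()  # pass GET itself, not GET()
root.mainloop()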
def test_crawl_mock_pages_with_persistence(self):
    mock_params = {
        '/pagina_inicial': ('Pagina Inicial', 'Produto Inicial',
                            'produto_1/p', 'pagina_2', 'produto_3/p'),
        '/produto_1/p': ('Pagina Produto 1', 'Produto 1',
                         'produto_3/p', 'pagina_5', 'pagina_6'),
        '/produto_2/p': ('Pagina Produto 2', 'Produto 2',
                         'produto_4/p', 'produto_5/p', 'pagina_1'),
        '/produto_3/p': ('Pagina Produto 3', 'Produto 3',
                         'produto_2/p', 'produto_4/p', 'produto_6/p'),
        '/produto_4/p': ('Pagina Produto 4', 'Produto 4',
                         'pagina_inicial', 'pagina_6', 'produto_5/p'),
        '/produto_5/p': ('Pagina Produto 5', 'Produto 5',
                         'produto_7/p', 'produto_8/p', 'pagina_5'),
        '/produto_6/p': ('Pagina Produto 6', 'Produto 6',
                         'pagina_inicial', 'pagina_1', 'produto_5/p'),
        '/produto_7/p': ('Pagina Produto 6', 'Produto 6',
                         'pagina_inicial', 'pagina_1', 'produto_5/p'),
        '/pagina_1': ('Pagina 1', 'Página 1', 'produto_1/p', 'pagina_5', 'produto_3/p'),
        '/pagina_2': ('Pagina 2', 'Página 2', 'produto_2/p', 'pagina_1', 'pagina_3'),
        '/pagina_3': ('Pagina 3', 'Página 3', 'produto_6/p', 'pagina_1', 'pagina_inicial'),
        '/pagina_4': ('Pagina 4', 'Página 4', 'produto_8/p', 'pagina_5', 'produto_3/p'),
        '/pagina_5': ('Pagina 5', 'Página 5', 'produto_2/p', 'pagina_4', 'produto_4/p'),
        '/pagina_6': ('Pagina 6', 'Página 6', 'pagina_inicial', 'pagina_5', 'produto_3/p'),
    }
    # with patch('crawler.get_page_contents', MockPageGenerator(mock_params)):
    #     crawler.main(['-d', '0', '-o', 'teste.csv', '-r', 'teste.json', '/pagina_inicial'])
    #     expected = []
    #     self.assertEqual(expected, self.load_result_csv())
    # with patch('crawler.get_page_contents', MockPageGenerator(mock_params)):
    #     crawler.main(['-d', '0', '-o', 'teste.csv', '-r', 'teste.json', '/pagina_inicial'])
    #     expected = [['Produto 1', 'Pagina Produto 1', 'http://www.epocacosmeticos.com.br/produto_1/p'],
    #                 ['Produto 3', 'Pagina Produto 3', 'http://www.epocacosmeticos.com.br/produto_3/p']]
    #     self.assertEqual(expected, self.load_result_csv())
    with patch('crawler.get_page_contents', MockPageGenerator(mock_params)):
        crawler.main(['-d', '2', '-o', 'teste.csv', '-r', 'teste.json', '/pagina_inicial'])
    expected = [
        ['Produto 1', 'Pagina Produto 1', 'http://www.epocacosmeticos.com.br/produto_1/p'],
        ['Produto 3', 'Pagina Produto 3', 'http://www.epocacosmeticos.com.br/produto_3/p'],
        ['Produto 2', 'Pagina Produto 2', 'http://www.epocacosmeticos.com.br/produto_2/p'],
        ['Produto 4', 'Pagina Produto 4', 'http://www.epocacosmeticos.com.br/produto_4/p'],
        ['Produto 6', 'Pagina Produto 6', 'http://www.epocacosmeticos.com.br/produto_6/p'],
    ]
    self.assertEqual(expected, self.load_result_csv())
def main(title: str, skip_crawling: bool):
    title = str(title)
    if not skip_crawling:
        crawler.main(title)
    print("Start to create video for {}".format(title))
    fps = config['animation_fps']
    width = config['width']
    height = config['height']
    test = config['test']

    # Paths
    output_dir = os.sep.join([".", "output"])
    if not os.path.exists(output_dir):
        print("Folder", output_dir, 'does not exist. Creating...')
        os.makedirs(output_dir)
    resource_dir = os.sep.join([".", "resource", title])

    # Assets
    result = text_processing.load_data(title)
    title_font = ImageFont.truetype(config['title_font'], config['title_font_size'], encoding="utf-8")
    content_font = ImageFont.truetype(config['content_font'], config['content_font_size'], encoding="utf-8")
    title_wrapper = text_processing.Wrapper(title_font)
    content_wrapper = text_processing.Wrapper(content_font)
    audio_clip = AudioFileClip(os.sep.join([".", "resource", title, "audio", title + ".mp3"]))

    keys = list(map(int, result.keys()))
    if 0 not in keys:
        keys.append(0)
    keys.append(math.ceil(audio_clip.duration))
    keys.sort()
    # print(keys)

    video_clips = []
    key_length = 10 if test else len(keys) - 1
    files = os.listdir(os.sep.join(['.', 'resource', title]))
    print(files)
    for i in range(0, key_length):
        key = str(keys[i])
        start = keys[i]
        end = keys[i + 1]
        # image_dir = os.sep.join(['.', 'resource', key + result[key]['image_suffix']])
        if (key not in result.keys()) or (key + result[key]['image_suffix'] not in files):
            print("Case1")
            if key == '0':
                print("Creating title...")
                frame = image_processing.generate_title_image(
                    os.sep.join(['.', 'resource', title, 'title.jpg']), (width, height))
            else:
                frame = image_processing.generate_blank_frame(
                    "", "", (width, height),
                    title_wrapper, content_wrapper, title_font, content_font)
            videoclip = video_processing.create_video_with_frame(frame, start, end)
            video_clips.append(videoclip)
        else:
            if result[key]['image_suffix'].lower() not in [".gif"]:
                print("Case2")
                image = os.sep.join(['.', 'resource', title, str(key) + result[key]['image_suffix']])
                header = result[key]['header']
                content = result[key]['content']
                frame = image_processing.generate_frame(
                    image, header, content, (width, height),
                    title_wrapper, content_wrapper, title_font, content_font)
                videoclip = video_processing.create_video_with_frame(frame, start, end)
                video_clips.append(videoclip)
                # os.remove(image)
            elif result[key]['image_suffix'].lower() in [".gif"]:
                print("Case3")
                image = os.sep.join(['.', 'resource', title, str(key) + result[key]['image_suffix']])
                print(image)
                header = result[key]['header']
                content = result[key]['content']
                if config['skip_gif']:
                    background_frame = image_processing.generate_blank_frame(
                        header, content, (width, height),
                        title_wrapper, content_wrapper, title_font, content_font)
                    videoclip = video_processing.create_video_with_frame(background_frame, start, end)
                else:
                    gif_clip = video_processing.load_gif_clip(image)
                    background_frame = image_processing.generate_blank_frame(
                        header, content, (width, height),
                        title_wrapper, content_wrapper, title_font, content_font)
                    videoclip = video_processing.create_video_with_gif_clip(
                        background_frame, gif_clip, start, end)
                video_clips.append(videoclip)

    merged_clips = concatenate_videoclips(video_clips)
    merged_clips.audio = audio_clip
    logo_clip = video_processing.load_logo(
        os.sep.join([".", "util", config['logo_name']]), duration=merged_clips.duration)
    if config['enable_logo']:
        final_clip = video_processing.add_logo(merged_clips, logo_clip)
    else:
        final_clip = merged_clips
    if test:
        final_clip = video_processing.add_logo(merged_clips, logo_clip).subclip(
            0, min(50, final_clip.duration))
    final_clip.write_videofile(
        os.sep.join([".", "output", title + "_animated.mp4"]), fps=fps, threads=4)
    print(title, "finished!")
import crawler

if __name__ == '__main__':
    crawler.main()
def job():
    print('Starting scheduled run')
    crawler.main()
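# job() is presumably registered with a scheduler. A minimal sketch, assuming
# the third-party 'schedule' package (the interval is illustrative):
import time

import schedule

schedule.every(6).hours.do(job)  # re-run the crawler every six hours
while True:
    schedule.run_pending()
    time.sleep(60)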
# nohup python {{path}} &
from flask import Flask, Response, jsonify, render_template

import crawler

app = Flask(__name__)
a, b, t = crawler.main()


@app.route("/")
def index():
    return render_template('index.html', title="hkepc")


@app.route("/crawl")
def crawl():
    global a, b, t
    a, b, t = crawler.main()
    return jsonify(rows=b, time=t)


@app.route("/api/get")
def getData():
    return jsonify(rows=b, time=t)


@app.route("/json")
def jsontest():
    list = [
        {'param': 'foo', 'val': 2},
        {'param': 'bar', 'val': 10},
    ]
def main():
    import crawler
    crawler.common.DOWNLOAD_THREADS = 10
    crawler.main()
# -*- coding: utf-8 -*-
import notification.notification as notification
import score.score as score
import crawler as crawler
from time import sleep
import re

option = notification.main("initial", "")
while True:
    print("option: " + option)
    contents = crawler.main()
    print(contents)
    for article in contents:
        signal = score.score_with_word(article[0], article[1], option)
        if signal:
            notification.main("notify", article[0])
    sleep(24 * 60 * 60)
def test_crawl_eternity_product_link_not_found(self):
    url = '/eternity-25th-anniversary-edition-for-women-eau-de-toilette-calvin-klein-perfume-feminino/p'
    crawler.main(['-d', '0', '-o', 'teste.csv', url])
    expected = []
    self.assertEqual(expected, self.load_result_csv())
def main(title: str, skip_crawling: bool):
    title = str(title)
    if not skip_crawling:
        crawler.main(title)
    print("Start to create video for {}".format(title))
    fps = config['fps']
    width = config['width']
    height = config['height']

    # Paths
    output_dir = os.sep.join([".", "output"])
    if not os.path.exists(output_dir):
        print("Folder", output_dir, 'does not exist. Creating...')
        os.makedirs(output_dir)
    resource_dir = os.sep.join([".", "resource", title])

    # Assets
    result = text_processing.load_data(title)
    title_font = ImageFont.truetype(config['title_font'], config['title_font_size'], encoding="utf-8")
    content_font = ImageFont.truetype(config['content_font'], config['content_font_size'], encoding="utf-8")
    title_wrapper = text_processing.Wrapper(title_font)
    content_wrapper = text_processing.Wrapper(content_font)

    # Video properties
    fourcc = VideoWriter_fourcc(*'mp4v')
    video = VideoWriter(os.sep.join([output_dir, title + '_simple.mp4']),
                        fourcc, float(fps), (width, height))

    # Create video
    keys = list(map(int, result.keys()))
    if 0 not in keys:
        keys.append(0)
        frame = image_processing.create_blank_frame(
            "", "", (width, height), title_wrapper, content_wrapper, title_font, content_font)
    else:
        key = "0"
        image = os.sep.join([resource_dir, str(key) + result[key]['image_suffix']])
        header = result[key]['header']
        content = result[key]['content']
        print("Title: {}".format(header))
        if result[key]['image_suffix'] in ['.gif', '.GIF']:
            frame = image_processing.create_blank_frame(
                header, content, (width, height),
                title_wrapper, content_wrapper, title_font, content_font)
        else:
            frame = image_processing.create_frame(
                image, header, content, (width, height),
                title_wrapper, content_wrapper, title_font, content_font)
        # os.remove(image)
    keys.sort()
    # Set last picture to be 20 seconds long
    keys.append(keys[len(keys) - 1] + 20)
    # print(keys)

    # Number of frames in this video
    total_length = keys[len(keys) - 1] * fps
    index = 0
    for i in range(total_length):
        if index + 1 > len(keys) - 1:
            frame = image_processing.create_blank_frame(
                "", "", (width, height), title_wrapper, content_wrapper, title_font, content_font)
        elif (i / fps) > keys[index + 1]:
            index += 1
            print("Processing {} frames out of {}".format(index, len(keys) - 1))
            key = str(keys[index])
            image = os.sep.join([resource_dir, str(key) + result[key]['image_suffix']])
            header = result[key]['header']
            content = result[key]['content']
            print("Title: {}".format(header))
            if result[key]['image_suffix'] in ['.gif', '.GIF']:
                frame = image_processing.create_blank_frame(
                    header, content, (width, height),
                    title_wrapper, content_wrapper, title_font, content_font)
            else:
                frame = image_processing.create_frame(
                    image, header, content, (width, height),
                    title_wrapper, content_wrapper, title_font, content_font)
            # os.remove(image)
        else:
            pass
        video.write(frame)
    print("{} finished!".format(title))
def test_crawl_invalid_product(self):
    url = '/invalid-product/p'
    crawler.main(['-d', '0', '-o', 'teste.csv', url])
    expected = []
    self.assertEqual(expected, self.load_result_csv())
def test_crawl_home_page_depth_0(self):
    crawler.main(['-d', '0', '-o', 'teste.csv', '/'])
    expected = []
    self.assertEqual(expected, self.load_result_csv())
def main(title: str, skip_crawling: bool):
    title = str(title)
    if not skip_crawling:
        crawler.main(title)
    print("Start to create video for {}".format(title))
    fps = config['fps']
    width = config['width']
    height = config['height']

    # Paths
    output_dir = os.sep.join([".", "output"])
    if not os.path.exists(output_dir):
        print("Folder", output_dir, 'does not exist. Creating...')
        os.makedirs(output_dir)
    resource_dir = os.sep.join([".", "resource", title])

    # Assets
    result = text_processing.load_data(title)
    title_font = ImageFont.truetype(config['title_font'], config['title_font_size'], encoding="utf-8")
    content_font = ImageFont.truetype(config['content_font'], config['content_font_size'], encoding="utf-8")
    title_wrapper = text_processing.Wrapper(title_font)
    content_wrapper = text_processing.Wrapper(content_font)
    audio_clip = AudioFileClip(os.sep.join([resource_dir, "audio", title + ".mp3"]))

    # Video properties
    fourcc = VideoWriter_fourcc(*'mp4v')
    video = VideoWriter(os.sep.join([output_dir, title + '_complex_temp.mp4']),
                        fourcc, float(fps), (width, height))

    # Create video
    keys = list(map(int, result.keys()))
    if 0 not in keys:
        keys.append(0)
        frame = image_processing.generate_cv2_title_image(
            os.sep.join(['.', 'resource', title, 'title.jpg']), (width, height))
    else:
        key = "0"
        image = os.sep.join([resource_dir, str(key) + result[key]['image_suffix']])
        header = result[key]['header']
        content = result[key]['content']
        print("Title: {}".format(header))
        if result[key]['image_suffix'] in ['.gif', '.GIF']:
            frame = image_processing.generate_cv2_blank_frame(
                header, content, (width, height),
                title_wrapper, content_wrapper, title_font, content_font)
        else:
            frame = image_processing.generate_cv2_frame(
                image, header, content, (width, height),
                title_wrapper, content_wrapper, title_font, content_font)
        # os.remove(image)
    keys.sort()
    # Extend the last picture until the audio ends
    keys.append(math.ceil(audio_clip.duration))
    # print(keys)

    # Number of frames in this video
    total_length = (200 if config['test'] else keys[len(keys) - 1]) * fps
    index = 0
    for i in range(total_length):
        if index + 1 > len(keys) - 1:  # guard before reading keys[index + 1]
            frame = image_processing.generate_cv2_blank_frame(
                "", "", (width, height), title_wrapper, content_wrapper, title_font, content_font)
        elif (i / fps) >= keys[index + 1]:
            index += 1
            print("Processing {} frames out of {}".format(index, len(keys) - 1))
            key = str(keys[index])
            image = os.sep.join([resource_dir, str(key) + result[key]['image_suffix']])
            header = result[key]['header']
            content = result[key]['content']
            print("Title: {}".format(header))
            if result[key]['image_suffix'] in ['.gif', '.GIF']:
                frame = image_processing.generate_cv2_blank_frame(
                    header, content, (width, height),
                    title_wrapper, content_wrapper, title_font, content_font)
            else:
                frame = image_processing.generate_cv2_frame(
                    image, header, content, (width, height),
                    title_wrapper, content_wrapper, title_font, content_font)
            # os.remove(image)
        else:
            pass
        video.write(frame)
    video.release()

    video_clip = VideoFileClip(os.sep.join([output_dir, title + "_complex_temp.mp4"]))
    print(video_clip.duration)
    video_clip.audio = audio_clip
    if config['enable_logo']:
        logo_clip = video_processing.load_logo(
            os.sep.join([".", "util", config['logo_name']]), duration=video_clip.duration)
        video_clip = video_processing.add_logo(video_clip, logo_clip)
    if config['test']:
        video_clip = video_clip.subclip(0, min(200, video_clip.duration))
    video_clip.write_videofile(os.sep.join([output_dir, title + "_complex.mp4"]), fps=fps)
    print("{} finished!".format(title))
    os.remove(os.sep.join([output_dir, title + "_complex_temp.mp4"]))
def test_crawl_home_page_depth_1(self):
    crawler.main(['-d', '1', '-o', 'teste.csv', '/'])
    self.assertLessEqual(70, len(self.load_result_csv()))