def result():
    query = request.args.get('query', '')
    result = parse_html(query)
    if result.ok:
        return render_template('result.html', query=query, result=result)
    else:
        return render_template('error.html', query=query,
                               error_msg=result.error_msg)
def main(argv):
    """Manipulates and outputs data related to NetrunnerDB Hall of Fame."""
    valid_deck_args = "<likes|faves|comments|name|user|rep|date>"
    valid_user_args = "<decks|rep>"
    help_msg = "nrdb_scraper.py -d " + valid_deck_args + " -u " + valid_user_args
    invalid_arg = "Invalid argument:"
    valid_args_are = "Valid args are:"
    try:
        opts, args = getopt.getopt(argv, "hd:u:", ["decks=", "users="])
    except getopt.GetoptError:
        print(help_msg)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(help_msg)
            sys.exit()
        else:
            decks = parse_html()
            if opt in ("-d", "--decks"):
                try:
                    decks = getattr(sort, "sort_by_" + arg)(decks)
                    output_decks(decks)
                except AttributeError:
                    print(invalid_arg, '"' + opt, arg + '"')
                    print(valid_args_are, valid_deck_args)
                    sys.exit()
            if opt in ("-u", "--users"):
                try:
                    users = getattr(sort, "sort_by_most_" + arg)(decks)
                    output_users(users)
                except AttributeError:
                    print(invalid_arg, '"' + opt, arg + '"')
                    print(valid_args_are, valid_user_args)
                    sys.exit()
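# A hypothetical sketch of the `sort` helpers that the getattr() dispatch in
# main() above expects. The function names follow the "-d <likes|faves|...>"
# arguments; the deck structure is an assumption, not the project's actual
# data model.
def sort_by_likes(decks):
    # Most-liked decks first.
    return sorted(decks, key=lambda deck: deck.get('likes', 0), reverse=True)

def sort_by_date(decks):
    # Newest decks first.
    return sorted(decks, key=lambda deck: deck.get('date', ''), reverse=True)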
def main():
    try:
        while True:
            user = rh.get_from_prepare()
            if not user:
                print 'completed'
                return
            customer_id, name, id_card = user
            print customer_id
            if rh.check_completed(customer_id):
                continue
            html_content = bjh_spider(name, id_card)
            if html_content:
                data = parse_html(html_content, customer_id)
                if data:
                    data['id_card'] = id_card
                    data['customer_id'] = customer_id
                    mh.save_data(data)
                    # Only save to the completed table once the content has
                    # been scraped successfully.
                    rh.save_to_completed(customer_id)
            else:
                log = 'error happen, customer_id is %s' % customer_id
                logger.warning(log)
            time.sleep(2)
    except:
        logger.warning(traceback.format_exc())
def add():
    form = AddItemForm()
    if form.validate_on_submit():
        item = Item(link=form.link.data)
        # TODO: add error handler.
        # item.parse_html()
        if form.tags.data:
            tags = form.tags.data.split(',')
            for tag in tags:
                existing = Tag.query.filter_by(name=tag).first()
                if existing:
                    item.tags.append(existing)
                else:
                    item.tags.append(Tag(name=tag))
        current_user.items.append(item)
        db.session.add(current_user)
        db.session.commit()
        parse_html(item.id)
        # Flash message: "Added new item: ...".
        flash(u'已添加新条目:「{} 」。'.format(form.link.data), 'success')
        return redirect(url_for('.index'))
    return render_template('add.html', form=form)
def main():
    cwd = os.getcwd()
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--current-dir', default=cwd,
        help='The current working directory where this script is being run.')
    parser.add_argument(
        '--url-limit', default=200, type=int,
        help='Set limit for the amount of URLs to parse. Default=%(default)s')
    parser.add_argument(
        '--true-k', default=5, type=int,
        help="Number of clusters to create from the user's history")
    parser.add_argument(
        "--lsa", action="store_true", default=True,
        help="Preprocess documents with latent semantic analysis.")
    parser.add_argument(
        "--no-minibatch", action="store_false", dest="minibatch", default=True,
        help="Use ordinary k-means algorithm (in batch mode).")
    parser.add_argument(
        "--use-hashing", action="store_true", default=False,
        help="Use a hashing feature vectorizer")
    parser.add_argument(
        "--n-features", type=int, default=10000,
        help="Maximum number of features (dimensions) to extract from text.")
    parser.add_argument(
        "--verbose", action="store_true", default=False,
        help="logging.info progress reports inside k-means algorithm.")
    args = parser.parse_args()

    urls = parse_html.get_urls(args.current_dir, args.url_limit)
    text_docs = parse_html.parse_html(urls)
    logging.info('---------------------------')
    logging.info(text_docs.keys())
    doc_clusters, doc_cluster_terms, train_vectorizer, lsa = cluster_docs(
        text_docs, args, args.true_k)
    logging.info("----------------------------")
    logging.info("COMPLETE")
from typing import Iterator

def tokenize_http_homepage(host: lxml.etree.ElementTree) -> Iterator[str]:
    """Yield text tokens extracted from a host's http-homepage script output."""
    try:
        homepage = host.xpath(".//script[@id='http-homepage']")[0]
    except IndexError:
        return  # no http-homepage result for this host; yield nothing
    header = homepage.xpath(".//table[@key='response_header']")[0]
    yield from parse_response_header(header)
    try:
        body = homepage.xpath(".//elem[@key='response_body']")[0].text
    except IndexError:
        return
    if body is None:
        return
    htmlcode = html_unescape_backslash_hex(body)
    yield from parse_html(htmlcode)
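# A minimal usage sketch for tokenize_http_homepage(), assuming the host
# elements live in an Nmap-style XML report; the file name and document layout
# are assumptions, not part of the original code.
import lxml.etree

tree = lxml.etree.parse("scan_results.xml")
for host in tree.xpath("//host"):
    for token in tokenize_http_homepage(host):
        print(token)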
def do_POST(self):
    """Handle POST request to use client JSON data to recommend links"""
    self._set_headers()
    logging.info("INFO: POST request received...")
    self.data_string = self.rfile.read(int(self.headers['Content-Length']))
    # NOTE: data is of type list
    data = simplejson.loads(self.data_string)
    data_documents = parse_html.parse_html(data)
    recommended_links = cluster_documents.compare_items_to_cluster(
        doc_clusters, data_documents, args, vectorizer, lsa)
    json_response = simplejson.dumps(recommended_links)
    with open(RECOMMENDED_LINKS, "w") as outfile:
        simplejson.dump(json_response, outfile)
    logging.info("INFO: Links to recommend: %s" % json_response)
    self.send_response(200)
    self.end_headers()
def reiteration(url):
    # Added so the program keeps asking about transformations until the user
    # chooses to exit, instead of exiting after a single transformation.
    transform_type = input(
        "\nEnter Corresponding Number for Transformation: \n"
        " 0 for Original Recipe \n"
        " 1 for transformation to healthy \n"
        " 2 for transformation unhealthy \n"
        " 3 for transformation to vegetarian \n"
        " 4 for transformation to non-vegetarian \n"
        " 5 for transformation to Style of Cuisine: Sicilian Cuisine \n"
        " 6 for transformation to Style of Cuisine: Korean Cuisine \n"
        " 7 to enter a url for a different recipe\n"
        " 8 to Exit \n")
    # 6 for transformation to Style of Cuisine: Indonesian Cuisine
    # display_type = input("Enter 0 to view the full recipe, 1 to view the ingredients list, 2 to view all required tools, 3 to view all methods, 4 to view all steps.\n")

    recipe_dict = parse_html(url)
    name = recipe_dict['name']
    ingredients = recipe_dict['ingredients']

    # Parse the ingredients list.
    parsed_ingredients = parse_ingredient_list(ingredients)

    # Split directions by sentences, NOT by the actual steps described on allrecipes.
    directions = []
    temp_directions = recipe_dict['directions']
    for direc in temp_directions:
        split_direc = direc.split('. ')
        directions += split_direc

    # Get a list of all ingredient names.
    all_ingredients = [item['name'] for item in parsed_ingredients]

    # Parse the directions.
    parsed_directions = []
    prevtools = []
    for step in directions:
        parsed_step = {}
        parsed_step['original'] = step
        parsed_step['times'] = get_steptimes(step)
        parsed_step['method'] = get_cooking_method(step)
        parsed_step['ingredients'] = get_ingredients_in_step(step, all_ingredients)
        parsed_step['tools'], prevtools = get_tools(step, prevtools)
        parsed_directions.append(parsed_step)
    # print('parsed directions: ', parsed_directions)
    # print('parsed ingredients: ', parsed_ingredients)

    # Original recipe.
    if transform_type == "0":
        readable = readable_recipe(name, parsed_ingredients, parsed_directions)
        for item in readable:
            print(item)
        reiteration(url)
    # Transform to healthy.
    elif transform_type == "1":
        recipe_name, p_ingredients, p_directions = transform_healthy(
            name, parsed_ingredients, parsed_directions)
        readable = readable_recipe(recipe_name, p_ingredients, p_directions)
        for item in readable:
            print(item)
        reiteration(url)
    # Transform to unhealthy.
    elif transform_type == "2":
        recipe_names, p_ingredient, p_direction = transform_unhealthy(
            name, parsed_ingredients, parsed_directions)
        readable = readable_recipe(recipe_names, p_ingredient, p_direction)
        for item in readable:
            print(item)
        reiteration(url)
    # Transform to vegetarian.
    elif transform_type == "3":
        recipe_names, p_ingredient, p_direction = transform_vegetarian(
            name, parsed_ingredients, parsed_directions)
        readable = readable_recipe(recipe_names, p_ingredient, p_direction)
        for item in readable:
            print(item)
        reiteration(url)
    # Transform to non-vegetarian.
    elif transform_type == "4":
        recipe_names, p_ingredient, p_direction = transform_nonvegetarian(
            name, parsed_ingredients, parsed_directions)
        readable = readable_recipe(recipe_names, p_ingredient, p_direction)
        for item in readable:
            print(item)
        reiteration(url)
    # Transform to Sicilian Cuisine.
    elif transform_type == "5":
        recipe_names, p_ingredient, p_direction = transform_siciliancuisine(
            name, parsed_ingredients, parsed_directions)
        readable = readable_recipe(recipe_names, p_ingredient, p_direction)
        print('\nSicilian Style Cuisine:')
        for item in readable:
            print(item)
        reiteration(url)
    # Transform to Korean Cuisine.
    elif transform_type == "6":
        recipe_names, p_ingredient, p_direction = transform_korean(
            name, parsed_ingredients, parsed_directions)
        readable = readable_recipe(recipe_names, p_ingredient, p_direction)
        print('\nKorean Style Cuisine:')
        for item in readable:
            print(item)
        reiteration(url)
    # Enter a URL for a different recipe.
    elif transform_type == "7":
        main()
    # Exit out of the code.
    elif transform_type == '8':
        print("Now exiting...")
parser.add_argument("--use-hashing", action="store_true", default=False,
                    help="Use a hashing feature vectorizer")
parser.add_argument(
    "--n-features", type=int, default=10000,
    help="Maximum number of features (dimensions) to extract from text.")
parser.add_argument(
    "--verbose", action="store_true", default=False,
    help="logging.info progress reports inside k-means algorithm.")
args = parser.parse_args()
print(args)
print("\n")

# TODO: <AFTER_THOUGHT> add some parameter to switch between different
# functionality of the script.
generate_cert.create_cert()
get_history.copy_chrome_history(args.file_path, args.current_dir)

t_now = time.time()
urls = parse_html.get_urls(args.current_dir, args.url_limit)
text_docs = parse_html.parse_html(urls)
# Timestamp taken once parsing is done, used for both timing reports below.
t_after_parse = time.time()
logging.info("Parsing took: %0.3f" % ((t_after_parse - t_now) / 60.0))

doc_clusters, doc_cluster_terms, vectorizer, lsa = cluster_documents.cluster_docs(
    text_docs, args)
# Elapsed clustering time in minutes, measured from the post-parse timestamp.
t_after_clustering = (time.time() - t_after_parse) / 60.0
logging.info("Clustering took: %0.3f" % t_after_clustering)

logging.info("History collected, parsed and ready for recommendations.")
get_history.open_chrome(args.chrome_path, args.chrome_url)
run()
# These are file locations for files containing actor data, director data,
# name_aka data, titles_aka data and titles data.
act_loc = sys.argv[1]
dir_loc = sys.argv[2]
nam_aka_loc = sys.argv[3]
tit_aka_loc = sys.argv[4]
tit_loc = sys.argv[5]
# final_output is the file you want the final results to be put in.
final_output = sys.argv[6]

# Connect to the database (change the details to connect to a different db).
db = DB.connect(host='', user='', passwd='', db='')
cursor = db.cursor()

import_tables.create_tables(db, cursor)
import_tables.insert_data(db, cursor, act_loc, "actors")
import_tables.insert_data(db, cursor, dir_loc, "directors")
import_tables.insert_data(db, cursor, nam_aka_loc, "names_aka")
import_tables.insert_data(db, cursor, tit_aka_loc, "titles_aka")
import_tables.insert_data(db, cursor, tit_loc, "titles")

parse_html.parse_html(db, cursor)
match_movies.match_movies(db, cursor)
match_movies.create_result(db, cursor, final_output)

db.commit()
db.close()
print "script completed successfully :)"
import parse_html as ph

NUM = 5
urls = gh.get_url(NUM)

pool = ThreadPool(4)
htmls = pool.map(gh.get_info, urls[100:150])
pool.close()
pool.join()

names = []
creditors = []
areas = []
moneys = []
dates = []
for html in htmls:
    # ph.judge() returns the string "可以正常解析数据!" ("the data can be parsed
    # normally") for parseable pages; compare with ==, not `is`.
    if ph.judge(html) == "可以正常解析数据!":
        name, creditor, area, money, date = ph.parse_html(html)
        print(name)
        print(creditor)
        print(area)
        print(money)
        print(date)
        names.append(name)
        creditors.append(creditor)
        areas.append(area)
        moneys.append(money)
        dates.append(date)
def test_parse_sfacg_ok(self):
    ok_url = 'http://comic.sfacg.com/HTML/XFGJ/001j/'
    r = parse_html.parse_html(ok_url)
    self.assertTrue(r.ok)

def test_parse_sfacg_fail(self):
    fail_url = 'http://comic.sfacg.com/HTML/XFGJ/001jsdf/'
    r = parse_html.parse_html(fail_url)
    self.assertFalse(r.ok)

def test_parse_8comic_ok(self):
    ok_url = 'http://new.comicvip.com/show/cool-7340.html?ch=54'
    r = parse_html.parse_html(ok_url)
    self.assertTrue(r.ok)

def test_parse_sfacg_name(self):
    ok_url = 'http://comic.sfacg.com/HTML/XFGJ/001j/'
    r = parse_html.parse_html(ok_url)
    self.assertEqual(unicode, type(r.data['name']))

def test_parse_sfacg_head(self):
    head_url = 'http://comic.sfacg.com/HTML/XFGJ/001j/'
    r = parse_html.parse_html(head_url)
    self.assertIsNone(r.data['prev_url'])
    self.assertIsNotNone(r.data['next_url'])
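# A minimal sketch of the result object these tests (and the Flask view in the
# first snippet) appear to assume. The attribute names come from their usage;
# the class itself is hypothetical.
class ParseResult(object):
    def __init__(self, ok, data=None, error_msg=None):
        self.ok = ok                # True when parsing succeeded
        self.data = data or {}      # e.g. 'name', 'prev_url', 'next_url'
        self.error_msg = error_msg  # set when ok is False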