Example #1
def result():
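    """Parse the page for the submitted query and render either the result template or the error template."""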
    query = request.args.get('query', '')
    result = parse_html(query)
    if result.ok:
        return render_template('result.html', query=query, result=result)
    else:
        return render_template('error.html', query=query, error_msg=result.error_msg)
def main(argv):
    """Manipulates and outputs data related to NetrunnerDB Hall of Fame."""
    valid_deck_args = "<likes|faves|comments|name|user|rep|date>"
    valid_user_args = "<decks|rep>"
    help_msg = "nrdb_scraper.py -d " + valid_deck_args + " -u " + valid_user_args
    invalid_arg = "Invalid argument:"
    valid_args_are = "Valid args are:"

    try:
        opts, args = getopt.getopt(argv, "hd:u:", ["decks=", "users="])
    except getopt.GetoptError:
        print(help_msg)
        sys.exit(2)
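    # -h prints usage; -d/-u scrape the Hall of Fame, sort it by the given key, and print decks or users.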
    for opt, arg in opts:
        if opt == '-h':
            print(help_msg)
            sys.exit()
        else:
            decks = parse_html()
            if opt in ("-d", "--decks"):
                try:
                    decks = getattr(sort, "sort_by_" + arg)(decks)
                    output_decks(decks)
                except AttributeError:
                    print(invalid_arg, '"' + opt, arg + '"')
                    print(valid_args_are, valid_deck_args)
                    sys.exit()
            if opt in ("-u", "--users"):
                try:
                    users = getattr(sort, "sort_by_most_" + arg)(decks)
                    output_users(users)
                except AttributeError:
                    print(invalid_arg, '"' + opt, arg + '"')
                    print(valid_args_are, valid_user_args)
                    sys.exit()
Example #3
def main():
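    # Keep pulling pending users from the "prepare" queue: scrape each one, parse the HTML, and save the parsed data.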
    try:
        while True:
            user = rh.get_from_prepare()
            if not user:
                print('completed')
                return
            customer_id, name, id_card = user
            print(customer_id)
            if rh.check_completed(customer_id):
                continue
            html_content = bjh_spider(name, id_card)
            if html_content:
                data = parse_html(html_content, customer_id)
                if data:
                    data['id_card'] = id_card
                    data['customer_id'] = customer_id
                    mh.save_data(data)
                rh.save_to_completed(customer_id)  # only mark this customer as completed once the content was actually fetched
            else:
                log = 'error happen, customer_id is %s' % customer_id
                logger.warning(log)
            time.sleep(2)
    except Exception:
        logger.warning(traceback.format_exc())
Example #4
def add():
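    """Create an Item from the submitted link, attach any tags, save it, and kick off parsing of the linked page."""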
    form = AddItemForm()
    if form.validate_on_submit():
        item = Item(link=form.link.data)
        # todo add error handler.
        # item.parse_html()
        if form.tags.data:
            tags = form.tags.data.split(',')
            for tag in tags:
                existing_tag = Tag.query.filter_by(name=tag).first()
                if existing_tag:
                    item.tags.append(existing_tag)
                else:
                    item.tags.append(Tag(name=tag))
        current_user.items.append(item)
        db.session.add(current_user)
        db.session.commit()
        parse_html(item.id)
        flash(u'Added new item: "{}".'.format(form.link.data), 'success')
        return redirect(url_for('.index'))
    return render_template('add.html', form=form)
def main():
    cwd = os.getcwd()

    parser = argparse.ArgumentParser()
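    # Command-line options controlling how much browsing history is parsed and how the documents are clustered.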
    parser.add_argument(
        '--current-dir',
        default=cwd,
        help='The current working directory where this script is being run.')
    parser.add_argument(
        '--url-limit',
        default=200,
        type=int,
        help='Set limit for the amount of URLs to parse. Default=%(default)s')
    parser.add_argument(
        '--true-k',
        default=5,
        type=int,
        help='Number of clusters to create from the user\'s history')
    parser.add_argument(
        "--lsa",
        action="store_true",
        default=True,
        help="Preprocess documents with latent semantic analysis.")
    parser.add_argument("--no-minibatch",
                        action="store_false",
                        dest="minibatch",
                        default=True,
                        help="Use ordinary k-means algorithm (in batch mode).")
    parser.add_argument("--use-hashing",
                        action="store_true",
                        default=False,
                        help="Use a hashing feature vectorizer")
    parser.add_argument(
        "--n-features",
        type=int,
        default=10000,
        help="Maximum number of features (dimensions) to extract from text.")
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
        help="Log progress reports from inside the k-means algorithm.")
    args = parser.parse_args()

    urls = parse_html.get_urls(args.current_dir, args.url_limit)
    text_docs = parse_html.parse_html(urls)
    logging.info('---------------------------')
    logging.info(text_docs.keys())
    doc_clusters, doc_cluster_terms, train_vectorizer, lsa = cluster_docs(
        text_docs, args, args.true_k)
    logging.info("----------------------------")
    logging.info("COMPLETE")
Example #6
from typing import Iterator  # needed for the corrected generator return annotation

def tokenize_http_homepage(host: lxml.etree.ElementTree) -> Iterator[str]:
    # Yield tokens from the host's http-homepage output: response header fields first, then the parsed HTML body.
    try:
        homepage = host.xpath(".//script[@id='http-homepage']")[0]
    except IndexError:  # no http-homepage output recorded for this host
        return
    header = homepage.xpath(".//table[@key='response_header']")[0]
    yield from parse_response_header(header)
    try:
        body = homepage.xpath(".//elem[@key='response_body']")[0].text
    except IndexError:  # no response body recorded
        return
    if body is None:
        return
    htmlcode = html_unescape_backslash_hex(body)
    yield from parse_html(htmlcode)
Example #8
 def do_POST(self):
     """Handle POST request to use client JSON data to recommend links"""
     self._set_headers()
     logging.info("INFO: POST request received...")
     self.data_string = self.rfile.read(int(self.headers['Content-Length']))
     # NOTE: data is of type list
     data = simplejson.loads(self.data_string)
     data_documents = parse_html.parse_html(data)
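     # Compare the parsed documents against the existing history clusters (doc_clusters, vectorizer and lsa come from the earlier clustering run) to pick links to recommend.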
     recommended_links = cluster_documents.compare_items_to_cluster(
         doc_clusters, data_documents, args, vectorizer, lsa)
     json_response = simplejson.dumps(recommended_links)
     with open(RECOMMENDED_LINKS, "w") as outfile:
         outfile.write(json_response)  # write the serialized JSON directly; dumping the string again would double-encode it
     logging.info("INFO: Links to recommend: %s" % json_response)
     self.send_response(200)
     self.end_headers()
Example #9
def reiteration(url):
    # Added so the program keeps prompting for another transformation of the same recipe
    # until the user chooses to exit, instead of exiting after a single run.
    transform_type = input(
        "\nEnter Corresponding Number for Transformation: \n"
        " 0 for Original Recipe \n"
        " 1 for transformation to healthy \n"
        " 2 for transformation unhealthy \n"
        " 3 for transformation to vegetarian \n"
        " 4 for transformation to non-vegetarian \n"
        " 5 for transformation to Style of Cuisine: Sicilian Cuisine \n"
        " 6 for transformation to Style of Cuisine: Korean Cuisine \n"
        " 7 to enter a url for a different recipe\n"
        " 8 to Exit \n")
    # 6 for transformation to Style of Cuisine: Indonesian Cuisine
    # display_type = input("Enter 0 to view the full recipe, 1 to view the ingredients list, 2 to view all required tools, 3 to view all methods, 4 to view all steps.\n")

    recipe_dict = parse_html(url)
    name = recipe_dict['name']
    ingredients = recipe_dict['ingredients']

    #parse ingredients list
    parsed_ingredients = parse_ingredient_list(ingredients)

    # Split directions into individual sentences rather than using the step boundaries from allrecipes.
    directions = []
    temp_directions = recipe_dict['directions']
    for direc in temp_directions:
        split_direc = direc.split('. ')
        directions += split_direc

    #get list of all ingredient names
    all_ingredients = [item['name'] for item in parsed_ingredients]

    #parse directions
    parsed_directions = []
    prevtools = []
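    # For each sentence, record its times, cooking method, referenced ingredients, and tools used.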
    for step in directions:
        parsed_step = {}
        parsed_step['original'] = step
        parsed_step['times'] = get_steptimes(step)
        parsed_step['method'] = get_cooking_method(step)
        parsed_step['ingredients'] = get_ingredients_in_step(
            step, all_ingredients)
        parsed_step['tools'], prevtools = get_tools(step, prevtools)

        parsed_directions.append(parsed_step)

    # print('parsed directions: ' , parsed_directions)
    # print('parsed ingredients: ', parsed_ingredients)

    #original recipe
    if transform_type == "0":
        readable = readable_recipe(name, parsed_ingredients, parsed_directions)
        for item in readable:
            print(item)
        reiteration(url)

    #transform to healthy
    elif transform_type == "1":
        recipe_name, p_ingredients, p_directions = transform_healthy(
            name, parsed_ingredients, parsed_directions)
        readable = readable_recipe(recipe_name, p_ingredients, p_directions)
        for item in readable:
            print(item)
        reiteration(url)

    #transform to unhealthy
    elif transform_type == "2":
        recipe_names, p_ingredient, p_direction = transform_unhealthy(
            name, parsed_ingredients, parsed_directions)
        readable = readable_recipe(recipe_names, p_ingredient, p_direction)
        for item in readable:
            print(item)
        reiteration(url)

    #transform to vegetarian
    elif transform_type == "3":
        recipe_names, p_ingredient, p_direction = transform_vegetarian(
            name, parsed_ingredients, parsed_directions)
        readable = readable_recipe(recipe_names, p_ingredient, p_direction)
        for item in readable:
            print(item)
        reiteration(url)

    #transform to non-vegetarian
    elif transform_type == "4":
        recipe_names, p_ingredient, p_direction = transform_nonvegetarian(
            name, parsed_ingredients, parsed_directions)
        readable = readable_recipe(recipe_names, p_ingredient, p_direction)
        for item in readable:
            print(item)
        reiteration(url)

    #transform to Sicilian Cuisine
    elif transform_type == "5":
        recipe_names, p_ingredient, p_direction = transform_siciliancuisine(
            name, parsed_ingredients, parsed_directions)
        readable = readable_recipe(recipe_names, p_ingredient, p_direction)
        print('\nSicilian Style Cuisine:')
        for item in readable:
            print(item)
        reiteration(url)

    #transform to Korean Cuisine
    elif transform_type == "6":
        recipe_names, p_ingredient, p_direction = transform_korean(
            name, parsed_ingredients, parsed_directions)
        readable = readable_recipe(recipe_names, p_ingredient, p_direction)
        print('\nKorean Style Cuisine:')
        for item in readable:
            print(item)
        reiteration(url)

    elif transform_type == "7":
        main()

    #exit out of the code
    elif transform_type == '8':
        print("Now exiting...")
Example #10
                        default=False,
                        help="Use a hashing feature vectorizer")
    parser.add_argument(
        "--n-features",
        type=int,
        default=10000,
        help="Maximum number of features (dimensions) to extract from text.")
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
                        help="Log progress reports from inside the k-means algorithm.")
    args = parser.parse_args()
    print(args)
    print("\n")
    # TODO (afterthought): add a parameter to switch between the script's different modes.
    generate_cert.create_cert()
    get_history.copy_chrome_history(args.file_path, args.current_dir)
    t_now = time.time()
    urls = parse_html.get_urls(args.current_dir, args.url_limit)
    text_docs = parse_html.parse_html(urls)
    t_after_parse = time.time()
    logging.info("Parsing took: %0.3f minutes" % ((t_after_parse - t_now) / 60.0))
    doc_clusters, doc_cluster_terms, vectorizer, lsa = cluster_documents.cluster_docs(
        text_docs, args)
    t_after_clustering = time.time()
    logging.info("Clustering took: %0.3f minutes" % ((t_after_clustering - t_after_parse) / 60.0))
    logging.info("History collected, parsed and ready for recommendations.")
    get_history.open_chrome(args.chrome_path, args.chrome_url)
    run()
Example #12
# these are file locations for files containing actor data, director data, name_aka data,
# titles_aka data, titles data.
act_loc = sys.argv[1]
dir_loc = sys.argv[2]
nam_aka_loc = sys.argv[3]
tit_aka_loc = sys.argv[4]
tit_loc = sys.argv[5]
# final_output is the file you want the final results to be put in
final_output = sys.argv[6]

# connect to database (change details to connect to different db)
db = DB.connect(host='', user='', passwd='', db='')
cursor = db.cursor()

import_tables.create_tables(db, cursor)
import_tables.insert_data(db, cursor, act_loc, "actors")
import_tables.insert_data(db, cursor, dir_loc, "directors")
import_tables.insert_data(db, cursor, nam_aka_loc, "names_aka")
import_tables.insert_data(db, cursor, tit_aka_loc, "titles_aka")
import_tables.insert_data(db, cursor, tit_loc, "titles")

parse_html.parse_html(db, cursor)

match_movies.match_movies(db, cursor)
match_movies.create_result(db, cursor, final_output)

db.commit()
db.close()

print("script completed successfully :)")
Example #13
import parse_html as ph
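# Assumes ThreadPool (e.g. multiprocessing.pool.ThreadPool) and the page-fetching helper module gh are imported earlier in the script.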

NUM = 5
urls = gh.get_url(NUM)
pool = ThreadPool(4)
htmls = pool.map(gh.get_info, urls[100:150])
pool.close()
pool.join()

names = []
creditors = []
areas = []
moneys = []
dates = []

for html in htmls:
    if ph.judge(html) == "可以正常解析数据!":  # judge() returns this marker ("data can be parsed normally") when the page is parseable
        name, creditor, area, money, date = ph.parse_html(html)

        print(name)
        print(creditor)
        print(area)
        print(money)
        print(date)

        names.append(name)
        creditors.append(creditor)
        areas.append(area)
        moneys.append(money)
        dates.append(date)
Example #14
 def test_parse_sfacg_ok(self):
     ok_url = 'http://comic.sfacg.com/HTML/XFGJ/001j/'
     r = parse_html.parse_html(ok_url)
     self.assertTrue(r.ok)
Example #15
 def test_parse_sfacg_fail(self):
     fail_url = 'http://comic.sfacg.com/HTML/XFGJ/001jsdf/'
     r = parse_html.parse_html(fail_url)
     self.assertFalse(r.ok)
Example #16
 def test_parse_8comic_ok(self):
     ok_url = 'http://new.comicvip.com/show/cool-7340.html?ch=54'
     r = parse_html.parse_html(ok_url)
     self.assertTrue(r.ok)
Example #17
 def test_parse_sfacg_name(self):
     ok_url = 'http://comic.sfacg.com/HTML/XFGJ/001j/'
     r = parse_html.parse_html(ok_url)
     self.assertEqual(unicode, type(r.data['name']))
Example #18
 def test_parse_sfacg_head(self):
     head_url = 'http://comic.sfacg.com/HTML/XFGJ/001j/'
     r = parse_html.parse_html(head_url)
     self.assertIsNone(r.data['prev_url'])
     self.assertIsNotNone(r.data['next_url'])