Example #1
import json

# bot and db are project modules; import the submodules used below
import bot.pokemon
import bot.web
import db


def main():
    print("[+] Starting pokemon scraper bot")

    print("[+] Loading ./config.json file")
    with open("config.json") as json_data_file:
        config = json.load(json_data_file)
    print("[+] Success config loaded")

    driver = bot.web.gen_driver()
    raw_data = bot.pokemon.gather_base_pokemon(driver)
    data = bot.pokemon.gather_specific_pokemon_data(driver, raw_data,
                                                    config["client_id"])

    print("[+] Connecting to db " + config["db_name"])
    db_connection = db.connect_to_db(config["db_name"])
    print("[+] Success connected to db")

    print("[+] Adding items to " + config["collection_name"])
    for item in data:
        db.insert_item(db_connection, config["collection_name"], item)
    print("[+] Success all items have been added")

    print("[+] Scraper bot complete")
Example #2
import pickle
import sys

import db
import wowhead_scrape

# load the pickled TSM data dump passed on the command line
with open(sys.argv[1], "rb") as f:
    tsm_data = pickle.load(f)
for item_id in tsm_data:
    if db.get_item(item_id) is None:
        print("Scanning item " + item_id)
        item = wowhead_scrape.scrape_item(item_id)
        print(item)
        db.insert_item(item)
        print('Done')
        sys.stdout.flush()
Example #3
import json

# q, gpool, fetcher, encoding, extracter, db, config and logger are
# module-level objects defined elsewhere in this project


def work():
    jobs = q.getjobs()
    while jobs:
        for job in jobs:
            print('job', job)
            gpool.spawn(handle, job, queue=q)
        gpool.join()
        jobs = q.getjobs()


def handle(job, *args, **kwargs):
    queue = kwargs['queue']
    task = json.loads(job)
    url = task["url"]
    status, source = fetcher.fetch(url, use_proxy=False)
    logger.info('%s|%s' % (url, status))
    try:
        _, ucontent = encoding.html_to_unicode('', source)
    except Exception as e:
        # bail out early: ucontent is undefined if decoding fails
        print(e)
        return []
    item = extracter.extract_sohutv_data_by_regex(url, ucontent)
    db.insert_item(item)
    urls = extracter.extract_sohutv(url, source)
    for i in urls:
        if queue.check_fetched(config.BITMAP, i):
            continue
        queue.lpush('{"url": "%s"}' % i)
    return urls

if __name__ == '__main__':
    work()
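
The queue object q is not shown. Its lpush, getjobs, and check_fetched(config.BITMAP, i) calls suggest a Redis list of pending jobs plus a bitmap used to deduplicate URLs. A hedged sketch of such a queue, assuming redis-py; the method names mirror the snippet, everything else is an assumption:

# hypothetical job queue, assuming a local Redis and the redis-py client
import hashlib

import redis


class JobQueue:
    def __init__(self, key='jobs'):
        self.r = redis.Redis()
        self.key = key

    def lpush(self, job):
        self.r.lpush(self.key, job)

    def getjobs(self, batch=100):
        """Pop up to `batch` pending jobs off the list."""
        jobs = []
        for _ in range(batch):
            job = self.r.rpop(self.key)
            if job is None:
                break
            jobs.append(job)
        return jobs

    def check_fetched(self, bitmap_key, url):
        """Return True if the URL was seen before; otherwise mark it seen."""
        offset = int(hashlib.md5(url.encode()).hexdigest(), 16) % (1 << 30)
        if self.r.getbit(bitmap_key, offset):
            return True
        self.r.setbit(bitmap_key, offset, 1)
        return False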
Example #4
        info = {
            'name': s['name'],
            'url': clean_url(domain),
            'socialScore': get_popularity('http://' + url)['totalScore'],
            'worldRank': world_rank,
            'countryRank': country_rank,
            'aboutUsUrl': ('' if meta['aboutUsUrl'] in
                           ['http://#', 'https://#'] else meta['aboutUsUrl']),
            'contactUsUrl': ('' if meta['contactUsUrl'] in
                             ['http://#', 'https://#'] else meta['contactUsUrl']),
            'faviconUrl': '',
            'title': '',
            'verifiedByUserId': 'c7d23f36-ca28-53fc-bbeb-5de9a4b05d6a',
            'timestamp': r.now().in_timezone('+08:00')
        }

        print(clean_url(domain))

        insert_item(info, 'sources')

        time.sleep(1)
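
r.now().in_timezone('+08:00') is RethinkDB's ReQL, so insert_item here presumably wraps a RethinkDB insert. A plausible sketch, assuming the pre-2.4 rethinkdb driver (newer versions use `from rethinkdb import RethinkDB`); the connection details are invented:

# hypothetical insert_item, assuming RethinkDB
import rethinkdb as r

conn = r.connect(host='localhost', port=28015, db='aggregator')  # assumed


def insert_item(document, table_name):
    """Insert one document into the given RethinkDB table."""
    return r.table(table_name).insert(document).run(conn)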
Example #5
 print("Please login before continue.")
 username = input("Enter username: "******"Enter password: "******"admin":
         choice = get_admins_option()
         if choice == 1:
             items = db.get_items()
             for x in items:
                 print("Item Name: {} \t Item Price: {}".format(x[1], x[2]))
         if choice == 2:
             item_name = input("Enter product name: ")
             price = int(input("Enter price: "))
             add_product = db.insert_item(item_name, price, user[0])
             if add_product:
                 print("Product added successfully")
         if choice == 3:
             item_id = input("Enter ID to delete item: ")
             response = db.remove_item(item_id)
             if response:
                 print("Item removed successfully!!!!!")
         if choice == 4:
             order_report = db.get_all_orders()
             print("ID \t items \t user \t Total Amount")
             for x in order_report:
                 id = x[0]
                 item_name = get_item_name(db, x[1])
                 usersname = get_users_name(db, x[3])
                 print(x[0], "\t", item_name, "\t", usersname, "\t", x[4])
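
The db module behind this admin menu is not included. A minimal sketch of insert_item as called above (product name, price, and the id of the user adding it), assuming SQLite; the table and column names are invented for illustration:

# hypothetical db helper, assuming SQLite via the standard library
import sqlite3

conn = sqlite3.connect('shop.db')  # assumed database file


def insert_item(item_name, price, user_id):
    """Insert a product row; returns True on success."""
    try:
        conn.execute(
            'INSERT INTO items (name, price, added_by) VALUES (?, ?, ?)',
            (item_name, price, user_id))
        conn.commit()
        return True
    except sqlite3.Error:
        return False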
Example #6
    def load(self, file_name):
        """Loads a CSV file."""

        for table in tables:
            db.add_table(tables[table])

        body_data = []
        with open(file_name, encoding='utf-8') as csv_file:
            reader = csv.reader(csv_file, delimiter=',')
            # skip header
            next(reader)
            # read body data
            for row in reader:
                body_data.append(row)

        # the header columns are as follows; this list was taken manually from the csv file
        # 0: 'id'
        # 1: 'media_type'
        # 2: 'name'
        # 3: 'short_name'
        # 4: 'long_description'
        # 5: 'short_description'
        # 6: 'created_at'
        # 7: 'updated_at'
        # 8: 'review_url'
        # 9: 'review_score'
        # 10: 'slug'
        # 11: 'genres'
        # 12: 'created_by'
        # 13: 'published_by'
        # 14: 'franchises'
        # 15: 'regions'

        # for the strings, the following characters must be escaped: < > ' " &
        # for header indices 11 through 15, these are lists of comma separated data surrounded by curly braces
        # these must be separated and put in the appropriate MySQL table

        # process each row
        for row in body_data:
            # id is skipped as a new id will be created by MySQL
            # media type is a simple string: 'Movie', 'Show', 'Comic'
            media_type = row[1]
            # full name of the item
            name = html.escape(row[2])
            # shortened name of the item
            short_name = html.escape(row[3])
            # short and long descriptions are both raw HTML elements and must be escaped for safety
            long_desc = html.escape(row[4])
            short_desc = html.escape(row[5])
            # create/update timestamps are not extracted as they will be automatically inserted by MySQL
            # the link is escaped as well for safety, though escaping may corrupt URLs that contain & or quote characters
            review_url = html.escape(row[8])
            # the review score is a decimal number from 0.0 to 10.0
            review_score = float(row[9])
            # extract the slug
            slug = html.escape(row[10])
            # parse the list of genres
            genres = self.parse_list(row[11])
            # parse the list of studios (the 'created_by' column)
            studios = self.parse_list(row[12])
            # parse the list of publishers (the 'published_by' column)
            publishers = self.parse_list(row[13])
            # parse the list of franchises
            franchises = self.parse_list(row[14])
            # parse the list of regions
            regions = self.parse_list(row[15])

            # check if this item already exists by name
            if db.get_one_item(name=name) is not None:
                continue

            item_id = db.insert_item(media_type, name, short_name, long_desc,
                                     short_desc, review_url, review_score,
                                     slug)
            db.insert_genre_item_joins(genres, item_id)
            db.insert_studio_item_joins(studios, item_id)
            db.insert_publisher_item_joins(publishers, item_id)
            db.insert_franchise_item_joins(franchises, item_id)
            db.insert_region_item_joins(regions, item_id)
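
parse_list is not shown. Given the comment that columns 11 through 15 hold comma-separated values wrapped in curly braces (e.g. {Action,Adventure}), a plausible sketch:

# hypothetical parse_list, based on the column format described above
import html


def parse_list(raw):
    """Split a '{a,b,c}'-style CSV cell into a list of escaped strings."""
    stripped = raw.strip().strip('{}')
    if not stripped:
        return []
    return [html.escape(part.strip()) for part in stripped.split(',')]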
Example #7
                'crawlerName': 'credible crawler'
            })

            if re.search('/(category|gallery|photos?)/', article.url,
                         re.IGNORECASE):
                if PY_ENV == 'development':
                    print('\n(NOT AN ARTICLE PAGE) Skipped: ' +
                          str(article.url) + '\n')
                slp_time = insert_log(
                    source_id, 'articleCrawl', 'error',
                    float(time.perf_counter() - start_time), {  # time.clock() was removed in Python 3.8
                        'articleUrl': article.url,
                        'errorMessage': 'NOT AN ARTICLE PAGE',
                        'crawlerName': 'credible crawler'
                    })
                insert_item({'id': article.id}, 'errorArticles')
                continue

            try:
                article.download()
                article.parse()

                title = article.title
                title_split = article.title.split('|')

                if len(title_split) != 1:
                    title = title_split[0].strip()

                pattern = re.compile(source.brand, re.IGNORECASE)
                body = pattern.sub('', article.text)
                categories = categorize(body)
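
The article object's download() and parse() calls are the newspaper library's API. For context, minimal standalone usage (the URL is a placeholder):

# minimal newspaper (newspaper3k) usage, mirroring the calls above
from newspaper import Article

article = Article('https://example.com/some-story')  # placeholder URL
article.download()  # fetch the HTML
article.parse()     # populate .title, .text, etc.
print(article.title)
print(article.text[:200])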
Example #8
    rows = ''
    panels = ''
    panel_number = 1
    id_number = 1
    r_number = 0
    # db.add_category_table()    # fills the category table; only needs to run once
    parser = Parser()
    for category in DATA['categories']:
        category_ids = parser.get_category_ids(category)
        for item in category_ids:
            item_detail = parser.get_item_details(item)
            title = item_detail['title'].replace('\'', '"')
            # Try insert data in "item" table
            try:
                db.insert_item(category, item_detail['id'], item_detail['by'],
                               item_detail['score'], item_detail['time'],
                               title, item_detail['type'],
                               DATA['item_url'].format(item_detail['id']))
            except Exception:
                # record the failed insert and keep going
                logging.getLogger().warning(
                    'failed to insert item %s', item_detail['id'])
            rows += row.format(row_text=title, collapse=id_number) + '\n'
            id_number += 1
            r_number += 1

            if r_number == 2:  # limit to two items per category; delete this check to parse everything
                break
        r_number = 0
        panels += panel.format(
            category_name=category, rows=rows, collapse=panel_number) + '\n'
        panel_number += 1
        rows = ''
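
The Parser class is not shown, but the item fields it returns ('by', 'score', 'time', 'title', 'type') match the Hacker News API's item schema. A plausible sketch, assuming that API; the endpoints are the real Firebase ones, the class shape is a guess:

# hypothetical Parser, assuming the Hacker News Firebase API
import requests

HN_API = 'https://hacker-news.firebaseio.com/v0'


class Parser:
    def get_category_ids(self, category):
        """Return story ids for a category such as 'topstories'."""
        resp = requests.get('{}/{}.json'.format(HN_API, category))
        resp.raise_for_status()
        return resp.json()

    def get_item_details(self, item_id):
        """Return the raw item dict ('by', 'score', 'time', 'title', ...)."""
        resp = requests.get('{}/item/{}.json'.format(HN_API, item_id))
        resp.raise_for_status()
        return resp.json()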