import json

import bot  # project-local scraper package (bot.web, bot.pokemon)
import db   # project-local database helpers


def main():
    print("[+] Starting pokemon scraper bot")
    print("[+] Loading ./config.json file")
    with open("config.json") as json_data_file:
        config = json.load(json_data_file)
    print("[+] Success config loaded")

    # scrape the base pokemon list, then the per-pokemon details
    driver = bot.web.gen_driver()
    raw_data = bot.pokemon.gather_base_pokemon(driver)
    data = bot.pokemon.gather_specific_pokemon_data(driver, raw_data, config["client_id"])

    print("[+] Connecting to db " + config["db_name"])
    db_connection = db.connect_to_db(config["db_name"])
    print("[+] Success connected to db")

    print("[+] Adding items to " + config["collection_name"])
    for item in data:
        db.insert_item(db_connection, config["collection_name"], item)
    print("[+] Success all items have been added")
    print("[+] Scraper bot complete")
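The scraper above only relies on two helpers from the db module: connect_to_db(db_name) and insert_item(connection, collection_name, item). A minimal sketch of such a module, assuming the backend is MongoDB (suggested by the db_name/collection_name config keys) and that pymongo is available; the host and port defaults are illustrative, not taken from the original project.

# db.py -- minimal sketch, assuming a MongoDB backend via pymongo
from pymongo import MongoClient


def connect_to_db(db_name, host="localhost", port=27017):
    """Return a handle to the named database (host/port are assumed defaults)."""
    client = MongoClient(host, port)
    return client[db_name]


def insert_item(db_connection, collection_name, item):
    """Insert a single scraped document into the given collection."""
    return db_connection[collection_name].insert_one(item)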
import pickle
import sys

import db              # project-local database helpers
import wowhead_scrape  # project-local Wowhead scraper

# TSM item data is passed in as a pickled file on the command line
with open(sys.argv[1], "rb") as f:
    tsm_data = pickle.load(f)

for item_id in tsm_data:
    # only scrape items that are not already in the database
    if db.get_item(item_id) is None:
        print("Scanning item " + item_id)
        item = wowhead_scrape.scrape_item(item_id)
        print(item)
        db.insert_item(item)

print('Done')
sys.stdout.flush()
import json


def work():
    # drain the job queue, spawning a greenlet per job; the setup of `q`,
    # `gpool`, `logger` and the fetcher/extracter/db/config/encoding modules
    # is not shown in this snippet
    jobs = q.getjobs()
    while jobs:
        for job in jobs:
            print('job', job)
            gpool.spawn(handle, job, queue=q)
        gpool.join()
        jobs = q.getjobs()


def handle(job, *args, **kwargs):
    queue = kwargs['queue']
    task = json.loads(job)
    url = task["url"]
    status, source = fetcher.fetch(url, use_proxy=False)
    logger.info('%s|%s' % (url, status))
    try:
        _, ucontent = encoding.html_to_unicode('', source)
    except Exception as e:
        print(e)
        return []  # without decoded content there is nothing to extract
    item = extracter.extract_sohutv_data_by_regex(url, ucontent)
    db.insert_item(item)
    urls = extracter.extract_sohutv(url, source)
    for i in urls:
        if queue.check_fetched(config.BITMAP, i):
            continue
        queue.lpush('{"url": "%s"}' % i)
    return urls


if __name__ == '__main__':
    work()
# fragment: the opening of this dict is truncated in the source; the dict is
# the `info` record passed to insert_item() below
info = {
    'name': s['name'],  # assumed key name for the truncated first entry
    'url': clean_url(domain),
    'socialScore': get_popularity('http://' + url)['totalScore'],
    'worldRank': world_rank,
    'countryRank': country_rank,
    'aboutUsUrl': '' if meta['aboutUsUrl'] in ['http://#', 'https://#'] else meta['aboutUsUrl'],
    'contactUsUrl': '' if meta['contactUsUrl'] in ['http://#', 'https://#'] else meta['contactUsUrl'],
    'faviconUrl': '',
    'title': '',
    "verifiedByUserId": "c7d23f36-ca28-53fc-bbeb-5de9a4b05d6a",
    'timestamp': r.now().in_timezone('+08:00')
}

print(clean_url(domain))
insert_item(info, 'sources')
time.sleep(1)
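The fragment above builds a source record and hands it to insert_item(info, 'sources'); the r.now() timestamp suggests RethinkDB. A minimal sketch of such a helper, assuming the legacy rethinkdb Python driver and a module-level connection; the connection settings and the conflict policy are placeholders, not taken from the original.

# minimal sketch, assuming RethinkDB via the legacy `rethinkdb` driver
import rethinkdb as r

conn = r.connect(host='localhost', port=28015, db='newsdb')  # placeholder connection settings


def insert_item(doc, table):
    """Insert one document into the given table; error on primary-key conflicts."""
    return r.table(table).insert(doc, conflict='error').run(conn)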
print("Please login before continue.") username = input("Enter username: "******"Enter password: "******"admin": choice = get_admins_option() if choice == 1: items = db.get_items() for x in items: print("Item Name: {} \t Item Price: {}".format(x[1], x[2])) if choice == 2: item_name = input("Enter product name: ") price = int(input("Enter price: ")) add_product = db.insert_item(item_name, price, user[0]) if add_product: print("Product added successfully") if choice == 3: item_id = input("Enter ID to delete item: ") response = db.remove_item(item_id) if response: print("Item removed successfully!!!!!") if choice == 4: order_report = db.get_all_orders() print("ID \t items \t user \t Total Amount") for x in order_report: id = x[0] item_name = get_item_name(db, x[1]) usersname = get_users_name(db, x[3]) print(x[0], "\t", item_name, "\t", usersname, "\t", x[4])
def load(self, file_name):
    """Loads a CSV file."""
    for table in tables:
        db.add_table(tables[table])

    body_data = []
    with open(file_name, encoding='utf-8') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        # skip header
        next(reader)
        # read body data
        for row in reader:
            body_data.append(row)

    # header data is of the form; this is manually taken from the csv file
    #  0: 'id'
    #  1: 'media_type'
    #  2: 'name'
    #  3: 'short_name'
    #  4: 'long_description'
    #  5: 'short_description'
    #  6: 'created_at'
    #  7: 'updated_at'
    #  8: 'review_url'
    #  9: 'review_score'
    # 10: 'slug'
    # 11: 'genres'
    # 12: 'created_by'
    # 13: 'published_by'
    # 14: 'franchises'
    # 15: 'regions'
    # for the strings, the following characters must be escaped: < > ' " &
    # for header indices 11 through 15, these are lists of comma separated data surrounded by curly braces
    # these must be separated and put in the appropriate MySQL table

    # process each row
    for row in body_data:
        # id is skipped as a new id will be created by MySQL
        # media type is a simple string: 'Movie', 'Show', 'Comic'
        media_type = row[1]
        # full name of the item
        name = html.escape(row[2])
        # shortened name of the item
        short_name = html.escape(row[3])
        # short and long descriptions are both raw HTML elements and must be escaped for safety
        long_desc = html.escape(row[4])
        short_desc = html.escape(row[5])
        # create/update timestamps are not extracted as they will be automatically inserted by MySQL
        # the link is escaped as well for safety; though this may cause corruption
        review_url = html.escape(row[8])
        # the review score is a decimal number from 0.0 to 10.0
        review_score = float(row[9])
        # extract the slug
        slug = html.escape(row[10])
        # parse the list of genres
        genres = self.parse_list(row[11])
        # parse the list of studios
        studios = self.parse_list(row[12])
        # parse the list of publishers
        publishers = self.parse_list(row[13])
        # parse the list of franchises
        franchises = self.parse_list(row[14])
        # parse the list of regions
        regions = self.parse_list(row[15])

        # check if this item already exists by name
        if db.get_one_item(name=name) is not None:
            continue

        item_id = db.insert_item(media_type, name, short_name, long_desc, short_desc,
                                 review_url, review_score, slug)
        db.insert_genre_item_joins(genres, item_id)
        db.insert_studio_item_joins(studios, item_id)
        db.insert_publisher_item_joins(publishers, item_id)
        db.insert_franchise_item_joins(franchises, item_id)
        db.insert_region_item_joins(regions, item_id)
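The loader above depends on self.parse_list to split columns 11 through 15, which its comments describe as comma separated values wrapped in curly braces. A minimal sketch of such a method under that assumption, escaping each value the same way the other string fields are escaped; the method name matches the call sites, but the body is illustrative rather than the project's actual implementation.

import html


def parse_list(self, raw):
    """Sketch: split a '{a,b,c}'-style CSV cell into a list of escaped strings."""
    stripped = raw.strip().strip('{}')
    if not stripped:
        return []
    return [html.escape(part.strip()) for part in stripped.split(',')]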
        'crawlerName': 'credible crawler'
    })

    if re.search('/(category|gallery|photos?)/', article.url, re.IGNORECASE):
        if PY_ENV == 'development':
            print('\n(NOT AN ARTICLE PAGE) Skipped: ' + str(article.url) + '\n')
        slp_time = insert_log(
            source_id, 'articleCrawl', 'error',
            float(time.clock() - start_time), {
                'articleUrl': article.url,
                'errorMessage': 'NOT AN ARTICLE PAGE',
                'crawlerName': 'credible crawler'
            })
        insert_item({'id': article.id}, 'errorArticles')
        continue

    try:
        article.download()
        article.parse()

        title = article.title
        title_split = article.title.split('|')
        if len(title_split) != 1:
            title = title_split[0].strip()

        pattern = re.compile(source.brand, re.IGNORECASE)
        body = pattern.sub('', article.text)
        categories = categorize(body)
rows = ''
panels = ''
panel_number = 1
id_number = 1
r_number = 0

# db.add_category_table()  # fill category table, call one time

parser = Parser()
for category in DATA['categories']:
    category_ids = parser.get_category_ids(category)
    for item in category_ids:
        item_detail = parser.get_item_details(item)
        title = item_detail['title'].replace('\'', '"')
        # Try to insert data into the "item" table
        try:
            db.insert_item(category, item_detail['id'], item_detail['by'],
                           item_detail['score'], item_detail['time'], title,
                           item_detail['type'],
                           DATA['item_url'].format(item_detail['id']))
        except Exception:
            # insert failed; raise the logger level so warnings are visible
            logger = logging.getLogger()
            logger.setLevel(logging.WARNING)
        rows += row.format(row_text=title, collapse=id_number) + '\n'
        id_number += 1
        r_number += 1
        if r_number == 2:
            # Delete it to parse all items
            break
    r_number = 0
    panels += panel.format(
        category_name=category, rows=rows, collapse=panel_number) + '\n'
    panel_number += 1
    rows = ''