def fetch_listing():
    global crawl_time
    out_0 = open("products-0.txt", "a")
    out_many = open("products-many.txt", "a")
    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = make_request(url)
    if not page:
        return
    # print(url)
    items = page.find_all("li", class_="s-result-item")
    log("Found {} items on {}".format(len(items), url))

    if len(items) == 0:
        out_0.write(str(url) + "\n")
    else:
        out_many.write(str(url) + "\n")

    # input()
    # for item in items[:settings.max_details_per_listing]:
    #     try:
    #         out.write(item.get_text() + "\n")
    #     except:
    #         pass
    #     product_url = get_url(item)
    #     product_price = get_price(item)
    #     product = ProductRecord(
    #         title=product_title,
    #         product_url=format_url(product_url),
    #         listing_url=format_url(url),
    #         price=product_price,
    #         primary_img=product_image,
    #         crawl_time=crawl_time
    #     )
    #     product_id = product.save()
    #     # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        page_number = int(url.split("&page=")[1]) + 1
        enqueue_url(url.split("&page=")[0] + "&page=" + str(page_number))
        pile.spawn(fetch_listing)
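
# Illustration of the "&page=" pagination rewrite used in the variant above.
# The URL below is hypothetical; real listing URLs come from the queue, and the
# rewrite assumes "&page=" is always present and appears last in the URL.
example_url = "https://www.amazon.com/s?k=headphones&page=3"
base, page_number = example_url.split("&page=")
next_url = base + "&page=" + str(int(page_number) + 1)
assert next_url == "https://www.amazon.com/s?k=headphones&page=4"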

def fetch_listing():
    global crawl_time
    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return
    # redis-py returns bytes by default; decode only after the empty-queue check
    url = url.decode('utf-8')

    page, html = make_request(url)
    if not page:
        return
    items = page.findAll("div", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:
        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time
        )
        product_id = product.save()
        # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"])
        pile.spawn(fetch_listing)
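
# The decode above is needed because redis-py returns bytes by default. A
# minimal sketch of queue helpers backed by a plain Redis list; the key name
# and connection settings are illustrative, not the project's actual helpers.
# (Passing decode_responses=True to the client would avoid the manual decode.)
import redis

r = redis.StrictRedis(host="localhost", port=6379, db=0)

def enqueue_url(url):
    return r.rpush("listing_url_queue", url)

def dequeue_url():
    return r.lpop("listing_url_queue")  # bytes or None; callers must decode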

def fetch_listing():
    '''
    This is the root function that green threads call.

    This is essentially step 1 (but step 0 is above!)
    '''
    global crawl_time

    # Pop a random URL from the Redis listing_url_queue
    url = helpers.dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = helpers.make_request(url)
    if not page:
        return
    items = page.findAll("li", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:
        product_image = extractors.get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = extractors.get_title(item)
        product_url = extractors.get_url(item)
        product_price = extractors.get_price(item)

        product = models.ProductRecord(
            title=product_title,
            product_url=helpers.format_url(product_url),
            listing_url=helpers.format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time
        )
        product_id = product.save()
        helpers.download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        helpers.enqueue_url(next_link["href"])
        pile.spawn(fetch_listing)
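
# helpers.download_image is called above but not shown. A minimal sketch under
# the assumption that it saves the primary image under the product id; the
# output directory and filename scheme are illustrative, not the original
# helpers module.
import os
import requests

def download_image(image_url, product_id, out_dir="images"):
    os.makedirs(out_dir, exist_ok=True)
    response = requests.get(image_url, timeout=10)
    response.raise_for_status()
    path = os.path.join(out_dir, "{}.jpg".format(product_id))
    with open(path, "wb") as f:
        f.write(response.content)
    return path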

def fetch_listing():
    global crawl_time
    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = make_request(url)
    if not page:
        return
    items = page.findAll("li", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:
        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time
        )
        product_id = product.save()
        # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"])
        pile.spawn(fetch_listing)
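
# The fetch_listing variants above all re-spawn themselves on a shared eventlet
# GreenPile. A minimal sketch of the driver assumed to surround them; the pool
# size and the number of seed workers are illustrative.
import eventlet

pool = eventlet.GreenPool(10)
pile = eventlet.GreenPile(pool)

if __name__ == "__main__":
    for _ in range(10):
        pile.spawn(fetch_listing)  # seed the initial green threads
    pool.waitall()                 # block until every worker finishes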

def exploit_listings_urls(platform):
    print('\n---\n\nStart exploitation\n\n---\n')
    # go through start urls
    time_watcher = models.TimeWatcher(MIN_SLEEP_TIME)
    sql_handler = models.SQLHandler()
    flag = True
    while flag:
        url = helpers.dequeue_url('listing_files', platform)
        if url:
            page = helpers.make_request(url, platform, time_watcher)
            if not page:
                continue
            try:
                handlers.handle_listing(page, platform, url, sql_handler)
            except Exception as e:
                helpers.queue_url(url, 'listing_files', platform)
                raise Exception('Exception: {}'.format(e))
        else:
            flag = False
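
# models.TimeWatcher and MIN_SLEEP_TIME are referenced above but not defined
# here. A minimal sketch of a rate limiter with that shape; the class body and
# the wait() method name are entirely assumptions about the original code.
import time

class TimeWatcher:
    def __init__(self, min_sleep_time):
        self.min_sleep_time = min_sleep_time
        self.last_request = 0.0

    def wait(self):
        # Keep at least min_sleep_time seconds between consecutive requests.
        elapsed = time.time() - self.last_request
        if elapsed < self.min_sleep_time:
            time.sleep(self.min_sleep_time - elapsed)
        self.last_request = time.time()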

def fetch_products():
    item = dequeue_url()
    print(item)
    page, html = make_request_cfg(item["url"])
    document = {}

    def mark_completed():
        # record this listing URL as fully processed
        prod = {}
        prod['cat1'] = item['category1']
        prod['cat2'] = item['category2']
        prod['cat3'] = item['category3']
        prod['cat4'] = item['category4']
        prod['cat5'] = item['category5']
        prod['cat6'] = item['category6']
        prod['cat7'] = item['category7']
        if item['url'].split("page=")[1] == "1":
            prod['url'] = item['url'].replace("?bbn=1&dc&", "?")
        else:
            prod['url'] = item['url']
        prod['completed'] = 1
        save_DB_completed(prod)

    if page is not None:
        captcha = get_captcha(page)
        if captcha is not None:
            # re-queue the item and rotate to the next request header profile
            print("[Warning] caught by captcha!!! id: {}".format(get_header_id()))
            enqueue_url(item)
            set_header_id((get_header_id() + 1) % 6)
        else:
            # look for product ASINs on this listing page
            asins = get_asin(page)
            titles = get_title(page)
            #stars = get_star(page)
            #reviewnums = get_reviewnum(page)
            if asins is not None:
                if len(asins) != 0:
                    for index in range(0, len(asins)):
                        document = {}
                        document['asin'] = asins[index]
                        #document['title'] = titles[index]
                        document['title'] = ""
                        #document['star'] = stars[index]
                        #document['reviewnum'] = reviewnums[index]
                        document['category1'] = item['category1']
                        document['category2'] = item['category2']
                        document['category3'] = item['category3']
                        document['category4'] = item['category4']
                        document['category5'] = item['category5']
                        document['category6'] = item['category6']
                        document['category7'] = item['category7']
                        document['date'] = datetime.now()
                        print(document)
                        save_DB(document)
                    mark_completed()
                else:
                    print("[Warning] missing product info1")
                    mark_completed()
            else:
                print(item)
                print("[Warning] missing product info2")
                mark_completed()
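
# get_captcha above is not shown. A hedged sketch of one way to detect Amazon's
# captcha interstitial in a BeautifulSoup document; the form action below is
# typical of that page, but treat the whole body as an assumption rather than
# the project's actual implementation.
def get_captcha(page):
    form = page.find("form", action="/errors/validateCaptcha")
    if form:
        img = form.find("img")
        return img["src"] if img and img.has_attr("src") else True
    return None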

def fetch_products_detail():
    asin = dequeue_url()
    url = 'https://www.amazon.com/dp/' + asin
    print(url)

    # previously stored snapshot of this product
    DB_product = get_DB_product(asin)[0]
    item = {}
    item['code'] = DB_product['code']
    item['title'] = DB_product['title']
    item['price'] = DB_product['price']
    item['byLineInfo'] = DB_product['byLineInfo']
    item['sellerNum'] = DB_product['sellerNum']
    item['salesRank'] = DB_product['salesRank']
    item['avgRating'] = DB_product['avgRating']
    item['ratingNum'] = DB_product['ratingNum']

    page, html = make_request_cfg(url)
    product = {}
    if page == 503:
        return None
    elif page is None:
        return None

    captcha = get_captcha(page)
    if captcha is not None:
        # re-queue the ASIN and rotate to the next request header profile
        print("[Warning] caught by captcha!!! id: {}".format(get_header_id()))
        enqueue_url(asin)
        set_header_id((get_header_id() + 1) % 6)
        return

    # freshly scraped snapshot
    product['code'] = asin
    product['title'] = get_title_detail(page)
    product['price'] = get_price_detail(page)
    product['byLineInfo'] = get_byLineInfo_detail(page)
    product['sellerNum'] = get_sellerNum_detail(page)
    product['salesRank'] = get_salesRank_detail(page)
    product['avgRating'] = get_avgRating_detail(page)
    product['ratingNum'] = get_ratingNum_detail(page)

    if item == product:
        return

    flag = 0
    message_price = ''
    message_salesRank = ''
    message_sellerNum = ''
    threshold = 5  # only report sales-rank increases above 5%

    # price change
    if item['price'] != product['price']:
        print('price changed')
        message_price = str(item['price']) + '-> ' + str(product['price']) + '\n'
        flag = 1

    # sales rank change: salesRank is a flat list of (rank, category) pairs
    if item['salesRank'] != product['salesRank']:
        for offset in range(0, len(item['salesRank']), 2):
            cat = item['salesRank'][offset + 1]
            previousRank = int(item['salesRank'][offset].replace(",", ""))
            currentRank = int(product['salesRank'][offset].replace(",", ""))
            if previousRank == currentRank:
                continue
            diff = abs(currentRank - previousRank)
            diff_percentage = float(diff / previousRank * 100)
            sign = '+' if currentRank > previousRank else '-'
            if diff_percentage > threshold and sign == '+':
                diff_percentage = '%.2f' % diff_percentage
                message_salesRank += (cat + ' (' + sign + str(diff_percentage) +
                                      '%, ' + sign + str(diff) + ') ' +
                                      product['salesRank'][offset] + '\n')
                flag = 1

    # seller number change
    if int(item['sellerNum']) != int(product['sellerNum']):
        print('sellerNum changed')
        message_sellerNum = str(item['sellerNum']) + '-> ' + str(product['sellerNum']) + '\n'
        flag = 1

    if flag == 1:
        message = (str(datetime.now()).split('.')[0] + '\t' +
                   'https://amazon.com/dp/' + str(asin) + '\n')
        if message_price != '':
            message += 'price changed: ' + message_price
        if message_salesRank != '':
            message += 'sales rank changed:\n' + message_salesRank
        if message_sellerNum != '':
            message += 'seller number changed: ' + message_sellerNum
        print(message)
        #trigger_slackmessage(message)
        update_DB_detail(asin, product)
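
# trigger_slackmessage(message) is commented out above. A minimal sketch of a
# Slack incoming-webhook notifier it could correspond to; the webhook URL is a
# placeholder and the function body is an assumption, not the original code.
import requests

SLACK_WEBHOOK_URL = "https://hooks.slack.com/services/XXX/YYY/ZZZ"

def trigger_slackmessage(message):
    response = requests.post(SLACK_WEBHOOK_URL, json={"text": message})
    response.raise_for_status()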

def fetch_listing():
    global crawl_time
    url, category_code, mode = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    # make request through selenium
    products_robot = ProductsRobot().run(url)
    try:
        # wait briefly for the product details section to render before parsing
        WebDriverWait(products_robot, 2).until(
            EC.presence_of_element_located((By.ID, "prodDetails"))
        )
    except TimeoutException:
        pass
    finally:
        page = BeautifulSoup(products_robot.page_source, "html.parser")
        products_robot.quit()

    # TODO: move this logic into get_products_link
    items = []
    items_container = page.find(id="mainResults")
    if items_container:
        items = items_container.find_all(id=re.compile(r'result_\d*'))
    log("Found {} items on {}".format(len(items), url))

    crawler = CrawlerAmazonContext().define_type_product_detail_crawler(category_code)
    if mode == settings.LINK_DETAIL_PRODUCT:
        crawler.get_products_link(items, category_code)
    elif mode == settings.PRODUCT_CRAWLER:
        crawler.get_product_info(url, page, category_code)

    # page, html = make_request(url)  TODO: delete
    # if not page:
    #     return
    #
    # items = page.findAll("li", "s-result-item")
    '''
    for item in items[:settings.max_details_per_listing]:
        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time,
            category_code=category_code,
            category=CATEGORY_LABELS[int(category_code)]
        )
        product_id = product.save()
        # download_image(product_image, product_id)
    '''

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"], category_code)
        pile.spawn(fetch_listing)
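
# ProductsRobot is not shown above; the call site only expects run(url) to
# return a live WebDriver (it reads page_source and calls quit). A hedged
# sketch using headless Chrome; the driver choice and options are assumptions.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class ProductsRobot:
    def run(self, url):
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        return driver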