def parse_price(self, response):
    # Callback for the per-variant AJAX price request. The response body is
    # JSON-escaped HTML, hence the literal backslashes in the markers below.
    csv_row = response.meta['row']
    parent_asin = response.meta['parent_asin']
    if 'a-size-medium a-color-price\\">' in response.body:
        ajax_price = response.body.split(
            'a-size-medium a-color-price\\">')[-1].split('<\\/')[0].replace('$', '')
        ajax_price = re.findall(r'\d+', ajax_price)[0]
        csv_row[5] = converter.cleanup_price(ajax_price)
    child_asin = csv_row[1]
    csv_dict[parent_asin][child_asin] = csv_row


if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(Amazon_Spider)
    process.start()
    # Once the crawl finishes, write each parent row followed by the child
    # variants that resolved to a non-empty price.
    mywriter = mycsv.initialize_csv('Amazon_Running_Shoes.csv')
    for parent_asin, child_dicts in csv_dict.iteritems():
        mywriter.writerow(child_dicts['Parent_Row'])
        del child_dicts['Parent_Row']
        for child_asin, values in child_dicts.iteritems():
            if values[5] != '':
                mywriter.writerow(values)
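# --- Illustrative sketch, not part of the original spider ---
# parse_price above expects 'row' and 'parent_asin' in Request.meta. The
# request that feeds it would be built roughly as below; the ajax_url
# argument stands in for however the spider derives the per-variant price
# endpoint, which this file does not show.
import scrapy

def make_price_request(spider, csv_row, parent_asin, ajax_url):
    # csv_row[1] is the child ASIN; spider.parse_price fills csv_row[5]
    # from the AJAX response body.
    return scrapy.Request(
        ajax_url,
        meta={'row': csv_row, 'parent_asin': parent_asin},
        callback=spider.parse_price)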
def parse_product(self, response):
    print "Parsing Product"
    sel = Selector(response)
    global output_file, count_output, mywriter
    # Limit each output file to 22 MB: roll over to a numbered file once
    # the current one grows past that.
    if mycsv.calculate_filesize(output_file) > 22:
        output_file = "%s-%s" % (output_file.split("-")[0], str(count_output))
        count_output += 1
        mywriter = mycsv.initialize_csv(output_file, category_name)

    product = {}
    browse_nodes_list = [
        node.strip().split("=")[-1]
        for node in sel.xpath(
            "//div[@id='wayfinding-breadcrumbs_feature_div']/ul//a/@href").extract()
    ]
    product.update(converter.clean_browsenodes(browse_nodes_list, category_name))
    product["item_name"] = sel.xpath("//span[@id='productTitle']/text()").extract()[0]
    product["external_product_id"] = (
        sel.xpath("//div[@id='tell-a-friend']/@data-dest").extract()[0]
        .split("parentASIN=")[-1].split("&")[0])
    product["item_sku"] = product["part_number"] = "LYS" + product["external_product_id"]
    try:
        product["brand_name"] = sel.xpath("//a[@id='brand']/text()").extract()[0]
    except IndexError:
        # Some pages show the brand as a logo image; fall back to the brand
        # link's URL path.
        product["brand_name"] = sel.xpath("//a[@id='brand']/@href").extract()[0].split("/")[1]
    product["manufacturer"] = product["brand_name"]
    (product["item_length"], product["item_height"], product["item_width"],
     product["item_dimensions_unit_of_measure"]) = converter.clean_dimensions(
        sel.xpath("//li[contains(text(),'inches')][contains(text(),'x')]/text()").extract())
    product["product_description"] = " ".join(
        sel.xpath("//div[@id='productDescription']/p/text()").extract())
    product.update(converter.clean_bullet_points(
        sel.xpath("//ul[@class='a-vertical a-spacing-none']//span/text()").extract()))
    product["parent_child"] = "Parent"
    product["department_name1"] = product["target_gender"] = \
        converter.clean_department_name(product["item_name"])
    product["generic_keywords1"] = product["generic_keywords"] = product["item_name"]

    output__dict = mycsv.default_values(product["item_name"], category_name)
    output__dict.update(product)

    variant_script = sel.xpath(
        "//script[@language='JavaScript'][contains(text(),'window.isTwisterAUI = 1')]"
    ).extract()
    if variant_script:
        print "Variants"
        # The Twister script carries the variant values per ASIN and the
        # customerImages script carries the image URLs per ASIN.
        image_script = sel.xpath(
            "//script[@type='text/javascript'][contains(text(),'customerImages')]"
        ).extract()[0]
        variant_dict, output__dict["variation_theme"] = variants.clean_variants(
            variant_script[0], image_script, product, mywriter, category_name)
        # Write the parent row, then one row per child variant.
        mycsv.write__csv(output__dict, mywriter)
        for asin, child_dict in variant_dict.iteritems():
            mycsv.write__csv(child_dict, mywriter)
    else:
        # No variants: take the main image plus up to three alternates,
        # then the single listed (or sale) price.
        images = sel.xpath("//img[@id='landingImage']/@data-a-dynamic-image").extract()[0]
        images = re.findall(r"(http.*?\.jpg)", images)
        output__dict["main_image_url"] = images[0]
        for index, image in enumerate(images[1:], 1):
            output__dict["other_image_url" + str(index)] = image
            if index == 3:
                break
        output__dict["variation_theme"] = ""
        output__dict["standard_price"] = converter.calculate_price(
            sel.xpath("//span[@id='priceblock_ourprice']/text() | "
                      "//span[@id='priceblock_saleprice']/text()")
            .extract()[0].split("-")[-1].strip("$"))
        mycsv.write__csv(output__dict, mywriter)
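# --- Illustrative alternative, an assumption rather than original code ---
# The data-a-dynamic-image attribute read above appears to be a JSON object
# mapping image URLs to [width, height] pairs, so the regex pass could be
# replaced with json.loads and the renditions ordered by pixel area:
import json
import re

def extract_images(dynamic_image_attr):
    # Largest rendition first; fall back to the regex if the attribute
    # turns out not to be valid JSON.
    try:
        sizes = json.loads(dynamic_image_attr)
        return sorted(sizes, key=lambda url: sizes[url][0] * sizes[url][1],
                      reverse=True)
    except ValueError:
        return re.findall(r"(http.*?\.jpg)", dynamic_image_attr)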
# def start_crawl(choice):
#     print sys.argv[0]
#     global output_file, category_name
#     category_name = choice
#     start_urls = get_start_urls(category_name)
#     output_file = category_name
#     mywriter = mycsv.initialize_csv(output_file, category_name)
#     process = CrawlerProcess()
#     process.crawl(Main_Scrapper, start_urls=start_urls)
#     process.start()


if __name__ == "__main__":
    print "Main Started"
    global start_urls, output_file, category_name
    # Rebuild the category name from the CLI arguments, restoring the
    # apostrophes that are awkward to type on the command line.
    category_name = " ".join(
        arg.replace("Mens", "Men's").replace("Womens", "Women's")
        for arg in sys.argv[1:])
    start_urls = start_urls.get_start_urls(category_name)
    output_file = category_name
    mywriter = mycsv.initialize_csv(output_file, category_name)
    process = CrawlerProcess()
    process.crawl(Main_Scrapper, start_urls=start_urls)
    # process.crawl(Main_Scrapper, start_urls=[
    #     'http://www.amazon.com/Timberland-Earthkeepers-Kempton-Oxford-Grain/dp/B00L2P58P4/'])
    #     'http://www.amazon.com/Magellan-eXplorist-510-Waterproof-Hiking/dp/B003Y5H17I'])
    process.start()
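# Usage sketch (the module name below is illustrative, not from the source):
# the category is assembled from everything after the script name, so a run
# such as
#
#     python main_scrapper.py Mens Running Shoes
#
# produces category_name = "Men's Running Shoes", seeds start_urls from it,
# and names the output CSV after the category.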
print 'x'
# dict = response.meta['dict']
sel = Selector(response)
prime_url = sel.xpath(
    "//li[@class='refinementImage']/a[img/@alt='Prime Eligible']/@href").extract()[0]
print prime_url
# if dict['Category'] == '':
#     dict['Category'] = sel.xpath("//select[@class='nav-search-dropdown searchSelect']"
#                                  "/option/text()").extract()[0]
# row = [prime_url, dict['Option'], dict['Category']]
# print row
# mywriter.writerow(row)

# def parse(self, response):
#     sel = Selector(response)
#     global node_list
#     node = sel.xpath("//select[@class='nav-search-dropdown searchSelect']"
#                      "/option/@value").extract()[0].strip('node=')
#     name = sel.xpath("//select[@class='nav-search-dropdown searchSelect']"
#                      "/option/text()").extract()[0]
#     node_list[node] = name
#     mywriter.writerow([node, name, response.url])


if __name__ == '__main__':
    mywriter = mycsv.initialize_csv('NodeList', 'shoes')
    process = CrawlerProcess()
    process.crawl(Node_Finder)
    process.start()
def parse_price(self, response):
    # Same AJAX price callback as the Amazon_Spider variant above, here for
    # the SB_X spider.
    csv_row = response.meta['row']
    parent_asin = response.meta['parent_asin']
    if 'a-size-medium a-color-price\\">' in response.body:
        ajax_price = response.body.split(
            'a-size-medium a-color-price\\">')[-1].split('<\\/')[0].replace('$', '')
        ajax_price = re.findall(r'\d+', ajax_price)[0]
        csv_row[5] = converter.cleanup_price(ajax_price)
    child_asin = csv_row[1]
    csv_dict[parent_asin][child_asin] = csv_row


if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(SB_X)
    process.start()
    mywriter = mycsv.initialize_csv('SB_X.csv')
    for parent_asin, child_dicts in csv_dict.iteritems():
        mywriter.writerow(child_dicts['Parent_Row'])
        del child_dicts['Parent_Row']
        for child_asin, values in child_dicts.iteritems():
            if values[5] != '':
                mywriter.writerow(values)