def parse_scrapinghub_csv_file_recaptcha(self, response):
    """Yield reply-page requests for rows 1 and 2 of the input CSV.

    Opens ``self.input_file`` and walks every row. Row 0 is skipped
    (presumably the header row -- confirm against the exporter) and only
    rows 1 and 2 are acted upon. For each of those, columns 1-14 are copied
    into a ``CraigsscraperItem`` and a ``Request`` is yielded for
    ``/reply/nyc/<second-to-last URL segment>/<post_id>`` resolved against
    ``response``. ``self.check_recaptcha_get`` is passed as the request's
    second argument and the item is attached as ``meta["item"]``.
    """
    # Field stored in CSV column N (1-based) is the N-th name listed here.
    column_fields = (
        "condition",
        "description",
        "image",
        "keyword",
        "latitude",
        "longitude",
        "manufacturer",
        "model_name",
        "post_date",
        "post_id",
        "product_name",
        "size",
        "update_date",
        "url",
    )
    with open(self.input_file) as csv_handle:
        rows = csv.reader(csv_handle)
        print("-----------------CSV Read------------------")
        for row_number, columns in enumerate(rows):
            # Only rows 1 and 2 produce requests; all others are read and
            # ignored.
            if row_number <= 0 or row_number >= 3:
                continue
            print("**********************PARSE CSV FILE************************")
            print(row_number)

            item = CraigsscraperItem()
            for column_index, field in enumerate(column_fields, 1):
                item[field] = columns[column_index]

            # The path segment just before the last one in the posting URL.
            section = item["url"].rsplit("/", 2)[1]
            reply_path = "/reply/nyc/" + section + "/" + item["post_id"]

            request = Request(
                response.urljoin(reply_path), self.check_recaptcha_get)
            request.meta["item"] = item
            yield request
def parse_item_detail(self, response):
    """Scrape one posting page and yield its item if it passes the filters.

    ``keyword`` and ``product_name`` are read from ``response.meta`` and
    ``url`` from ``response.url``; everything else comes from the page
    markup. Any field whose markup is missing keeps the value ``""``
    (``post_id`` is only set when the page shows one). ``image`` becomes a
    list of dicts with the keys ``original``, ``thumb`` and ``id`` when a
    thumbnail gallery exists, or a list of ``{"original": ...}`` dicts when
    only the swipe images are found.

    The item is yielded only when its condition is "like new", "new" or
    "excellent" and both latitude and longitude were found.
    """
    item = CraigsscraperItem()
    # Start every optional field blank so absent markup never leaves None.
    for field in ("post_date", "update_date", "longitude", "latitude",
                  "description", "condition", "manufacturer", "model_name",
                  "size", "image"):
        item[field] = ""
    item["url"] = response.url
    item["keyword"] = response.meta["keyword"]
    item["product_name"] = response.meta["product_name"]

    post_div = response.xpath("//div[@class='postinginfos']")
    if post_div:
        post_id = post_div.xpath(
            "p[contains(text(), 'post id')]/text()").extract_first()
        # extract_first() returns None when nothing matched; guard before
        # calling str methods on it.
        if post_id is not None:
            item["post_id"] = post_id.replace("post id: ", "")
        item["post_date"] = post_div.xpath(
            "p[contains(text(), 'posted:')]/time/text()"
        ).extract_first() or ""
        item["update_date"] = post_div.xpath(
            "p[contains(text(), 'updated: ')]/time/text()"
        ).extract_first() or ""

    description_parts = response.xpath(
        "//section[@id='postingbody']/text()").extract()
    # Joining an empty list gives "", which is the default anyway.
    item["description"] = " ".join(description_parts)

    mapbox_div = response.xpath("//div[@class='mapbox']")
    if mapbox_div:
        # ".//" keeps the lookup inside the map box; a leading "//" would
        # search the whole document regardless of the context node.
        map_div = mapbox_div.xpath(".//div[@id='map']")
        if map_div:
            item["longitude"] = map_div.xpath(
                "@data-longitude").extract_first() or ""
            item["latitude"] = map_div.xpath(
                "@data-latitude").extract_first() or ""

    attr_group_div = response.xpath("//p[@class='attrgroup']")
    if attr_group_div:
        attribute_queries = (
            ("condition", "span[contains(text(),'condition')]/b/text()"),
            ("manufacturer",
             "span[contains(text(),'manufacturer')]/b/text()"),
            ("model_name", "span[contains(text(),'model name')]/b/text()"),
            ("size", "span[contains(text(),'size / dimensions')]/b/text()"),
        )
        for field, query in attribute_queries:
            item[field] = attr_group_div.xpath(query).extract_first() or ""

    thumb_links = response.xpath("//div[@id='thumbs']/a")
    if thumb_links:
        # Gallery page: one <a> per image carrying full-size URL, thumbnail
        # URL and image id.
        image_queries = (
            ("original", "@href"),
            ("thumb", "img/@src"),
            ("id", "@data-imgid"),
        )
        gallery = []
        for link in thumb_links:
            image_info = {}
            for key, query in image_queries:
                value = link.xpath(query).extract_first()
                if value is not None:
                    image_info[key] = value
            gallery.append(image_info)
        item["image"] = gallery
    else:
        # No thumbnail strip: fall back to the images inside the swipe div.
        swipe_images = response.xpath("//div[@class='swipe']//img")
        if swipe_images:
            gallery = []
            for image in swipe_images:
                image_info = {}
                source = image.xpath("@src").extract_first()
                if source is not None:
                    image_info["original"] = source
                gallery.append(image_info)
            item["image"] = gallery

    wanted_condition = item["condition"] in ("like new", "new", "excellent")
    has_coordinates = item["latitude"] != "" and item["longitude"] != ""
    if wanted_condition and has_coordinates:
        yield item
def parse_scrapinghub_csv_file_image(self, response):
    """Yield download requests for CSV-listed images that are not on disk.

    Reads ``self.input_file`` and skips row 0. For every other row, column 10
    (post id) names the target folder and column 3 holds the images as JSON
    objects separated by ``},``. For each image the "original" URL and then
    the "thumb" URL are checked: if
    ``self.download_root_path + folder + "/" + filename`` already exists the
    path is printed, otherwise a request built by
    ``self.set_proxies(url, self.download_image)`` is yielded with
    ``meta["folder"]`` set to the post id.

    An image without an "original" entry is skipped for that step; an image
    without a "thumb" entry prints a notice. ``response`` is not used.
    """
    image_column = 3
    post_id_column = 10
    # (JSON key, label printed when the file already exists,
    #  notice printed when the key is absent -- None means stay silent)
    image_kinds = (
        ("original", "ORIGINAL", None),
        ("thumb", "THUMB",
         "**************Key Error for thumb***************"),
    )

    with open(self.input_file) as csvfile:
        reader = csv.reader(csvfile)
        print("-----------------CSV Read------------------")
        for row_number, columns in enumerate(reader):
            if row_number == 0:
                continue
            print("**********************PARSE CSV FILE************************")
            print(row_number)

            # Images of one post are stored in a folder named by its id.
            folder_name = columns[post_id_column]

            # Splitting on "}," strips the closing brace from every chunk
            # but the last, so it is restored before parsing. The last chunk
            # is either a complete object or empty.
            chunks = columns[image_column].split("},")
            last_position = len(chunks) - 1
            images = []
            for position, chunk in enumerate(chunks):
                if position != last_position:
                    chunk += "}"
                elif chunk == "":
                    continue
                images.append(json.loads(chunk))

            for image in images:
                for key, exists_label, missing_notice in image_kinds:
                    image_url = image.get(key)
                    if image_url is None:
                        if missing_notice is not None:
                            print(missing_notice)
                        continue
                    filename = image_url.rsplit("/", 1)[-1]
                    filepath = (self.download_root_path + folder_name + "/"
                                + filename)
                    if os.path.isfile(filepath):
                        print(exists_label + filepath)
                        continue
                    req = self.set_proxies(image_url, self.download_image)
                    req.meta["folder"] = folder_name
                    yield req
def duplication_check(self, reponse):
    """Copy the input CSV to the output file, dropping duplicated rows.

    The output file ``self.output_file_without_duplication`` is truncated and
    given a header row; then every data row of ``self.input_file`` (row 0 is
    skipped) is loaded. A row is written out only when no LATER row has the
    same ``post_id`` or the same ``product_name``, so for each group of
    duplicates the last occurrence is the one that survives. Output columns
    follow the header order, not the input order.

    ``reponse`` is not used. Returns ``None``; progress and the number of
    rows written are printed.
    """
    output_fields = [
        "post_id", "post_date", "update_date", "longitude", "latitude",
        "description", "condition", "manufacturer", "model_name", "size",
        "image", "url", "keyword", "product_name"
    ]
    # Field stored in input column N (1-based) is the N-th name listed here.
    input_fields = (
        "condition", "description", "image", "keyword", "latitude",
        "longitude", "manufacturer", "model_name", "post_date", "post_id",
        "product_name", "size", "update_date", "url",
    )

    with open(self.output_file_without_duplication, 'w') as csvfile:
        csv.writer(csvfile).writerow(output_fields)

    records = []
    with open(self.input_file) as csvfile:
        reader = csv.reader(csvfile)
        print("-----------------CSV Read------------------")
        for row_number, columns in enumerate(reader):
            if row_number == 0:
                continue
            print("**********************PARSE CSV FILE************************")
            print(row_number)
            records.append({
                field: columns[column_index]
                for column_index, field in enumerate(input_fields, 1)
            })

    # Remember the last position of every post id and product name. A row
    # has no later duplicate exactly when it is the last holder of both of
    # its keys, which replaces the pairwise comparison of all rows.
    last_by_post_id = {}
    last_by_product_name = {}
    for index, record in enumerate(records):
        last_by_post_id[record["post_id"]] = index
        last_by_product_name[record["product_name"]] = index

    real_cnt = 0
    # Open the output once for all kept rows instead of once per row.
    with open(self.output_file_without_duplication, 'a') as csvfile:
        csv_writer = csv.writer(csvfile)
        for index, record in enumerate(records):
            print("*************************************")
            print(index)
            if last_by_post_id[record["post_id"]] != index:
                continue
            if last_by_product_name[record["product_name"]] != index:
                continue
            csv_writer.writerow([record[field] for field in output_fields])
            real_cnt += 1

    print("********************************")
    print(real_cnt)