Python CraigsscraperItem Examples, CraigsScraper.items.CraigsscraperItem Python Examples

Example #1

0

Show file

File: CraigSpider.py Project: MMnemonic/craigslist_scraper

    def parse_scrapinghub_csv_file_recaptcha(self, response):
        """Yield reCAPTCHA-check requests for CSV rows 1 and 2.

        Reads ``self.input_file`` as CSV. Row 0 (presumably the header) is
        skipped and only the next two rows are processed. For each of them a
        ``CraigsscraperItem`` is rebuilt from columns 1-14 and a ``Request``
        for the listing's ``/reply/nyc/<segment>/<post_id>`` URL is yielded
        with ``self.check_recaptcha_get`` as its second argument. The item is
        attached to the request as ``meta["item"]``.
        """
        # Item fields in CSV column order; the first one lives in column 1.
        column_fields = (
            "condition", "description", "image", "keyword", "latitude",
            "longitude", "manufacturer", "model_name", "post_date",
            "post_id", "product_name", "size", "update_date", "url",
        )

        with open(self.input_file) as csvfile:
            rows = csv.reader(csvfile)
            print("-----------------CSV Read------------------")
            for row_number, input_item in enumerate(rows):
                # Only rows 1 and 2 are turned into requests.
                if row_number < 1 or row_number > 2:
                    continue

                print(
                    "**********************PARSE CSV FILE************************"
                )
                print(row_number)

                item = CraigsscraperItem()
                for column, field in enumerate(column_fields, 1):
                    item[field] = input_item[column]

                # The middle piece of a two-way right split on "/" is the
                # path segment just before the listing's final segment.
                section = item["url"].rsplit("/", 2)[1]
                reply_path = "/reply/nyc/" + section + "/" + item["post_id"]

                request = Request(response.urljoin(reply_path),
                                  self.check_recaptcha_get)
                request.meta["item"] = item
                yield request

Example #2

0

Show file

File: CraigSpider.py Project: MMnemonic/craigslist_scraper

    def parse_item_detail(self, response):
        """Scrape one listing detail page and yield it when it qualifies.

        Builds a ``CraigsscraperItem`` from the posting-info block, the
        posting body, the map coordinates, the attribute group and the image
        gallery found in ``response``. ``url`` comes from ``response.url``;
        ``keyword`` and ``product_name`` come from ``response.meta``. Any
        scraped field that is not present on the page is left as ``""``.

        The item is yielded only when its condition is "like new", "new" or
        "excellent" and both latitude and longitude are non-empty; otherwise
        the generator yields nothing.
        """
        item = CraigsscraperItem()

        # Start every scraped field at "" so a missing page element never
        # leaves a field unset or holding None.
        for field in ("post_id", "post_date", "update_date", "longitude",
                      "latitude", "description", "condition", "manufacturer",
                      "model_name", "size", "image"):
            item[field] = ""
        item["url"] = response.url
        item["keyword"] = response.meta["keyword"]
        item["product_name"] = response.meta["product_name"]

        post_div = response.xpath("//div[@class='postinginfos']")
        if post_div:
            # extract_first() gives None when nothing matches, so fall back
            # to "" before using string methods or storing the value.
            post_id = post_div.xpath(
                "p[contains(text(), 'post id')]/text()"
            ).extract_first() or ""
            item["post_id"] = post_id.replace("post id: ", "")
            item["post_date"] = post_div.xpath(
                "p[contains(text(), 'posted:')]/time/text()"
            ).extract_first() or ""
            item["update_date"] = post_div.xpath(
                "p[contains(text(), 'updated: ')]/time/text()"
            ).extract_first() or ""

        description = response.xpath(
            "//section[@id='postingbody']/text()").extract()
        if description:
            item["description"] = " ".join(description)

        mapbox_div = response.xpath("//div[@class='mapbox']")
        if mapbox_div:
            # The leading "." keeps the search inside the mapbox element; a
            # bare "//" on a nested selector searches the whole document.
            map_div = mapbox_div.xpath(".//div[@id='map']")
            if map_div:
                item["longitude"] = map_div.xpath(
                    "@data-longitude").extract_first() or ""
                item["latitude"] = map_div.xpath(
                    "@data-latitude").extract_first() or ""

        attr_group_div = response.xpath("//p[@class='attrgroup']")
        if attr_group_div:
            # (item field, label text that appears in the attribute <span>)
            attribute_labels = (
                ("condition", "condition"),
                ("manufacturer", "manufacturer"),
                ("model_name", "model name"),
                ("size", "size / dimensions"),
            )
            for field, label in attribute_labels:
                item[field] = attr_group_div.xpath(
                    "span[contains(text(),'%s')]/b/text()" % label
                ).extract_first() or ""

        thumb_links = response.xpath("//div[@id='thumbs']/a")
        if thumb_links:
            # Gallery with thumbnails: record whichever of the full-size URL,
            # thumbnail URL and image id each link carries.
            images = []
            for link in thumb_links:
                info = {}
                for key, query in (("original", "@href"),
                                   ("thumb", "img/@src"),
                                   ("id", "@data-imgid")):
                    value = link.xpath(query).extract_first()
                    if value is not None:
                        info[key] = value
                images.append(info)
            item["image"] = images
        else:
            # No thumbnail strip: fall back to the <img> tags in the swipe
            # container, which only provide the full-size URL.
            swipe_images = response.xpath("//div[@class='swipe']//img")
            if swipe_images:
                images = []
                for img in swipe_images:
                    info = {}
                    src = img.xpath("@src").extract_first()
                    if src is not None:
                        info["original"] = src
                    images.append(info)
                item["image"] = images

        wanted_conditions = ("like new", "new", "excellent")
        if (item["condition"] in wanted_conditions
                and item["latitude"] != "" and item["longitude"] != ""):
            yield item

Example #3

0

Show file

File: CraigSpider.py Project: MMnemonic/craigslist_scraper

    def parse_scrapinghub_csv_file_image(self, response):
        """Yield download requests for listing images that are not on disk.

        Reads ``self.input_file`` as CSV, skips row 0 (presumably the
        header) and rebuilds a ``CraigsscraperItem`` from columns 1-14 of
        every other row. The ``image`` column is decoded into a list of
        image records, and for each record's "original" and "thumb" URL a
        request is yielded unless the target file already exists under
        ``self.download_root_path + post_id``. Requests carry the post id in
        ``meta["folder"]``.

        Records without an "original" key, or with a null URL, are skipped
        instead of aborting the whole run; a missing "thumb" key is reported
        on stdout as before.
        """
        # Item fields in CSV column order; the first one lives in column 1.
        column_fields = (
            "condition", "description", "image", "keyword", "latitude",
            "longitude", "manufacturer", "model_name", "post_date",
            "post_id", "product_name", "size", "update_date", "url",
        )

        with open(self.input_file) as csvfile:
            reader = csv.reader(csvfile)
            print("-----------------CSV Read------------------")
            for row_number, input_item in enumerate(reader):
                if row_number == 0:
                    continue

                print(
                    "**********************PARSE CSV FILE************************"
                )
                print(row_number)

                item = CraigsscraperItem()
                for column, field in enumerate(column_fields, 1):
                    item[field] = input_item[column]

                # Images are stored in a folder named after the post id.
                folder_name = item["post_id"]

                # The image column is split on "}," which strips the closing
                # brace from every chunk except the last, so it is put back
                # before decoding. An empty final chunk is ignored.
                # NOTE(review): assumes the column holds JSON objects joined
                # by commas without enclosing brackets - confirm.
                chunks = item["image"].split("},")
                images = []
                for position, chunk in enumerate(chunks):
                    if position < len(chunks) - 1:
                        chunk = chunk + "}"
                    elif chunk == "":
                        continue
                    images.append(json.loads(chunk))

                for image in images:
                    original_url = image.get("original")
                    if original_url is not None:
                        request = self._image_download_request(
                            original_url, folder_name, "ORIGINAL")
                        if request is not None:
                            yield request

                    if "thumb" not in image:
                        print(
                            "**************Key Error for thumb***************"
                        )
                        continue

                    thumb_url = image["thumb"]
                    if thumb_url is not None:
                        request = self._image_download_request(
                            thumb_url, folder_name, "THUMB")
                        if request is not None:
                            yield request

    def _image_download_request(self, image_url, folder_name, label):
        """Return a download request for ``image_url``, or None if on disk.

        The local file name is the part of ``image_url`` after its last "/".
        When that file already exists, ``label`` followed by the file path is
        printed and None is returned.
        """
        filename = image_url.rsplit("/", 1)[1]
        filepath = self.download_root_path + folder_name + "/" + filename
        if os.path.isfile(filepath):
            print(label + filepath)
            return None

        request = self.set_proxies(image_url, self.download_image)
        request.meta["folder"] = folder_name
        return request

Example #4

0

Show file

File: CraigSpider.py Project: MMnemonic/craigslist_scraper

    def duplication_check(self, reponse):
        """Write the input CSV rows that have no later duplicate.

        Reads ``self.input_file`` as CSV, skips row 0 (presumably the
        header) and rebuilds a ``CraigsscraperItem`` from columns 1-14 of
        every other row. A row is written to
        ``self.output_file_without_duplication`` only when no row after it
        has the same ``post_id`` or the same ``product_name``. The output
        file is overwritten and starts with a header row. Progress and the
        number of rows written are printed to stdout.

        ``reponse`` is not used.
        """
        # Item fields in input-CSV column order; the first is in column 1.
        column_fields = (
            "condition", "description", "image", "keyword", "latitude",
            "longitude", "manufacturer", "model_name", "post_date",
            "post_id", "product_name", "size", "update_date", "url",
        )
        # Column order of the output file (header and data rows).
        output_columns = (
            "post_id", "post_date", "update_date", "longitude", "latitude",
            "description", "condition", "manufacturer", "model_name",
            "size", "image", "url", "keyword", "product_name",
        )

        # Truncate the output and write the header before reading any input.
        with open(self.output_file_without_duplication, 'w') as csvfile:
            csv.writer(csvfile).writerow(list(output_columns))

        item_list = []
        with open(self.input_file) as csvfile:
            reader = csv.reader(csvfile)
            print("-----------------CSV Read------------------")
            for row_number, input_item in enumerate(reader):
                if row_number == 0:
                    continue

                print(
                    "**********************PARSE CSV FILE************************"
                )
                print(row_number)

                item = CraigsscraperItem()
                for column, field in enumerate(column_fields, 1):
                    item[field] = input_item[column]
                item_list.append(item)

        # Walk the rows backwards so the two sets always describe exactly the
        # rows that come after the current one. This gives the same answer as
        # comparing every row with every later row, in a single pass.
        later_post_ids = set()
        later_product_names = set()
        keep = [False] * len(item_list)
        for index in range(len(item_list) - 1, -1, -1):
            row = item_list[index]
            keep[index] = (row["post_id"] not in later_post_ids
                           and row["product_name"] not in later_product_names)
            later_post_ids.add(row["post_id"])
            later_product_names.add(row["product_name"])

        real_cnt = 0
        # Append below the header, opening the file once for all rows.
        with open(self.output_file_without_duplication, 'a') as csvfile:
            csv_writer = csv.writer(csvfile)
            for index, row in enumerate(item_list):
                print("*************************************")
                print(index)

                if not keep[index]:
                    continue

                csv_writer.writerow([row[column] for column in output_columns])
                real_cnt = real_cnt + 1

        print("********************************")
        print(real_cnt)