Example #1
import tablib

from mongotable.mongo_dict import MongoDict, COLLECTION

mongo = MongoDict()

data = tablib.Dataset()
data.headers = [
    'version_datetime', 'version_datetime_string', 'entry_number', 'url'
]

# Dump every catalog entry into the dataset, one row per entry
for entry in mongo.get_collection_iterator(COLLECTION.OT_CATALOG):
    entry_dict = entry['value']
    row = [entry_dict[key] for key in data.headers]
    data.append(row)

# Preview the CSV, then persist it to disk
print(data.csv)
with open("ot_catalog.csv", "w") as file:
    file.write(data.csv)
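
Both examples depend on the project-local MongoDict wrapper, which is not shown. Below is a minimal sketch of the interface they assume: get_collection_iterator returns a pymongo cursor over documents shaped like {'key': ..., 'value': {...}} (the collection name and connection details are assumptions, not from the original module).

import pymongo


class COLLECTION:
    # Only OT_CATALOG appears in the examples; the stored name is assumed
    OT_CATALOG = 'ot_catalog'


class MongoDict:
    def __init__(self, uri='mongodb://localhost:27017', db='ot'):
        self.client = pymongo.MongoClient(uri)
        self.db = self.client[db]

    def get_collection_iterator(self, collection):
        # A pymongo cursor, so callers can chain .sort() and .limit()
        return self.db[collection].find()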
Example #2

import pymongo

from urllib.parse import urljoin

from scrapy import Request, Selector, Spider
from scrapy.exceptions import CloseSpider
from scrapy.http import Response

from mongotable.mongo_dict import MongoDict, COLLECTION
# OTItem is project-local; this module path is an assumption
from ot_scraper.items import OTItem
# GoogleMap and is_float are also project-local and not shown here; minimal
# sketches of the interface they would need appear after the spider


class OTRestaurantsSpider(Spider):
    name = 'ot_restaurants_spider.py'
    allowed_domains = ['web.archive.org']

    base_url = 'http://web.archive.org/'

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.limit = 0
        self.processed = 0
        self.mongo = MongoDict()
        self.gm = GoogleMap()

    def start_requests(self):
        # DO_FIRST caps how many catalog entries are requested; fall back to
        # an effectively unbounded limit when it is unset or non-positive
        do_first = self.settings.getint('DO_FIRST', 0)
        limit = do_first if do_first >= 1 else 9999999

        for entry in self.mongo.get_collection_iterator(
                COLLECTION.OT_CATALOG).sort('key',
                                            pymongo.ASCENDING).limit(limit):
            entry_dict = entry['value']
            request = Request(url=entry_dict['url'],
                              callback=self.parse_restaurant_page)
            request.meta['ot_catalog_key'] = (
                entry['key'] + "_" + entry['value']['entry_number'])

            # if request.meta['ot_catalog_key'] in self.mongo.client[self.settings.get("OUTPUT_DB")].collection_names():
            #     self.logger.critical(entry['key'] + " skipped")
            #     continue

            # self.limit += 1
            # if self.limit >= self.settings.get('LIMIT_CATALOG'):
            #     return

            # Debug filter: request only the snapshot with this catalog key
            if entry['key'] != "20110720053652":
                self.logger.debug(entry['key'] + " skipped")
                continue
            self.logger.critical(entry['key'] + " REQUESTED")

            yield request
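
    # A catalog entry, as assumed from the fields accessed above (the key is
    # a Wayback Machine timestamp):
    #   {'key': '20110720053652',
    #    'value': {'url': 'http://web.archive.org/web/...',
    #              'entry_number': '7', ...}}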

    def parse_restaurant_page(self, response: Response):
        self.logger.debug(response.meta['ot_catalog_key'] + " received")
        yield from self.try_parse(response)

    def try_parse(self, response: Response):
        selector = Selector(response)

        # Row markup differs between archive snapshots; try the old layout
        # first, then the newer "ResultRow" one
        data_rows = selector.xpath('//tr[@class = "a" or @class = "r"]')

        if len(data_rows) == 0:
            data_rows = selector.xpath('//tr[contains(@class, "ResultRow")]')

        if len(data_rows) == 0:
            self.logger.error(response.url + " no data!")
            raise CloseSpider("data row is empty: " + response.url)

        self.logger.debug("Found: " + str(len(data_rows)))

        for row in data_rows:
            yield self.try_parse_row(row, response)

    def try_parse_row(self, row: Selector, response: Response):
        item = OTItem()

        item['ot_catalog_key'] = response.meta['ot_catalog_key']

        # extract name
        item['name'] = row.xpath('.//a[@href]/text()').extract_first()

        # extract neighborhood
        neighborhood = row.xpath('.//div[@class="nn"]/text()').extract_first()
        if neighborhood is not None:
            item['neighborhood'] = neighborhood.strip()

        if 'neighborhood' not in item:
            neighborhood = row.xpath(
                './/div[@class="d"]/text()').extract_first()
            if neighborhood is not None:
                item['neighborhood'] = neighborhood.strip().split("|")[0]

        # extract type
        type_r = row.xpath('.//div[@class="nf"]/text()').extract_first()
        if type_r is not None:
            item['type'] = type_r.strip()

        if 'type' not in item:
            type_r = row.xpath('.//div[@class="d"]/text()').extract_first()
            if type_r is not None and "|" in type_r:
                item['type'] = type_r.strip().split("|")[1]

        # extract price
        price = row.xpath('.//td[@class="p"]/text()').extract_first()
        if price is not None:
            item["price"] = len(price)

        if 'price' not in item:
            price = row.xpath('.//td[@class="PrCol"]/text()').extract_first()
            if price is not None:
                item["price"] = len(price)

        # extract url
        url = row.xpath('.//a[@class="r"]/@href').extract_first()
        if url is not None:
            item['url'] = urljoin(response.url, url)

        if 'url' not in item:
            url = row.xpath('.//a[@href]/@href').extract_first()
            if url is not None:
                item['url'] = urljoin(response.url, url)

        # extract stars
        stars = row.xpath(
            './/div[@class="Ratings"]/div/@title').extract_first()
        if stars is not None:
            # Pull the first numeric token out of a title like "4.5 stars"
            values = [float(s) for s in stars.split() if is_float(s)]
            item['stars'] = values[0] if values else -1
        else:
            item['stars'] = -1

        # extract reviews
        reviews = row.xpath(
            './/span[@class="reviews"]/preceding-sibling::text()'
        ).extract_first()
        if reviews is not None:
            item['reviews'] = int(reviews)
        else:
            item['reviews'] = -1

        # Follow the profile page to add geo fields; the errback saves the
        # item even if that request fails
        request = Request(item['url'],
                          callback=self.extract_geo_fields,
                          dont_filter=True,
                          errback=self.err_yield_item)
        request.meta['item'] = item
        return request

    def err_yield_item(self, failure):
        # Scrapy errbacks receive a twisted Failure, not a Response; recover
        # the item from the failed request and save it without geo fields
        item = failure.request.meta['item']
        yield item

    def extract_geo_fields(self, response: Response):
        item = response.meta['item']
        selector = Selector(response)

        try:
            # Address markup varies across archive snapshots; try each known
            # layout until one yields text
            address_xpaths = [
                '//li[@class="RestProfileAddressItem"]/text()',
                '//span[@id="RestSearch_lblFullAddress"]/text()',
                '//div[@class="RestProfileAddress"]/text()',
                '//span[@id="ProfileOverview_lblAddressText"]/text()',
                '//span[@itemprop="streetAddress"]/text()',
            ]
            address = []
            for xpath in address_xpaths:
                address = selector.xpath(xpath).extract()
                if len("".join(address).strip()) > 0:
                    break

            if len("".join(address).strip()) == 0:
                raise KeyError

            item['address'] = ",".join(
                str(line).strip().replace('"', '') for line in address)

            # Cleanup: remove parenthesised notes from the address,
            # e.g. "714 Seventh Avenue (inside Renaissance Hotel)"
            start = item['address'].find('(')
            end = item['address'].find(')')
            if start != -1 and end != -1:
                item['address'] = (item['address'][:start - 1] +
                                   item['address'][end + 1:])

            # extract geocode
            item['geocode'] = self.gm.geocode(item['address'])

            if len(item['geocode']) == 0:
                self.logger.error("geocode empty: " + item['address'])
                raise KeyError

            item['geocode'] = item['geocode'][0]
            # extract county
            item['county'] = self.extract_county(
                item['geocode']['address_components'], item)

            # extract place_id
            item['place_id'] = item['geocode']['place_id']

            # set is_nyc
            item['is_nyc'] = self.gm.check_if_nyc(item['county'])

            item['extract_success'] = True
        except (KeyError, IndexError):
            item['extract_success'] = False
            # self.logger.error("Extract failed. Saved anyway: " + str(item))

        yield item
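
    # Shape of one geocode result, as assumed by the accesses above; this
    # matches the Google Maps Geocoding API response format:
    #   {'place_id': 'ChIJ...',
    #    'address_components': [
    #        {'long_name': 'New York County',
    #         'types': ['administrative_area_level_2', 'political']}, ...]}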

    def verify(self, item: OTItem, field: str, response: Response):
        if field not in item or item[field] is None:
            raise CloseSpider("extract field failed: " + field + " " +
                              response.url)
        self.logger.debug("Success: " + str(item[field]))

    def extract_county(self, geocode, item):
        for entry in geocode:
            if 'administrative_area_level_2' in entry['types']:
                return entry['long_name']
        self.logger.critical("County not found: " + str(item))
        return None
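
The spider leans on two more project-local helpers, GoogleMap and is_float, which the original does not show. Below is a minimal sketch of the interface they would need, built on the official googlemaps client; the API key handling and the NYC county list are assumptions, not part of the original.

import googlemaps

NYC_COUNTIES = {
    'New York County', 'Kings County', 'Queens County',
    'Bronx County', 'Richmond County',
}


def is_float(s):
    # True when the token parses as a float, e.g. "4.5" in a ratings title
    try:
        float(s)
        return True
    except ValueError:
        return False


class GoogleMap:
    def __init__(self, api_key='YOUR_API_KEY'):  # placeholder key
        self.client = googlemaps.Client(key=api_key)

    def geocode(self, address):
        # googlemaps returns [] on no match, otherwise a list of result
        # dicts carrying 'address_components' and 'place_id'
        return self.client.geocode(address)

    def check_if_nyc(self, county):
        return county in NYC_COUNTIES

To try the spider outside a full Scrapy project, a runner along these lines should work (the meaning of the DO_FIRST setting is inferred from start_requests):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={'DO_FIRST': 5})
process.crawl(OTRestaurantsSpider)
process.start()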