Example #1
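# The snippets below assume the module-level imports sketched here. The
# Selector import is an assumption (the .xpath()/.extract_first() calls match
# both parsel's and Scrapy's Selector API); extract_domain, extract_path and
# DynamoUtils are project helpers that are not shown here, while dynamodb and
# conditions come from boto3 (see the sketch after get_xpaths):
import hashlib
from base64 import b64decode
from datetime import datetime
from json import loads
from os import environ
from typing import Dict, List, Optional
from uuid import uuid4

from parsel import Selector  # assumption: could equally be scrapy's Selector
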
def get_filename_from_url(url: str) -> Dict[str, str]:
    """Build the storage path for a crawled page.

    The layout is ``md5(domain)/md5(path)`` for the folder and a
    date-stamped ``YYYY-MM-DD.body`` file name.
    """
    response: Dict[str, str] = dict()

    domain = extract_domain(url=url)
    path = extract_path(url=url)

    # Hash the domain and the path separately so pages from the same
    # site share a folder prefix
    folder = hashlib.md5(domain.encode()).hexdigest()
    file = hashlib.md5(path.encode()).hexdigest()

    response.update({
        'folder': f'{folder}/{file}',
        'file': datetime.today().strftime("%Y-%m-%d") + '.body'
    })

    return response
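
# A minimal usage sketch; the URL is illustrative, and the project's
# extract_domain / extract_path helpers are assumed to return
# 'example.com' and '/listing/123' for it:
paths = get_filename_from_url('https://example.com/listing/123')
# paths == {'folder': '<md5 of domain>/<md5 of path>',
#           'file': '<YYYY-MM-DD>.body'}
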
def get_xpaths(table: str, url: str) -> Dict[str, str]:
    """Docstring for the get_xpath function.

    Query DynamoDB using the domain for the
    XPath configuration on the given Table ``table``.

    Args:
        param1 (str) table:
            The DynamoDB table name to get the XPath obj from
        param2 (str) url:
            The URL from which the domain is extracted to make the query

    Returns:
        Object containing the XPath configurations from DynamoDB
    """
    _table = dynamodb.Table(table)
    domain: str = extract_domain(url)

    response = _table.query(
        IndexName='domain-index',
        KeyConditionExpression=conditions.Key('domain').eq(domain)
    )

    return response['Items'][0]
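
# Minimal setup sketch for the dependencies above; the table name and URL
# are illustrative:
import boto3
from boto3.dynamodb import conditions

dynamodb = boto3.resource('dynamodb')

xpaths = get_xpaths(table='xpath-config', url='https://example.com/listing/123')
# -> the first configuration item whose 'domain' equals 'example.com'
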
def run(event, context) -> Dict[str, str]:
    """Docstring for price_verifier:run function

    Receives a dict from the Scrape Spider with
    the crawled information to check if there is any
    price variation, if has variation it sends that
    to the Finisher within Scrape Information

    The received dict may look like this:
    -  {
        'executionId': HASH,
        'dynamo': {
            'table': ...,
            ...
        }
    }
    """
    response: Dict[str, str] = {'id': event['executionId']}
    price_variation: Dict[str, str] = {'id': str(uuid4())}
    current_date: datetime = datetime.now()
    content: Dict[str, str] = loads(
        b64decode(event['dynamo']['content'].encode()).decode())
    price_verifier: DynamoUtils = DynamoUtils(environ['PRICE_VARIATION'])
    check: Optional[Dict[str, str]] = None

    check_objects = price_verifier.get({
        'index': 'url-index',
        'key': 'url',
        'value': content['url']
    })

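    # Keep the item currently flagged as the latest check for this URL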
    for item in check_objects:
        if item['last_check'] == 'true':
            check = item

    if check:
        print(
            f"The following URL already exists on the Database: [{content['url']}]"
        )

        if content['price'] == check['price']:
            print('The last check price is the same as today.')

        else:
            price_variation.update({
                'year': current_date.strftime('%Y'),
                'month': current_date.strftime('%m'),
                'day': current_date.strftime('%d'),
                'last_check': 'true',
                'url': content['url'],
                'price': content['price'],
                'domain': check['domain']
            })

            # Flip the previous record's last_check flag to 'false' on DynamoDB
            updater_response = price_verifier.update({
                'partition_key': 'id',
                'sort_key': 'url',
                'id': check['id'],
                'url': check['url'],
                'target': 'last_check',
                'value': 'false'
            })

            price_verifier.put(item=price_variation)

            print('Updater Response:', updater_response)
            print('New Price Item:', price_variation)

    else:
        price_variation.update({
            'year': current_date.strftime('%Y'),
            'month': current_date.strftime('%m'),
            'day': current_date.strftime('%d'),
            'last_check': 'true',
            'url': content['url'],
            'price': content['price'],
            'domain': extract_domain(content['url'])
        })

        price_verifier.put(item=price_variation)

        print('Added the URL to the Price Variation Table', price_variation)
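
# Sketch of the event shape that run() expects; the IDs and values are
# illustrative, and 'content' is base64-encoded JSON, mirroring the
# decoding at the top of run():
from base64 import b64encode
from json import dumps

event = {
    'executionId': 'abc-123',
    'dynamo': {
        'table': 'price-variation-table',
        'content': b64encode(dumps({
            'url': 'https://example.com/listing/1',
            'price': '250000'
        }).encode()).decode()
    }
}
run(event, context=None)
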
    def _get(self, content: str, url: str) -> dict:
        """Docstring for the _head function.
        
        It gets the information from the JSON using ``eval`` python
        function.

        Note:
            This is exclusive to OLX, since it's the unique website that keep the JSON on the <head>

        Args:
            param1 (str) content: The HTML <head> content

        Returns:
            A dict with the crawled content

        """
        parser: Selector = Selector(text=content)
        item: Dict[str, str] = dict()

        items: List[str] = [
            'url', 'date', 'domain', 'title', 'category', 'price', 'body',
            'rooms', 'bathrooms', 'suites', 'garages', 'features', 'city',
            'zipcode', 'neighbourhood', 'address', 'latitude', 'longitude',
            'privative_area', 'total_area', 'ground_area', 'images'
        ]

        domain: str = extract_domain(url)
        date: str = datetime.today().strftime("%Y-%m-%d")

        # Convert the embedded JSON literals (null/true/false) to their
        # Python equivalents, eval the result, and keep the 'ad' object
        head_json = eval(
            parser.xpath(self.mapping['parser_json']).extract_first()
            .replace('null', 'None')
            .replace('true', 'True')
            .replace('false', 'False'))['ad']

        body: str = head_json['body']
        title: str = parser.xpath(self.mapping['parser_title']).extract_first()
        location: Dict[str, str] = head_json['location']

        city: str = location['municipality']
        zipcode: str = location['zipcode']
        neighbourhood: str = location['neighbourhood']
        address: str = location['address']
        latitude: str = location['mapLati']
        longitude: str = location['mapLong']
        privative_area: Optional[str] = None
        total_area: Optional[str] = None
        ground_area: Optional[str] = None

        price: Optional[str] = None

        if 'priceValue' in head_json:
            price = head_json['priceValue']

        rooms: Optional[str] = None
        garages: Optional[str] = None
        bathrooms: Optional[str] = None
        category: Optional[str] = None
        suites: Optional[str] = None
        features: List[str] = list()

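        # Map OLX's generic name/value property list onto our named fields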
        for _property in head_json['properties']:
            if _property['name'] == 'price' and not price:
                price = _property['value']

            elif _property['name'] == 'rooms':
                rooms = _property['value']

            elif _property['name'] == 'garage_spaces':
                garages = _property['value']

            elif _property['name'] == 'bathrooms':
                bathrooms = _property['value']

            elif _property['name'] == 'category':
                category = _property['value']

            elif _property['name'] == 'size':
                if _property['label'] == 'Área útil':
                    privative_area = _property['value']
                elif _property['label'] == 'Tamanho':
                    ground_area = _property['value']
                elif _property['label'] == 'Área total':
                    total_area = _property['value']
                else:
                    # Unrecognised size label: fall back to total_area
                    total_area = _property['value']

            elif 'features' in _property['name']:
                for feature in _property['values']:
                    features.append(feature['label'])

        images: list = list()

        for image in head_json['images']:
            try:
                images.append({
                    'src': image['original'],
                    'alt': image['originalAlt']
                })

            except KeyError:
                # Skip images that lack the original/originalAlt keys
                pass

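        # Collect each local variable named in ``items`` into the result dict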
        for variable in items:
            item[variable] = eval(variable)

        print('OLX head_json["ad"]', head_json)

        print('FINAL OLX ITEM', item)

        return item
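
# For reference, a minimal shape of head_json (the 'ad' object) that the
# OLX parser above reads; the field names are taken from the accesses in
# the code, the values are illustrative:
ad_example = {
    'body': '...',
    'priceValue': 'R$ 250.000',  # optional; falls back to 'properties'
    'location': {
        'municipality': 'São Paulo', 'zipcode': '01000-000',
        'neighbourhood': 'Centro', 'address': 'Rua X, 1',
        'mapLati': '-23.55', 'mapLong': '-46.63'
    },
    'properties': [
        {'name': 'rooms', 'value': '3'},
        {'name': 'size', 'label': 'Área útil', 'value': '80'},
        {'name': 'features', 'values': [{'label': 'Piscina'}]}
    ],
    'images': [{'original': 'https://...', 'originalAlt': 'front'}]
}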
    def _get(self, content: str, url: str) -> dict:
        """Docstring for the _scrape function.

        It crawl the information on the given HTML ``content``
        using the given dict with XPaths ``mapping``. The content
        it crawls is title, price, bedrooms, and informations like that.

        Note:
            It's different from the _crawl function since the crawled content
            and the dict structure are not the same
        
        Args:
            param1 (str) content: The HTML content
        
        Returns:
            A dict with the crawled content
        """
        parser: Selector = Selector(text=content)
        item: Dict[str, str] = dict()
        items: List[str] = [
            'url', 'date', 'domain', 'title', 'category', 'price', 'body',
            'rooms', 'bathrooms', 'suites', 'garages', 'features', 'city',
            'zipcode', 'neighbourhood', 'address', 'latitude', 'longitude',
            'privative_area', 'total_area', 'ground_area', 'images'
        ]

        city: Optional[str] = None
        zipcode: Optional[str] = None
        neighbourhood: Optional[str] = None
        address: Optional[str] = None
        latitude: Optional[str] = None
        longitude: Optional[str] = None

        body: str = parser.xpath(self.mapping['parser_body']).extract_first()
        title: str = parser.xpath(self.mapping['parser_title']).extract_first()
        category: str = parser.xpath(
            self.mapping['parser_category']).extract_first()
        price: str = parser.xpath(self.mapping['parser_price']).extract_first()
        rooms: str = parser.xpath(self.mapping['parser_rooms']).extract_first()
        suites: str = parser.xpath(
            self.mapping['parser_suites']).extract_first()
        garages: str = parser.xpath(
            self.mapping['parser_garages']).extract_first()
        bathrooms: str = parser.xpath(
            self.mapping['parser_bathrooms']).extract_first()
        privative_area: str = parser.xpath(
            self.mapping['parser_privative_area']).extract_first()
        total_area: str = parser.xpath(
            self.mapping['parser_total_area']).extract_first()
        ground_area: str = parser.xpath(
            self.mapping['parser_ground_area']).extract_first()
        location: str = parser.xpath(
            self.mapping['parser_location']).extract_first()
        features: List[str] = parser.xpath(
            self.mapping['parser_features']).extract()
        images_src: List[str] = parser.xpath(
            self.mapping['parser_images_src']).extract()
        images_alt: List[str] = parser.xpath(
            self.mapping['parser_images_alt']).extract()
        # zip() pairs each src with its alt and stops at the shorter list,
        # avoiding an IndexError when the two extractions differ in length
        images: List[Dict[str, str]] = [
            {'src': src, 'alt': alt}
            for src, alt in zip(images_src, images_alt)
        ]
        domain: str = extract_domain(url)
        date: str = datetime.today().strftime("%Y-%m-%d")

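        # Either read explicit latitude/longitude XPaths or treat the
        # location string as a plain address, per the domain's config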
        if self.mapping['options_location_use_geo'] == 'true':
            latitude = parser.xpath(
                self.mapping['parser_location_latitude']).extract_first()
            longitude = parser.xpath(
                self.mapping['parser_location_longitude']).extract_first()

        elif self.mapping['options_location_use_geo'] == 'false':
            address = location

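        # Collect each local variable named in ``items`` into the result dict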
        for variable in items:
            item[variable] = eval(variable)

        return item
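
# Sketch of the self.mapping dict this parser reads; the key names are
# taken from the code above, the XPath values are illustrative:
mapping_example = {
    'parser_title': '//h1/text()',
    'parser_price': '//span[@class="price"]/text()',
    # ...one 'parser_*' entry per field extracted above, plus:
    'options_location_use_geo': 'true',
    'parser_location_latitude': '//meta[@itemprop="latitude"]/@content',
    'parser_location_longitude': '//meta[@itemprop="longitude"]/@content'
}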