Example #1
0
def parse_single_price(path, rule):
    """
    for item page, to parse price information

    The raw file is decoded (chardet guess first, UTF-8 as fallback), parsed
    with PyQuery, narrowed down to the node described by ``rule`` and the
    extracted value is passed through ``price_formatter``.

    :param path: path of the raw data file
    :param rule: rule for parsing, a dict read with these keys:
                 'selector'       - selector of the candidate nodes
                 'filter'         - truthy to keep only candidates matching
                                    'filter_re'
                 'filter_re'      - regex applied with ``match`` (read only
                                    when 'filter' is truthy)
                 'filter_in_attr' - truthy to match the attribute named by
                                    'filter_attr' instead of the inner HTML
                 'children'       - truthy to descend to the children
                 'in_attr'        - truthy to read the value from the
                                    attribute named by 'attr' instead of the
                                    node text
    :return: price (first element returned by price_formatter), or None when
             the file cannot be decoded or parsed, or no node matches
    """
    with open(path, 'rb') as f:
        raw = f.read()

    # chardet reports None when it cannot guess; decoding with None raises
    # TypeError, so go straight to UTF-8 in that case.
    detected = chardet.detect(raw)['encoding']
    try:
        content = raw.decode(detected or 'utf-8')
    except (UnicodeDecodeError, LookupError):
        # Wrong guess or a codec name Python does not know: retry as UTF-8.
        try:
            content = raw.decode('utf-8')
        except Exception as e:
            LOGGER.warning('url pattern: ' + str(e))
            return None
    except Exception as e:
        LOGGER.warning('url pattern: ' + str(e))
        return None

    # Unparsable (e.g. empty) documents are reported the same way as
    # undecodable ones instead of raising out of the parser.
    try:
        doc = pq(content)
    except Exception as e:
        LOGGER.warning('url pattern: ' + str(e))
        return None

    nodes = doc(rule['selector'])
    if rule['filter']:
        # Compile once instead of once per candidate node.
        pattern = re.compile(rule['filter_re'])

        def _keep(_index, this):
            if rule['filter_in_attr']:
                candidate = pq(this).attr(rule['filter_attr'])
            else:
                candidate = pq(this).html()
            # A missing attribute yields None, which the regex cannot match.
            return candidate is not None and pattern.match(candidate)

        nodes = nodes.filter(_keep)
    if rule['children']:
        nodes = nodes.children()
    if not nodes:
        return None

    raw_price = nodes.attr(rule['attr']) if rule['in_attr'] else nodes.text()
    price, _currency = price_formatter(raw_price)
    return price
def microdata_filter(site_id):
    """
    look for schema.org Product microdata in the crawled data of a site

    The file ``config.URL_CRAWLED_DATA_DIR + str(site_id)`` is parsed and the
    first Product item that carries an offer with a non-empty price wins.

    :param site_id: id of the site, used to build the data file path
    :return: success_flag, price, currency, event_time;
             (False, None, None, None) when the file is missing or no
             product price is found
    """
    schema_product_type = 'http://schema.org/Product'
    not_found = (False, None, None, None)
    # Shapes that make the nested lookups below fail: a missing key (None),
    # an empty list, or a plain string where a nested item was expected.
    lookup_errors = (AttributeError, IndexError, KeyError, TypeError)

    data_file_path = config.URL_CRAWLED_DATA_DIR + str(site_id)
    if not os.path.exists(data_file_path):
        return not_found

    with open(data_file_path, 'rb') as f:
        encoding = chardet.detect(f.read())['encoding']
        # The detection pass consumed the stream; rewind so the microdata
        # parser does not start at EOF and see an empty document.
        f.seek(0)
        items = microdata.get_items(f, encoding)
    if not items:
        return not_found

    for item in items:
        item = json.loads(item.json())
        # Items without an itemtype have no 'type' entry at all.
        types = item.get('type') or []
        properties = item.get('properties') or {}
        offers = properties.get('offers')
        if not types or types[0] != schema_product_type or not offers:
            continue

        product_price = None
        product_currency = None
        try:
            product_price = offers[0].get('properties').get('price')[0]
        except lookup_errors as e:
            print(e)
        try:
            product_currency = offers[0].get('properties').get(
                'priceCurrency')[0]
        except lookup_errors as e:
            print(e)

        if product_price:
            # Only the first priced product is ever reported, so stop here.
            return (True, price_formatter(product_price)[0], product_currency,
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    return not_found
Example #3
0
def parse_price(file, containers, selectors):
    """
    parse price

    Every selector is tried against the document; the first matched node
    whose tag name is listed in ``containers`` and whose text formats to a
    non-zero price is recorded for that selector.

    :param file: raw data file path
    :param containers: containers in price_tag_selectors.json
                       (tag names a price node is allowed to have)
    :param selectors: selectors in price_tag_selectors.json
                      (dicts read with the keys 'selector' and 'weight')
    :return: True if error, else False
    :return: list of price information,
             [{'method': 'tag pattern', 'price', 'currency', 'rule': {'selector', 'weight'}, 'event_time'}, ...]
    :return: selectors failed to parse price
    """
    with open(file, 'rb') as f:
        raw = f.read()

    # chardet reports None when it cannot guess; decoding with None raises
    # TypeError, so go straight to UTF-8 in that case.
    detected = chardet.detect(raw)['encoding']
    try:
        content = raw.decode(detected or 'utf-8')
    except (UnicodeDecodeError, LookupError):
        # Wrong guess or a codec name Python does not know: retry as UTF-8.
        try:
            content = raw.decode('utf-8')
        except Exception as e:
            print('[FAIL] tag pattern:', e)
            LOGGER.warning('tag pattern: ' + str(e))
            return True, None, None
    except Exception as e:
        print('[FAIL] tag pattern:', e)
        LOGGER.warning('tag pattern: ' + str(e))
        return True, None, None

    try:
        doc = pq(content)
    except Exception as e:
        LOGGER.warning('tag pattern: ' + str(e))
        return True, None, None

    # Group 2 captures the tag name at the start of the serialized node.
    opening_tag = re.compile(r'(<)(\w+)(\s*|/>)(\.*)')
    price_list = []
    selector_list = []
    for tag in selectors:
        matched = False
        # One loop covers both the single-match and the multi-match case:
        # the first acceptable node wins and ends the search.
        for node in doc(tag.get('selector')).items():
            opening = opening_tag.match(str(node))
            # Skip nodes whose markup does not start with a plain tag
            # rather than crashing on a failed match.
            if opening is None or opening.group(2) not in containers:
                continue
            price, currency = price_formatter(node.text())
            try:
                usable = bool(price) and float(price) != 0
            except (TypeError, ValueError):
                # A value that is not numeric cannot be a price.
                usable = False
            if not usable:
                continue
            price_list.append({
                'method': 'tag pattern',
                'price': price,
                'currency': currency,
                'rule': {
                    'selector': tag.get('selector'),
                    'weight': tag.get('weight'),
                },
                'event_time':
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            })
            matched = True
            break
        if not matched:
            selector_list.append(tag)
    return False, price_list, selector_list
Example #4
0
def _select_list_field(item, field_rule):
    """Return the PyQuery node of one field inside a list item, or None if nothing is left after filtering."""
    node = None
    # When several nodes match, the last one is kept.
    for candidate in item.items(field_rule['selector']):
        node = candidate.eq(0)
    if not node:
        return None

    if field_rule['filter']:
        # Compile once instead of once per candidate node.
        pattern = re.compile(field_rule['filter_re'])

        def _keep(_index, this):
            if field_rule['filter_in_attr']:
                candidate = pq(this).attr(field_rule['filter_attr'])
            else:
                candidate = pq(this).html()
            # A missing attribute yields None, which the regex cannot match.
            return candidate is not None and pattern.match(candidate)

        node = node.filter(_keep)
    if field_rule['children']:
        node = node.children()
    return node if node else None


def _read_list_field(node, field_rule):
    """Read a field value from its node: the configured attribute or the node text."""
    if field_rule['in_attr']:
        return node.attr(field_rule['attr'])
    return node.text()


def parse_list(path, rule):
    """
    for list page, to parse name and price information

    Each of ``rule['item_description']`` and ``rule['item_price']`` is a
    dict read with the keys 'selector', 'filter', 'filter_re',
    'filter_in_attr', 'filter_attr', 'children', 'in_attr' and 'attr'; each
    field is filtered according to its own sub-rule.

    :param path: path of the raw data file
    :param rule: rule for parsing, read with the keys 'selector' (must match
                 for the page to count as a list), 'item_selector',
                 'item_description' and 'item_price'
    :return: ret, list of information ([{'name', 'price'}, ...]; a field is
             None when it is not found), or None when the file cannot be
             decoded or parsed or the list selector matches nothing
    """
    with open(path, 'rb') as f:
        raw = f.read()

    # chardet reports None when it cannot guess; decoding with None raises
    # TypeError, so go straight to UTF-8 in that case.
    detected = chardet.detect(raw)['encoding']
    try:
        content = raw.decode(detected or 'utf-8')
    except (UnicodeDecodeError, LookupError):
        # Wrong guess or a codec name Python does not know: retry as UTF-8.
        try:
            content = raw.decode('utf-8')
        except Exception as e:
            LOGGER.warning('url pattern: ' + str(e))
            return None
    except Exception as e:
        LOGGER.warning('url pattern: ' + str(e))
        return None

    # Unparsable (e.g. empty) documents are reported the same way as
    # undecodable ones instead of raising out of the parser.
    try:
        doc = pq(content)
    except Exception as e:
        LOGGER.warning('url pattern: ' + str(e))
        return None

    if not doc(rule['selector']):
        return None

    ret = []
    for item in doc(rule['item_selector']).items():
        description = None
        price = None

        # Parse description
        name_node = _select_list_field(item, rule['item_description'])
        if name_node:
            description = _read_list_field(name_node,
                                           rule['item_description'])

        # Parse price
        price_node = _select_list_field(item, rule['item_price'])
        if price_node:
            price, _currency = price_formatter(
                _read_list_field(price_node, rule['item_price']))

        ret.append({'name': description, 'price': price})

    return ret
def microdata_filter(site):
    """
    filter site contains microdata
    follows the schema in 'http://schema.org/Product'

    The freshly crawled file (suffix '_new') is tried first. When it yields
    microdata items it replaces the previous data file; otherwise it is
    dropped (if a previous file exists) and the previous file is parsed.

    :param site: dict, {'site_id', 'product_site_id', 'url', 'product_name'}
    :return: success_flag, price, currency, event_time;
             success_flag is True as soon as a Product item with offers is
             seen, the other three are None when no price could be read
    """
    success_flag = False
    if not check_site_by_id(site.get('site_id')):
        return success_flag, None, None, None

    schema_product_type = 'http://schema.org/Product'
    # Shapes that make the nested lookups below fail: a missing key (None),
    # an empty list, or a plain string where a nested item was expected.
    lookup_errors = (AttributeError, IndexError, KeyError, TypeError)

    data_file_path = config.URL_CRAWLED_DATA_DIR + str(site.get('site_id'))
    new_data_file_path = data_file_path + '_new'
    if not os.path.exists(new_data_file_path) and not os.path.exists(
            data_file_path):
        LOGGER.warning('microdata: cannot crawl data from this url')
        return False, None, None, None

    def _load_items(path):
        with open(path, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
            # The detection pass consumed the stream; rewind so the
            # microdata parser does not start at EOF.
            f.seek(0)
            return microdata.get_items(f, encoding)

    items = None
    if os.path.exists(new_data_file_path):
        items = _load_items(new_data_file_path)
    if not items:
        # The new crawl is useless: discard it (only when there is an older
        # file to fall back on) and parse the older file instead.
        if os.path.exists(new_data_file_path) and os.path.exists(
                data_file_path):
            os.remove(new_data_file_path)
        if os.path.exists(data_file_path):
            items = _load_items(data_file_path)
    else:
        # The new crawl has microdata: promote it to be the current file.
        if os.path.exists(data_file_path):
            os.remove(data_file_path)
        os.rename(new_data_file_path, data_file_path)

    products = []
    for item in items or []:
        item = json.loads(item.json())
        # Items without an itemtype have no 'type' entry at all.
        types = item.get('type') or []
        properties = item.get('properties') or {}
        offers = properties.get('offers')
        if not types or types[0] != schema_product_type or not offers:
            continue

        success_flag = True
        product_name = None
        product_price = None
        product_currency = None
        try:
            product_name = properties.get('name')[0]
        except lookup_errors as e:
            LOGGER.warning('microdata: ' + str(e))
        try:
            product_price = offers[0].get('properties').get('price')[0]
        except lookup_errors as e:
            LOGGER.warning('microdata: ' + str(e))
        try:
            product_currency = offers[0].get('properties').get(
                'priceCurrency')[0]
        except lookup_errors as e:
            LOGGER.warning('microdata: ' + str(e))

        if product_price:
            products.append({
                'name': product_name,
                'price': price_formatter(product_price)[0],
                'currency': product_currency,
            })

    if not products:
        LOGGER.info('[FAIL] microdata: not found')
        return success_flag, None, None, None
    if len(products) == 1:
        product = products[0]
    else:
        product = find_best_match(products, site.get('product_name'))

    LOGGER.info('[RESULT] microdata: ' + str(product.get('currency')) + ' ' +
                str(product.get('price')))
    return success_flag, product.get('price'), product.get('currency'), \
        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')