Example #1
import datetime
import random
import time
import urllib.error
import urllib.request

from sqlalchemy.orm import sessionmaker

# check_site_by_id, config, mysql_engine, and the Site model come from the
# surrounding project.


def crawl_data(url, site_id, max_retry_times):
    """
    Crawl the URL and save the raw response data locally.
    :param url: URL to fetch
    :param site_id: site id, used as the file name
    :param max_retry_times: maximum number of retries
    :return: True on success, else False
    """
    success_flag = False
    if not check_site_by_id(site_id):
        return success_flag

    current_retry_count = 0
    while current_retry_count < max_retry_times:
        try:
            req_headers = {
                'User-Agent': ('Mozilla/4.0 (compatible; MSIE 8.0; '
                               'Windows NT 6.0; Trident/4.0)')
            }
            req = urllib.request.Request(url, headers=req_headers)
            response = urllib.request.urlopen(req, timeout=60)
            content = response.read()
            # Save to file
            data_file_path = config.URL_CRAWLED_DATA_DIR + str(site_id) + '_new'
            with open(data_file_path, 'wb') as f:
                f.write(content)
            # update last indexed time in DB(haystack/site)
            engine = mysql_engine()
            db_session = sessionmaker(bind=engine)
            session = db_session()
            last_indexed = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            session.query(Site).filter(Site.site_id == site_id).update(
                {Site.last_indexed: last_indexed})
            session.commit()
            session.close()
            success_flag = True
            break
        except urllib.error.HTTPError as e:
            print(e.code, e.reason)
            current_retry_count += 1
            print('Retry:', current_retry_count, '/', max_retry_times)
        except urllib.error.URLError as e:
            print(e.reason)
            current_retry_count += 1
            print('Retry:', current_retry_count, '/', max_retry_times)
        except ConnectionResetError:
            print('ConnectionResetError')
            time.sleep(random.uniform(0, 2))  # brief jittered back-off
            current_retry_count += 1
            print('Retry:', current_retry_count, '/', max_retry_times)
        except Exception as e:
            print('Unexpected exception:', str(e))
            break
    return success_flag
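
A minimal usage sketch for crawl_data; the URL, site id, and retry count below are hypothetical values, invented here for illustration.

# Hypothetical driver: all values are illustrative only.
if crawl_data('https://example.com/product/123', site_id=42, max_retry_times=3):
    print('raw page saved for site 42')
else:
    print('crawl failed after all retries')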
Example #2
import datetime

# check_site_by_id, LOGGER, and MagicHayStack come from the surrounding
# project.


def magic_analyzer(site, image_urls):
    """
    Analyse the URL with heuristic ('magic') methods such as ML.
    :param site: dict, {'site_id', 'product_site_id', 'url', 'product_name'}
    :param image_urls: list of image URLs
    :return: [{'method': 'magic', 'price', 'currency',
               'rule': {'selector', 'weight'}, 'event_time'}, ...]
    """
    if not check_site_by_id(site.get('site_id')):
        return None

    res = None
    try:
        res = MagicHayStack.get_price(site.get('url'), image_urls)
    except Exception as e:
        LOGGER.error(str(e))

    if not res:
        LOGGER.info('[FAIL] magic: price not found')
        return None
    else:
        p = []
        for r in res:
            if r[0]:
                p.append({
                    'method': 'magic',
                    'price': r[0],
                    'currency': r[1],
                    'rule': {'selector': 'magic', 'weight': r[2]},
                    'event_time':
                        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                })
        if not p:
            LOGGER.info('[FAIL] magic: price not found')
            return None
        else:
            LOGGER.info('[SUCCESS] magic: ' + str(p))
            return p
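
A sketch of how magic_analyzer's result might be consumed; the site dict fields follow the docstring above, and every value (site id, URL, product name, image URL) is made up.

site = {'site_id': 42, 'product_site_id': 7,
        'url': 'https://example.com/product/123', 'product_name': 'Widget'}
results = magic_analyzer(site, ['https://example.com/img/main.jpg'])
if results:
    for r in results:
        print(r['currency'], r['price'], 'weight:', r['rule']['weight'])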
Example #3
import os

# check_site_by_id, config, LOGGER, and parse_price come from the surrounding
# project.


def get_price(site, rule):
    """
    Get the price according to tag selectors.
    :param site: dict, {'site_id', 'product_site_id', 'url', 'product_name'}
    :param rule: rule from price_tag_selector.json
    :return: (error_flag, price_list) where price_list is
             [{'method': 'tag pattern', 'price',
               'rule': {'selector', 'weight'}, 'event_time'}, ...] or None
    """
    error_flag = False
    price_list = []
    if not check_site_by_id(site.get('site_id')):
        return True, None
    raw_data_path = config.URL_CRAWLED_DATA_DIR
    data_file_path = raw_data_path + str(site.get('site_id'))
    new_data_file_path = data_file_path + '_new'
    if not os.path.exists(new_data_file_path) and not os.path.exists(data_file_path):
        LOGGER.warning('tag pattern: cannot crawl data from this url')
        return True, None

    if os.path.exists(new_data_file_path):
        error_flag, price_list, selector_list = parse_price(
            new_data_file_path, rule.get('containers'), rule.get('selectors'))
    else:
        selector_list = rule.get('selectors')

    if selector_list and os.path.exists(data_file_path):
        error_flag, price_list2, _ = parse_price(
            data_file_path, rule.get('containers'), selector_list)
        if price_list2:
            price_list += price_list2

    if price_list:
        LOGGER.info('[SUCCESS] tag pattern: ' + str(price_list))
        return error_flag, price_list
    else:
        LOGGER.info('[FAIL] tag pattern: no tag matched')
        return error_flag, None
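
A sketch of calling this tag-selector get_price. The rule shape (a dict with 'containers' and 'selectors') is inferred from the parse_price calls above; the selector strings and site values here are invented.

site = {'site_id': 42, 'product_site_id': 7,
        'url': 'https://example.com/product/123', 'product_name': 'Widget'}
rule = {'containers': ['div.product'],
        'selectors': ['span.price', "meta[itemprop='price']"]}
error_flag, prices = get_price(site, rule)
if prices:
    for p in prices:
        print(p['price'], 'matched by', p['rule']['selector'])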
Example #4
import datetime
import os

# check_site_by_id, config, LOGGER, find_best_match, parse_list,
# parse_single_price, and update_site come from the surrounding project.


def get_price(site, rule):
    """
    Main entry point to parse a URL.
    :param site: dict, {'site_id', 'product_site_id', 'url', 'product_name'}
    :param rule: rule from url_profiles.json
    :return: (success_flag, price, currency, event_time)
    """
    if not check_site_by_id(site.get('site_id')):
        return False, None, None, None
    raw_data_path = config.URL_CRAWLED_DATA_DIR
    data_file_path = raw_data_path + str(site.get('site_id'))
    new_data_file_path = data_file_path + '_new'
    if not os.path.exists(new_data_file_path) and not os.path.exists(data_file_path):
        LOGGER.warning('url pattern: cannot crawl data from this url')
        return False, None, None, None

    # parse url
    ret = None
    price = None
    site_type = None

    is_list_empty = len(rule['list']) == 0
    if not is_list_empty:
        if os.path.exists(new_data_file_path):
            for l in rule['list']:
                ret = parse_list(new_data_file_path, l)
                if ret:
                    site_type = l['type']
                    if os.path.exists(data_file_path):
                        os.remove(data_file_path)
                    os.rename(new_data_file_path, data_file_path)
                    break
            if not ret and os.path.exists(data_file_path):
                os.remove(new_data_file_path)
                for l in rule['list']:
                    ret = parse_list(data_file_path, l)
                    if ret:
                        break
        if ret:
            temp = find_best_match(ret, site.get('product_name'))
            price = temp.get('price')
    if not ret or is_list_empty:
        site_type = rule['item']['type']
        if os.path.exists(new_data_file_path):
            price = parse_single_price(new_data_file_path,
                                       rule['item']['price'])
            if not price:
                if os.path.exists(data_file_path):
                    os.remove(new_data_file_path)
                    price = parse_single_price(data_file_path,
                                               rule['item']['price'])
            else:
                if os.path.exists(data_file_path):
                    os.remove(data_file_path)
                os.rename(new_data_file_path, data_file_path)
        elif os.path.exists(data_file_path):
            price = parse_single_price(data_file_path, rule['item']['price'])

    if not price:
        return False, None, None, None

    site_country = rule['country']
    update_site(site.get('site_id'), site_type, site_country)
    return (True, price, rule['currency'],
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
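
A sketch of a url_profiles.json rule reconstructed from the keys this get_price reads ('list', 'item', 'country', 'currency'); every value below is illustrative, not from the real profile file.

site = {'site_id': 42, 'product_site_id': 7,
        'url': 'https://example.com/product/123', 'product_name': 'Widget'}
rule = {
    'list': [],  # empty list forces the single-item branch
    'item': {'type': 'shop', 'price': 'span.price'},  # hypothetical selector
    'country': 'US',
    'currency': 'USD',
}
ok, price, currency, event_time = get_price(site, rule)
if ok:
    print(currency, price, 'at', event_time)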
Example #5
import datetime
import json
import os

import chardet
import microdata

# check_site_by_id, config, LOGGER, find_best_match, and price_formatter come
# from the surrounding project.


def microdata_filter(site):
    """
    Extract product data from sites that embed microdata following the
    http://schema.org/Product schema.
    :param site: dict, {'site_id', 'product_site_id', 'url', 'product_name'}
    :return: (success_flag, price, currency, event_time)
    """
    success_flag = False
    if not check_site_by_id(site.get('site_id')):
        return success_flag, None, None, None

    products = []
    schema_product_type = 'http://schema.org/Product'

    data_file_path = config.URL_CRAWLED_DATA_DIR + str(site.get('site_id'))
    new_data_file_path = data_file_path + '_new'
    if not os.path.exists(new_data_file_path) and not os.path.exists(data_file_path):
        LOGGER.warning('microdata: cannot crawl data from this url')
        return False, None, None, None

    items = None
    if os.path.exists(new_data_file_path):
        with open(new_data_file_path, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
            f.seek(0)  # rewind: chardet consumed the whole stream
            items = microdata.get_items(f, encoding)
    if not items:
        if os.path.exists(new_data_file_path) and os.path.exists(
                data_file_path):
            os.remove(new_data_file_path)
        if os.path.exists(data_file_path):
            with open(data_file_path, 'rb') as f:
                encoding = chardet.detect(f.read())['encoding']
                f.seek(0)  # rewind: chardet consumed the whole stream
                items = microdata.get_items(f, encoding)
    else:
        if os.path.exists(data_file_path):
            os.remove(data_file_path)
        os.rename(new_data_file_path, data_file_path)

    for item in items:
        item = json.loads(item.json())
        # guard against items that carry no 'type' key
        if item.get('type', [None])[0] == schema_product_type and item.get(
                'properties', {}).get('offers'):
            success_flag = True
            product_name = None
            product_price = None
            product_currency = None
            try:
                product_name = item.get('properties').get('name')[0]
            except Exception as e:
                LOGGER.warning('microdata: ' + str(e))
            try:
                product_price = item.get('properties').get('offers')[0].get(
                    'properties').get('price')[0]
            except Exception as e:
                LOGGER.warning('microdata: ' + str(e))
            try:
                product_currency = item.get('properties').get('offers')[0].get(
                    'properties').get('priceCurrency')[0]
            except Exception as e:
                LOGGER.warning('microdata: ' + str(e))

            if product_price:
                products.append({
                    'name': product_name,
                    'price': price_formatter(product_price)[0],
                    'currency': product_currency,
                })

    if len(products) == 0:
        LOGGER.info('[FAIL] microdata: not found')
        return success_flag, None, None, None
    elif len(products) == 1:
        product = products[0]
    else:
        product = find_best_match(products, site.get('product_name'))

    LOGGER.info('[RESULT] microdata: ' + str(product.get('currency')) + ' ' +
                str(product.get('price')))
    return (success_flag, product.get('price'), product.get('currency'),
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
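
A sketch of driving microdata_filter; it reads the file that crawl_data saved for this site id, so a successful crawl is assumed to have happened first, and the site dict values are made up.

site = {'site_id': 42, 'product_site_id': 7,
        'url': 'https://example.com/product/123', 'product_name': 'Widget'}
ok, price, currency, event_time = microdata_filter(site)
if ok and price:
    print('microdata price:', currency, price, 'at', event_time)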