Esempio n. 1
0
def get_unique_search_terms(inputs_map):
    """
    Index every search term and product name found across all inputs.

    Each raw term is normalised by dropping non-ASCII characters, stripping
    surrounding whitespace and replacing spaces with ``+``.

    Args:
        inputs_map: Mapping of barcode to the input element it belongs to.
            Each element is queried for
            ``.//search-terms/search-term/value`` children and a ``name``
            child.

    Returns:
        dict mapping a normalised term to a set of
        ``(term, kind, barcode)`` tuples, where ``kind`` is either
        ``'search-term'`` or ``'product-name'``.
    """

    def _normalise(raw):
        # Same pipeline for search terms and product names.
        ascii_only = raw.encode('ascii', 'ignore').decode('utf-8')
        return str(ascii_only).strip().replace(' ', '+')

    terms_index = {}
    for barcode, _input in inputs_map.items():
        value_elems = get_child_elements(
            _input, './/search-terms/search-term/value')
        raw_terms = [get_value(value_elem) for value_elem in value_elems]

        for raw_term in raw_terms:
            term = _normalise(raw_term)
            terms_index.setdefault(term, set()).add(
                (term, 'search-term', barcode))

        # The product name of the input is searched for as well.
        name_term = _normalise(_input.find('name').text)
        terms_index.setdefault(name_term, set()).add(
            (name_term, 'product-name', barcode))

    return terms_index
Esempio n. 2
0
def map_products_inputs(site_results, inputs_map):
    """
    Group the products found on a site by input barcode and search item.

    Args:
        site_results: Mapping of barcode to an iterable of
            ``(product, search_infos)`` pairs. Every entry of
            ``search_infos`` is indexed positionally as
            ``(term, type, score, row_num, page_num)``.
        inputs_map: Mapping of barcode to the input element it belongs to.

    Returns:
        dict mapping each barcode of ``inputs_map`` to a dict keyed by
        ``(term, type)`` whose values are lists of
        ``(product, score, row_num, page_num)`` tuples. Barcodes that are
        absent from ``site_results`` map to an empty dict.

    Raises:
        KeyError: If a result refers to a ``(term, type)`` pair that was not
            derived from the input's own search terms or product name.
    """
    products_by_barcode = {}
    for barcode, _input in inputs_map.items():
        if barcode not in site_results:
            # Nothing was found for this input on the site.
            products_by_barcode[barcode] = {}
            continue

        value_elems = get_child_elements(
            _input, './/search-terms/search-term/value')
        raw_terms = [get_value(value_elem) for value_elem in value_elems]

        # Pre-register every key this input can legitimately produce.
        by_search = {
            (convert_search_item(raw_term), 'search-term'): []
            for raw_term in raw_terms
        }
        name_term = convert_search_item(_input.find('name').text)
        by_search[(name_term, 'product-name')] = []

        for product, search_infos in site_results[barcode]:
            for info in search_infos:
                term, kind, score, row_num, page_num = (
                    info[0], info[1], info[2], info[3], info[4])
                by_search[(term, kind)].append(
                    (product, score, row_num, page_num))

        products_by_barcode[barcode] = by_search
    return products_by_barcode
Esempio n. 3
0
def get_products_details(resp, site):
    """
    Extract the configured product fields from a product page response.

    The fields to extract come from ``site_config[site]['collections']``;
    each collection entry supplies ``map`` (the key stored in the result),
    ``multiple`` (``'yes'`` for list-valued fields) and ``value`` (the
    locator handed to the element lookup helpers).

    Extraction is best-effort: a failure while processing one collection is
    logged at debug level and that field is skipped, so the remaining fields
    are still collected.

    Args:
        resp: HTTP response object; ``resp.content`` is parsed as HTML and
            ``resp.url`` is recorded as the product URL.
        site: Key of the site in ``site_config``.

    Returns:
        dict that always contains ``'prod_url'`` and ``'id'`` plus one entry
        per collection that could be extracted. Relative ``'image-urls'``
        values are prefixed with the site's ``base_url``.
    """
    # Function-scope import: the module's import block is not visible here.
    import logging

    html = parse_html(resp.content)
    config = site_config[site]
    collections = config['collections']
    prod_url = str(resp.url).strip()

    details = {'prod_url': prod_url, 'id': get_id(prod_url)}
    for collection in collections:
        _map = collection['map']
        _multiple = collection['multiple']
        _value = collection['value']

        try:
            if _map == 'image-urls' and 'img_embedded' in config:
                # Images are embedded as JSON inside the located element:
                # keys[0] selects the image list, keys[1] the path of each
                # image, whose first character is dropped before the base
                # URL is prepended.
                e = get_child_element(html, _value)
                data_config = json.loads(get_value(e).strip())
                keys = config['img_embedded']
                images = [
                    config['base_url'] + img[keys[1]][1:]
                    for img in data_config[keys[0]]
                ]
                if images:
                    details[_map] = images

            elif _multiple == 'yes':
                values = [
                    get_value(e).strip()
                    for e in get_child_elements(html, _value)
                ]
                if values:
                    details[_map] = values
            else:
                e = get_child_element(html, _value)
                if e is not None:
                    details[_map] = get_value(e).strip()

            # Make image URLs absolute. The field may be missing (nothing
            # was found) or a single string (non-multiple collection); both
            # cases are handled explicitly instead of relying on a swallowed
            # exception.
            if _map == 'image-urls' and _map in details:
                urls = details[_map]
                if isinstance(urls, str):
                    if not urls.startswith('http'):
                        details[_map] = config['base_url'] + urls
                else:
                    details[_map] = [
                        url if url.startswith('http')
                        else config['base_url'] + url
                        for url in urls
                    ]

        except Exception:
            # Keep the best-effort contract, but do not swallow
            # KeyboardInterrupt/SystemExit and leave a trace for debugging.
            logging.getLogger(__name__).debug(
                "Could not extract %r from %s", _map, prod_url,
                exc_info=True)

    return details
Esempio n. 4
0
def process_page(base_url, html, st, row_loc, prod_link, page_num, **kwargs):
    """
    Collect the product links listed on one search result page.

    Args:
        base_url: Prefix added to links that do not start with ``http``.
        html: Markup of the search result page.
        st: Search term that produced the page; copied into every result.
        row_loc: Locator selecting the result rows on the page.
        prod_link: Locator, relative to a row, selecting the product link.
        page_num: Number of the page; copied into every result.
        **kwargs: Accepted and ignored.

    Returns:
        set of ``(st, url, row_index, page_num)`` tuples, one per row, where
        ``row_index`` is the zero-based position of the row on the page.
    """
    document = parse_html(html)
    result_rows = get_child_elements(document, row_loc)

    found = set()
    for position, row in enumerate(result_rows):
        href = get_value(get_child_element(row, prod_link))
        url = href if href.startswith('http') else base_url + href

        # Cut the URL at the site's end marker when one is configured.
        site = get_site(url)
        if site and 'prod_end_flag' in site_config[site]:
            cut_at = url.find(site_config[site]['prod_end_flag'])
            if cut_at >= 0:
                url = url[:cut_at]

        found.add((st, str(url).strip(), position, page_num))

    return found
Esempio n. 5
0
def _report_add_prices(match_elem, product, cur_type):
    """Append the ``prices`` subtree (selling and standard price) to a match."""
    prices_elem = add_child(match_elem, 'prices')

    selling_price_elem = add_child(prices_elem, 'selling_price')
    selling_price_cur = add_child(selling_price_elem, 'currency')
    # Only the selling price honours a product-specific currency.
    selling_price_cur.text = product.get('currency', cur_type)
    selling_price_val = add_child(selling_price_elem, 'value')
    if 'selling_price' in product:
        selling_price_val.text = product['selling_price']

    standard_price_elem = add_child(prices_elem, 'standard_price')
    standard_price_cur = add_child(standard_price_elem, 'currency')
    standard_price_cur.text = cur_type
    standard_price_val = add_child(standard_price_elem, 'value')
    if 'standard_price' in product:
        standard_price_val.text = product['standard_price']


def _report_add_product_summary(data_elem, product, score, site):
    """Append the ``product-summary`` subtree for the site's summary fields."""
    prod_summ_elem = add_child(data_elem, 'product-summary')
    for summary_item in site_config[site]['collections']:
        if summary_item['type'] != 'product_summary':
            continue
        field = summary_item['map']
        if field not in product:
            continue

        if summary_item['multiple'] == 'yes':
            # List-valued field: a container named after the field holding
            # one child per value; the child tag is the field name without
            # its last character.
            level1_elem = add_child(prod_summ_elem, field)
            for el in product[field]:
                child_elem = add_child(level1_elem, field[:-1])
                add_child(child_elem, 'value').text = el
        else:
            child_elem = add_child(prod_summ_elem, field)
            data_match_elem = add_child(child_elem, 'data-match')
            add_child(data_match_elem, 'found').text = 'true'
            add_child(data_match_elem, 'score').text = str(score)
            add_child(child_elem, 'value').text = product[field]


def _report_add_match(matches_elem, product, score, row_num, page_num,
                      screenshot_loc, pg_size, cur_type, site):
    """Append one fully populated ``match`` element to ``matches_elem``."""
    match_elem = add_child(matches_elem, 'match')
    add_child(match_elem, 'match-url').text = product['prod_url']
    add_child(match_elem, 'screenshot').text = screenshot_loc

    categories_elem = add_child(match_elem, 'categories')
    for category in product.get('categories', ()):
        category_elem = add_child(categories_elem, 'category')
        add_child(category_elem, 'value').text = category

    position_elem = add_child(match_elem, 'position')
    add_child(position_elem, 'index-on-page').text = str(row_num)
    add_child(position_elem, 'page-number').text = str(page_num)
    add_child(position_elem, 'page-size').text = str(pg_size)

    _report_add_prices(match_elem, product, cur_type)

    add_child(match_elem, 'score').text = str(score)

    data_elem = add_child(match_elem, 'data')
    _report_add_product_summary(data_elem, product, score, site)
    add_child(data_elem, 'attributes').text = ''


def generate_report_xml(site_results, site, inputs_map, rep_path):
    """
    Build the XML report for one site and save it below ``rep_path``.

    For every input a ``result`` element is created that contains the input
    element itself and an ``output/search-results`` subtree with one
    ``search-result`` per search item (the product name first, then every
    search term). Each ``search-result`` lists the matching products with
    their URL, screenshot location, categories, position, prices, score and
    product summary.

    Args:
        site_results: Per-barcode products found on the site, in the shape
            expected by ``map_products_inputs``.
        site: Key of the site in ``site_config``.
        inputs_map: Mapping of barcode to input element. Each element is
            appended to its ``result`` element.
            NOTE(review): with lxml, ``append`` re-parents the element and so
            removes it from its original tree; confirm callers expect that.
        rep_path: Directory the report is written to. It is created through
            ``create_dir`` before saving.

    Returns:
        dict mapping a product URL to the list of screenshot paths under
        which that product page is expected to be captured.
    """
    screenshots = {}
    input_search_products_map = map_products_inputs(site_results, inputs_map)

    rep_name = site_config[site]['report_name']
    cur_type = site_config[site]['currency_symbol']
    pg_size = site_config[site]['page_size']
    screenshots_dir = site_config[site]['screenshots_path']

    results_elem = create_root()
    dom_obj = create_dom(results_elem)

    for barcode, _input in inputs_map.items():
        # Default must be a dict: the value is iterated with .items() below.
        search_products_map = input_search_products_map.get(barcode, {})

        result_elem = add_child(results_elem, 'result')
        result_elem.append(_input)
        output_elem = add_child(result_elem, 'output')
        search_results_elem = add_child(output_elem, 'search-results')

        search_term_elems = get_child_elements(
            _input, './/search-terms/search-term/value')
        search_items = [
            get_value(search_term_elem)
            for search_term_elem in search_term_elems
        ]
        # The product name always comes first; index 0 is reported with
        # type 'product-name', every other index with type 'search-term'.
        product_name = _input.find('name').text
        search_items.insert(0, product_name)

        for idx, search_item in enumerate(search_items):
            result_type = 'product-name' if idx == 0 else 'search-term'
            search_result_elem = add_child(search_results_elem,
                                           'search-result',
                                           type=result_type)
            add_child(search_result_elem, 'search-item').text = search_item
            found_elem = add_child(search_result_elem, 'found')
            found_elem.text = 'false'
            matches_elem = add_child(search_result_elem, 'matches')

            for search, products in search_products_map.items():
                # Matching compares the term text only; the type stored in
                # search[1] is not taken into account.
                if search[0] != convert_search_item(search_item):
                    continue

                for product, score, row_num, page_num in products:
                    found_elem.text = 'true'

                    img_name = (product_name + '_' + search_item + '_' +
                                str(page_num) + '_' + str(row_num) + '.PNG')
                    screenshot_loc = os.path.join(rep_path, screenshots_dir,
                                                  img_name)

                    _report_add_match(matches_elem, product, score, row_num,
                                      page_num, screenshot_loc, pg_size,
                                      cur_type, site)

                    screenshots.setdefault(product['prod_url'],
                                           []).append(screenshot_loc)

    create_dir(rep_path)
    save_dom(dom_obj, os.path.join(rep_path, rep_name))
    return screenshots