def first_layer_processor(layer_description, exceptions_log_file):
    next_layer_url_list = list()
    url = root_site_url + layer_description["items_urls_and_descriptions"]

    """
    html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser)
    info_container = tree.xpath(layer_description['info_container_xpath'])
    if info_container != []:
        print 'first_layer_processor: info_container != [] ', info_container 
        urls_container = info_container[0].xpath(layer_description['info_tag_xpath'])
        for url_container in urls_container:
            next_layer_url_list.append(url_container.attrib['href'])
        next_layer_url_list = list(set([next_layer_url for next_layer_url in next_layer_url_list if next_layer_url != '#']))
        for item in next_layer_url_list:
            print item
    else:
        print 'first_layer_processor: info_container == []: ', info_container 

    """
    content = get_content(url, headers_json, post_data={})

    json_dict = json.loads(content)
    html = json_dict["d"]

    all_same_type_nodes = get_all_same_type_nodes(html, layer_description["info_container_xpath"])

    # print all_same_type_nodes

    get_attrib = generate_attrib_getter("href")
    result_list = node_list_processor(all_same_type_nodes, get_attrib)
    result_list = [item for item in result_list if item.find("subcatid") != -1]
    for result in result_list:
        print "\n" * 2, "-" * 10
        print result
        next_layer_url_list.append(result)

    next_layer_url_list = list(set([next_layer_url for next_layer_url in next_layer_url_list if next_layer_url != "#"]))
    return next_layer_url_list
# Example #2
# 0
        if self._response != None:
            if content_type == 'text':
                return self._response.text
            elif content_type == 'binary':
                return self._response.content
            elif content_type == 'json':
                return self._response.json
            elif content_type == 'raw':
                return self._response.raw
            else:
                return self._response.text
        else:
            return None


# Accessor produced by generate_attrib_getter for the 'href' attribute.
get_href_attrib = generate_attrib_getter('href')


def next_layer_url_maker(link_node):
    """Return the next layer's URL for ``link_node``.

    The node's href is read through ``get_href_attrib``, its first character
    is dropped, and the remainder is appended to ``base_url``.

    NOTE(review): the dropped character is presumably a leading "/" or "."
    that would otherwise be duplicated after ``base_url`` - confirm.
    """
    return base_url + get_href_attrib(link_node)[1:]


def generate_second_layer_info_processor(second_layer_trade_name_xpath, second_layer_product_description_text_xpath, next_layer_link_xpath, third_layer_processor):
    def second_layer_info_processor(product_description):
        product_name = product_description.xpath(second_layer_trade_name_xpath)
        print product_name[0].text.encode('utf-8')
        #product_description_dict['Produkt'] = product_name[0].text.encode('utf-8')
        product_description_text_container = product_description.xpath(second_layer_product_description_text_xpath)
        product_description_text_subcontainer = product_description_text_container[0]
        #print 'product_description_text_subcontainer: ', product_description_text_subcontainer
        child_list = product_description_text_subcontainer.getchildren()
        product_description_text = ''
        if child_list == []:
            product_description_text = product_description_text_subcontainer.text