def fourth_layer_media_processor(url, img_xpath, venue_id, exceptions_log_file):
    """Download every image found on the ``<url>/media`` page.

    The page is fetched with ``get_html``, parsed with lxml, and each element
    matched by ``img_xpath`` contributes its ``src`` attribute. Every collected
    URL is handed to ``img_download`` under the file name
    ``<venue_id>_<index>``, where ``index`` counts the collected URLs from 0.

    Relies on the names ``get_html``, ``user_agent_header``, ``etree``,
    ``StringIO``, ``parser``, ``img_download`` and ``img_dir`` being defined
    outside this function.

    Args:
        url: Base URL of the venue page; ``/media`` is appended to it.
        img_xpath: XPath expression selecting the image elements.
        venue_id: Identifier used as the prefix of each image file name.
        exceptions_log_file: Passed through to ``get_html`` and
            ``img_download``.

    Returns:
        The list of image URLs that were passed to ``img_download``, in
        document order.
    """
    media_url = '%s/media' % url
    html = get_html(media_url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
    tree = etree.parse(StringIO(html), parser)

    # Use .get() so an element without a ``src`` attribute is skipped instead
    # of raising KeyError; empty values are skipped too, matching how the
    # other layer processors in this file treat image URLs.
    img_url_list = []
    for img_tag in tree.xpath(img_xpath):
        img_url = img_tag.attrib.get('src')
        if img_url:
            img_url_list.append(img_url)

    for img_num, img_url in enumerate(img_url_list):
        img_file_name = '%s_%s' % (venue_id, img_num)
        img_download(img_url, img_file_name, img_dir, exceptions_log_file)
    return img_url_list
def third_layer_data_slice_processor(args):
    layer_description, items_urls_and_descriptions, urls_list_slice, start_venue_id, exceptions_log_file = args
    result_dict = dict()
    venue_id = start_venue_id
    for url_local_part in urls_list_slice:
        url = root_site_url + '/' + url_local_part
        venue_description = dict()
        venue_description['id'] = venue_id

        html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
        #html = get_saved_content('fundays_third_layer_requests.txt')
        parser = etree.HTMLParser(remove_blank_text=True, strip_cdata=False)
        tree = etree.parse(StringIO(html), parser)
        #tree.write('fundays_third_layer_pretty.html', pretty_print=True)

        info_container_list = tree.xpath(layer_description['info_container_xpath'])
        #print info_container_list
        
        #address_node = info_container_list[0].xpath('//div[@itemprop="address"]')[0]
        #print get_xpath_to_root(address_node)

        print 'info_container_list: ', info_container_list
        if info_container_list != []:
            info_container = info_container_list[0]
            address_node_container = info_container.xpath(layer_description['address_node_xpath'])
            advert_text = ''
            if address_node_container != []:
                address_node = address_node_container[0]
                #print address_node
                if address_node.tail != None:
                    advert_text = advert_text + address_node.tail + ' '
                current_advert_node = address_node.getnext()
                while(current_advert_node != None and current_advert_node.tag != 'h3'):
                    if current_advert_node.text != None:
                        advert_text = advert_text + current_advert_node.text + ' '
                    if current_advert_node.tail != None:
                        advert_text = advert_text + current_advert_node.tail + ' '
                    strong_subnode_container = current_advert_node.xpath('strong')
                    if strong_subnode_container != []:
                        for strong_subnode in strong_subnode_container:
                            if strong_subnode.text != None:
                                advert_text = advert_text + strong_subnode.text + ' '
                            if strong_subnode.tail != None:
                                advert_text = advert_text + strong_subnode.tail + ' '
                    current_advert_node = current_advert_node.getnext()

                #print advert_text

                venue_description['advert_text'] = advert_text
                        
            contacts_header_list = info_container.xpath(layer_description['contacts_header_xpath'])
            for contacts_header in contacts_header_list:
                if 'Contact' in contacts_header.text:
                    #print contacts_header.text[len('Contact') + 1:]
                    venue_description['name'] = contacts_header.text[len('Contact') + 1:]
            phone_container = info_container.xpath(layer_description['phone_xpath'])
            if phone_container != []:
                phone = phone_container[0].text
                #print phone
                venue_description['phone'] = phone
            info_header_tags = info_container.xpath(layer_description['info_header_tag_xpath'])
            for info_header_tag in info_header_tags:
                header_text = info_header_tag.text.strip()
                if header_text == 'International:':
                    international_phone = info_header_tag.tail.strip()
                    #print international_phone
                    venue_description['international_phone'] = international_phone
                if header_text == 'Email:':
                    email = get_next_text(info_header_tag)
                    #print email
                    venue_description['email'] = email
                if header_text == 'Website:':
                    site = get_next_href(info_header_tag)
                    #print site
                    venue_description['site'] = site
                if header_text == 'Location Map:':
                    location_link = get_next_href(info_header_tag)
                    if location_link != None:
                        location_coordinates = location_link[location_link.find('=') + 1:].split(',')
                        #print location_coordinates
                        venue_description['location_coordinates'] = location_coordinates
            region_container = info_container.xpath(layer_description['region_address_xpath'])
            if region_container != []:
                venue_description['region'] = ''
                if region_container[0].text != None:
                    region = region_container[0].text.strip()
                    venue_description['region'] = region
                    #print region
            street_address_container = info_container.xpath(layer_description['street_address_xpath'])
            if street_address_container != []:
                street_address = ''
                if street_address_container[0].text != None:
                    street_address = street_address_container[0].text.strip()
                    #print street_address
                    venue_description['street_address'] = street_address

        list_detail_container = tree.xpath(layer_description['list_detail_xpath'])

        img_container_list = tree.xpath(layer_description['img_xpath'])
        print 'img_container_list: ', img_container_list
        img_num = 0
        img_list = list()
        for img_container in img_container_list:
            print 'img_container: ', img_container
            if img_container != None:
                img_url = img_container.attrib.get('src')
                #print img_url
                if img_url != None and img_url != '':
                    img_list.append(img_url)
                    img_file_name = '%s_%s' % (str(venue_id), str(img_num))
                    img_num = img_num + 1
                    img_download(root_site_url + img_url, img_file_name, img_dir, exceptions_log_file)
        venue_description['img_urls'] = img_list
        #items_urls_and_descriptions[url_local_part]['description'] = venue_description

        categories = items_urls_and_descriptions[url_local_part]['categories']
        #categories = {}

        #items_urls_and_descriptions[url_local_part].update({'description' : venue_description})
        #items_urls_and_descriptions[url_local_part] = {'categories' : categories, 'description' : venue_description}
        result_dict[url_local_part] = {'categories' : categories, 'description' : venue_description}
        #print venue_description
        venue_id = venue_id + 1
        print start_venue_id
        print result_dict[url_local_part]
    #return items_urls_and_descriptions
    #queue.put(result_dict)
    return result_dict
def third_layer_data_slice_processor(args):
    result_dict = dict()
    layer_description, items_urls_and_descriptions, urls_list_slice, start_venue_id, exceptions_log_file = args
    venue_id = start_venue_id
    for local_url_part in urls_list_slice:
        full_description = dict()
        full_description["id"] = venue_id
        url = root_site_url + local_url_part
        html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(html), parser)

        contacts_container = tree.xpath(layer_description["contacts_container_xpath"])
        name_container = tree.xpath(layer_description["name_xpath"])
        if name_container != []:
            name_node = name_container[0]
            full_description["name"] = name_node.text
        if contacts_container != []:
            print contacts_container
            contacts_container_node = contacts_container[0]
            address_components = contacts_container_node.xpath(layer_description["address_components_xpath"])
            address = ""
            if address_components != []:
                for address_component in address_components:
                    address = address + address_component.text + " "
            print address
            full_description["address"] = address
            phone_node_container = contacts_container_node.xpath(layer_description["phone_xpath"])
            full_description["phone"] = ""
            if phone_node_container != []:
                phone_node = phone_node_container[0]
                print phone_node.text
                full_description["phone"] = phone_node.text
            email_node_container = contacts_container_node.xpath(layer_description["email_xpath"])
            full_description["email"] = ""
            if email_node_container != []:
                email_node = email_node_container[0]
                print email_node.text
                full_description["email"] = email_node.text
            web_node_container = contacts_container_node.xpath(layer_description["web_xpath"])
            full_description["web"] = ""
            if web_node_container != []:
                web_node = web_node_container[0]
                print web_node.text
                full_description["web"] = web_node.text
            text = ""
            # text_node_container = contacts_container_node.xpath(layer_description['text_xpath'])
        text_node_container = tree.xpath(layer_description["text_xpath"])
        if text_node_container != []:
            text_node = text_node_container[0]
            text = text = text_node.text
            for text_part_node in text_node.xpath(layer_description["text_other_parts_xpath"]):
                if text_part_node.tail != None:
                    text = text + " " + text_part_node.tail
            print text
            full_description["text"] = text
        social_shares_container = tree.xpath(layer_description["social_shares_container_xpath"])
        if social_shares_container != []:
            social_shares_node = social_shares_container[0]
            facebook_container = social_shares_node.xpath(layer_description["facebook_xpath"])
            if facebook_container != []:
                print facebook_container[0].attrib["href"]
                full_description["facebook_link"] = facebook_container[0].attrib["href"]
            twitter_container = social_shares_node.xpath(layer_description["twitter_xpath"])
            if twitter_container != []:
                print twitter_container[0].attrib["href"]
                full_description["twitter_link"] = twitter_container[0].attrib["href"]
            google_container = social_shares_node.xpath(layer_description["google_xpath"])
            if google_container != []:
                print google_container[0].attrib["href"]
                full_description["gplus_link"] = google_container[0].attrib["href"]
                """
        location_container = tree.xpath(layer_description['location_container_xpath'])
        if location_container != []:
            location_node = location_container[0]
            latitude_container = location_node.xpath(layer_description['latitude_xpath'])
            if latitude_container != []:
                latitude_value = latitude_container[0].get('value')
                if latitude_value != None:
                    print latitude_value
                    full_description['latitude'] = latitude_value
            longitude_container = location_node.xpath(layer_description['longitude_xpath'])
            if longitude_container != []:
                longitude_value = longitude_container[0].get('value')
                if longitude_value != None:
                    print longitude_value
                    full_description['longitude'] = longitude_value
                    """
        latitude_container = tree.xpath(layer_description["latitude_xpath"])
        if latitude_container != []:
            latitude_value = latitude_container[0].get("value")
            if latitude_value != None:
                print latitude_value
                full_description["latitude"] = latitude_value
        longitude_container = tree.xpath(layer_description["longitude_xpath"])
        if longitude_container != []:
            longitude_value = longitude_container[0].get("value")
            if longitude_value != None:
                print longitude_value
                full_description["longitude"] = longitude_value

        img_container_list = tree.xpath(layer_description["img_container_xpath"])
        if img_container_list != []:
            img_container = img_container_list[0]
            img_num = 0
            img_list = list()
            for img_node in img_container.xpath(layer_description["img_node_xpath"]):
                img_url = img_node.attrib.get("full")
                print img_url
                if img_url != None:
                    if img_url != "":
                        img_list.append(img_url)
                        img_file_name = "%s_%s" % (str(venue_id), str(img_num))
                        img_num = img_num + 1
                        img_download(img_url, img_file_name, img_dir, exceptions_log_file)
            full_description["img_urls"] = img_list

        short_description = items_urls_and_descriptions[local_url_part]["short_description"]
        result_dict[local_url_part] = {"short_description": short_description, "full_description": full_description}
        # print items_urls_and_descriptions[local_url_part]
        venue_id = venue_id + 1
    return result_dict