def fourth_layer_media_processor(url, img_xpath, venue_id, exceptions_log_file):
    """Download the images listed on a venue's ``<url>/media`` page.

    The page is fetched with ``get_html``, parsed with the ``parser`` object
    defined outside this function, and queried with ``img_xpath``. The ``src``
    of every matched element is passed to ``img_download`` together with the
    file name ``<venue_id>_<n>``, where ``n`` counts the downloaded images
    from 0, and the target directory ``img_dir``.

    Args:
        url: Venue page URL; ``/media`` is appended to it.
        img_xpath: XPath expression selecting the image elements.
        venue_id: Value used as the prefix of each image file name.
        exceptions_log_file: Passed through to ``get_html`` and
            ``img_download``.

    Returns:
        The list of image URLs handed to ``img_download``, in the order the
        XPath query returned them.
    """
    media_url = '%s/media' % url
    html = get_html(media_url, user_agent_header, post_data={},
                    exceptions_log_file=exceptions_log_file)
    tree = etree.parse(StringIO(html), parser)

    # Matched elements with a missing or empty ``src`` are skipped rather than
    # aborting the whole page with a KeyError.
    img_url_list = []
    for img_tag in tree.xpath(img_xpath):
        img_url = img_tag.attrib.get('src')
        if img_url:
            img_url_list.append(img_url)

    for img_num, img_url in enumerate(img_url_list):
        img_file_name = '%s_%s' % (venue_id, img_num)
        img_download(img_url, img_file_name, img_dir, exceptions_log_file)
    return img_url_list
def _advert_text_after(address_node):
    """Join the text that follows ``address_node`` up to the next ``h3`` sibling."""
    parts = []
    if address_node.tail is not None:
        parts.append(address_node.tail)
    sibling = address_node.getnext()
    # Compare with ``is not None``: an lxml element's truth value depends on
    # whether it has children, so plain truthiness would end the walk early.
    while sibling is not None and sibling.tag != 'h3':
        # The sibling's own text and tail come first, followed by the text and
        # tail of each of its direct ``strong`` children.
        for node in [sibling] + sibling.xpath('strong'):
            if node.text is not None:
                parts.append(node.text)
            if node.tail is not None:
                parts.append(node.tail)
        sibling = sibling.getnext()
    # Every fragment is followed by one space, including the last one.
    return ''.join(part + ' ' for part in parts)


def _fill_venue_contacts(info_container, layer_description, venue_description):
    """Copy advert text, name, phones, e-mail, site, map and address into ``venue_description``."""
    address_nodes = info_container.xpath(layer_description['address_node_xpath'])
    if address_nodes:
        venue_description['advert_text'] = _advert_text_after(address_nodes[0])
    else:
        venue_description['advert_text'] = ''

    for contacts_header in info_container.xpath(layer_description['contacts_header_xpath']):
        header = contacts_header.text
        # A header element without text cannot carry the venue name.
        if header is not None and 'Contact' in header:
            # Drop the word 'Contact' plus the single character after it.
            venue_description['name'] = header[len('Contact') + 1:]

    phone_nodes = info_container.xpath(layer_description['phone_xpath'])
    if phone_nodes:
        venue_description['phone'] = phone_nodes[0].text

    for info_header_tag in info_container.xpath(layer_description['info_header_tag_xpath']):
        if info_header_tag.text is None:
            # Nothing to match against; skip instead of failing on ``.strip()``.
            continue
        header_text = info_header_tag.text.strip()
        if header_text == 'International:':
            # A missing tail is stored as '' like the other text fields.
            venue_description['international_phone'] = (info_header_tag.tail or '').strip()
        elif header_text == 'Email:':
            venue_description['email'] = get_next_text(info_header_tag)
        elif header_text == 'Website:':
            venue_description['site'] = get_next_href(info_header_tag)
        elif header_text == 'Location Map:':
            location_link = get_next_href(info_header_tag)
            if location_link is not None:
                # Everything after the first '=' is split on commas.
                coordinates = location_link[location_link.find('=') + 1:].split(',')
                venue_description['location_coordinates'] = coordinates

    address_fields = (
        ('region', 'region_address_xpath'),
        ('street_address', 'street_address_xpath'),
    )
    for field, xpath_key in address_fields:
        nodes = info_container.xpath(layer_description[xpath_key])
        if nodes:
            node_text = nodes[0].text
            venue_description[field] = node_text.strip() if node_text is not None else ''


def _download_venue_images(tree, img_xpath, venue_id, exceptions_log_file):
    """Download every image matched by ``img_xpath`` and return the list of their ``src`` values."""
    img_container_list = tree.xpath(img_xpath)
    print('img_container_list:  %s' % (img_container_list,))
    img_list = []
    for img_container in img_container_list:
        print('img_container:  %s' % (img_container,))
        if img_container is None:
            continue
        img_url = img_container.attrib.get('src')
        if not img_url:
            continue
        # The file index is the number of images already accepted.
        img_file_name = '%s_%s' % (venue_id, len(img_list))
        img_list.append(img_url)
        img_download(root_site_url + img_url, img_file_name, img_dir, exceptions_log_file)
    return img_list


def third_layer_data_slice_processor(args):
    """Scrape the venue pages of one URL slice and return their descriptions.

    NOTE(review): SOURCE contains a later function with this same name; if both
    stay in one module the later definition replaces this one.
    NOTE(review): the original text had lost its indentation. Recording the
    images and the result for every URL (also when no info container is
    found) and printing inside the loop are reconstructions to be confirmed.

    Args:
        args: A 5-item sequence ``(layer_description,
            items_urls_and_descriptions, urls_list_slice, start_venue_id,
            exceptions_log_file)``. ``layer_description`` maps the
            ``*_xpath`` keys used below to XPath expressions,
            ``items_urls_and_descriptions[url]['categories']`` is copied into
            the result, and ``start_venue_id`` is the id given to the first
            URL of the slice (incremented by one per URL).

    Returns:
        dict mapping each local URL part to
        ``{'categories': ..., 'description': venue_description}``, where
        ``venue_description`` always has ``'id'`` and ``'img_urls'`` and, when
        found on the page, ``'advert_text'``, ``'name'``, ``'phone'``,
        ``'international_phone'``, ``'email'``, ``'site'``,
        ``'location_coordinates'``, ``'region'`` and ``'street_address'``.
    """
    (layer_description, items_urls_and_descriptions, urls_list_slice,
     start_venue_id, exceptions_log_file) = args
    result_dict = {}
    venue_id = start_venue_id
    for url_local_part in urls_list_slice:
        url = root_site_url + '/' + url_local_part
        venue_description = {'id': venue_id}

        html = get_html(url, user_agent_header, post_data={},
                        exceptions_log_file=exceptions_log_file)
        parser = etree.HTMLParser(remove_blank_text=True, strip_cdata=False)
        tree = etree.parse(StringIO(html), parser)

        info_container_list = tree.xpath(layer_description['info_container_xpath'])
        # Single-argument print calls give the same output as the Python 2
        # print statement they replace.
        print('info_container_list:  %s' % (info_container_list,))
        if info_container_list:
            _fill_venue_contacts(info_container_list[0], layer_description,
                                 venue_description)

        venue_description['img_urls'] = _download_venue_images(
            tree, layer_description['img_xpath'], venue_id, exceptions_log_file)

        categories = items_urls_and_descriptions[url_local_part]['categories']
        result_dict[url_local_part] = {'categories': categories,
                                       'description': venue_description}
        print(start_venue_id)
        print(result_dict[url_local_part])
        venue_id = venue_id + 1
    return result_dict
def _full_contact_details(contacts_node, layer_description):
    """Return the address, phone, email and web fields read from ``contacts_node``."""
    details = {}

    address_components = contacts_node.xpath(layer_description["address_components_xpath"])
    # Components without text are skipped; each kept one is followed by a space.
    address = "".join(
        component.text + " "
        for component in address_components
        if component.text is not None
    )
    print(address)
    details["address"] = address

    single_value_fields = (
        ("phone", "phone_xpath"),
        ("email", "email_xpath"),
        ("web", "web_xpath"),
    )
    for field, xpath_key in single_value_fields:
        nodes = contacts_node.xpath(layer_description[xpath_key])
        # Default to "" so the key exists even when the node is missing.
        details[field] = ""
        if nodes:
            print(nodes[0].text)
            details[field] = nodes[0].text
    return details


def _full_description_text(tree, layer_description):
    """Return the main text node's text joined with the tails of its sub-parts."""
    text_nodes = tree.xpath(layer_description["text_xpath"])
    if not text_nodes:
        return ""
    text_node = text_nodes[0]
    # A text node with no leading text contributes "" rather than None, so the
    # tails below can still be appended.
    parts = [text_node.text or ""]
    for part_node in text_node.xpath(layer_description["text_other_parts_xpath"]):
        if part_node.tail is not None:
            parts.append(part_node.tail)
    return " ".join(parts)


def third_layer_data_slice_processor(args):
    """Scrape the detail pages of one URL slice and return their descriptions.

    NOTE(review): SOURCE contains an earlier function with this same name; if
    both stay in one module this definition replaces the earlier one.
    NOTE(review): the original text had lost its indentation; the nesting
    below was reconstructed from which names each statement uses.

    Args:
        args: A 5-item sequence ``(layer_description,
            items_urls_and_descriptions, urls_list_slice, start_venue_id,
            exceptions_log_file)``. ``layer_description`` maps the
            ``*_xpath`` keys used below to XPath expressions,
            ``items_urls_and_descriptions[url]["short_description"]`` is
            copied into the result, and ``start_venue_id`` is the id given to
            the first URL of the slice (incremented by one per URL).

    Returns:
        dict mapping each local URL part to
        ``{"short_description": ..., "full_description": full_description}``.
        ``full_description`` always has ``"id"`` and ``"text"``; the other
        keys (``"name"``, ``"address"``, ``"phone"``, ``"email"``, ``"web"``,
        ``"facebook_link"``, ``"twitter_link"``, ``"gplus_link"``,
        ``"latitude"``, ``"longitude"``, ``"img_urls"``) are present only when
        the corresponding container is found on the page.
    """
    (layer_description, items_urls_and_descriptions, urls_list_slice,
     start_venue_id, exceptions_log_file) = args
    result_dict = {}
    venue_id = start_venue_id
    for local_url_part in urls_list_slice:
        full_description = {"id": venue_id}
        url = root_site_url + local_url_part
        html = get_html(url, user_agent_header, post_data={},
                        exceptions_log_file=exceptions_log_file)
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(html), parser)

        contacts_container = tree.xpath(layer_description["contacts_container_xpath"])
        name_container = tree.xpath(layer_description["name_xpath"])
        if name_container:
            full_description["name"] = name_container[0].text
        if contacts_container:
            # Single-argument print calls give the same output as the
            # Python 2 print statement they replace.
            print(contacts_container)
            full_description.update(
                _full_contact_details(contacts_container[0], layer_description))

        text = _full_description_text(tree, layer_description)
        print(text)
        full_description["text"] = text

        social_shares_container = tree.xpath(layer_description["social_shares_container_xpath"])
        if social_shares_container:
            social_shares_node = social_shares_container[0]
            social_links = (
                ("facebook_link", "facebook_xpath"),
                ("twitter_link", "twitter_xpath"),
                ("gplus_link", "google_xpath"),
            )
            for field, xpath_key in social_links:
                link_nodes = social_shares_node.xpath(layer_description[xpath_key])
                if not link_nodes:
                    continue
                # A link element without ``href`` is ignored instead of
                # raising KeyError.
                href = link_nodes[0].attrib.get("href")
                if href is not None:
                    print(href)
                    full_description[field] = href

        coordinates = (
            ("latitude", "latitude_xpath"),
            ("longitude", "longitude_xpath"),
        )
        for field, xpath_key in coordinates:
            value_nodes = tree.xpath(layer_description[xpath_key])
            if value_nodes:
                value = value_nodes[0].get("value")
                if value is not None:
                    print(value)
                    full_description[field] = value

        img_container_list = tree.xpath(layer_description["img_container_xpath"])
        if img_container_list:
            img_list = []
            for img_node in img_container_list[0].xpath(layer_description["img_node_xpath"]):
                # The image address is read from the ``full`` attribute.
                img_url = img_node.attrib.get("full")
                print(img_url)
                if img_url:
                    # The file index is the number of images already accepted.
                    img_file_name = "%s_%s" % (venue_id, len(img_list))
                    img_list.append(img_url)
                    img_download(img_url, img_file_name, img_dir, exceptions_log_file)
            full_description["img_urls"] = img_list

        short_description = items_urls_and_descriptions[local_url_part]["short_description"]
        result_dict[local_url_part] = {"short_description": short_description,
                                       "full_description": full_description}
        venue_id = venue_id + 1
    return result_dict