def main_loop():
    """Walk the numbered result pages and pass every linked entry to get_sounds.

    Pages are fetched from ``search + str(page)``, starting at page 1. Every
    ``<span class="common-name">`` found on a page is expected to contain a
    link; its ``href`` is resolved against ``search`` and handed to
    ``get_sounds``. The walk stops at the first page with no such span.

    ``Downloader.start()`` is called before the walk; ``Downloader.end()``
    and ``Downloader.wait_for_pending_workers()`` are called after it.

    Raises:
        IndexError: a matching span has no ``<a>`` child.
        KeyError: that anchor has no ``href`` attribute.
        Any error raised by ``urllib2.urlopen`` propagates unchanged.
    """
    # Local import: keeps this function self-contained with respect to the
    # file's import block.
    from contextlib import closing

    Downloader.start()
    page = 1
    while True:
        print("page:", page)
        # urllib2 responses are not context managers in Python 2; closing()
        # makes sure the connection is released once the body has been read.
        with closing(urllib2.urlopen(search + str(page))) as response:
            result = response.read()
        root = lxml.html.fromstring(result)
        nodes = root.xpath("//span[@class='common-name']")
        if not nodes:
            # An empty page marks the end of the listing.
            break
        for node in nodes:
            anchor = node.xpath("./a")[0]
            # href may be relative, so resolve it against the search URL.
            url = urlparse.urljoin(search, anchor.attrib["href"])
            get_sounds(url)
        page += 1
    Downloader.end()
    Downloader.wait_for_pending_workers()
def main_loop(): Downloader.start() page = 1 while True: print "page:", page result = urllib2.urlopen(search + str(page)).read() root = lxml.html.fromstring(result) nodes = root.xpath("//span[@class='common-name']") if len(nodes) == 0: break for node in nodes: anchor = node.xpath("./a")[0] text = anchor.text_content() href = anchor.attrib["href"] url = urlparse.urljoin(search, href) get_sounds(url) # print "*****************" # break page += 1 # break Downloader.end() Downloader.wait_for_pending_workers()
def main_loop(): Downloader.start() total_species = 0 total_species_with_sound = 0 result = urllib2.urlopen(search).read() root = lxml.html.fromstring(result) nodes = root.xpath("//ul[@id='checklist-list']/li") #nodes = root.xpath("//div[@class='view view-sounds-in-species']/div/ul/li") #nodes = root.xpath("//div/div/ul/li") FF = True for node in nodes: # **************** Fast Forward to 7271 # ***************** total_species += 1 sound_total = node.xpath("./span/span[@class='sound-total']") sound_total_value = 0 if sound_total != None and len(sound_total) == 1: sound_total_value = int(sound_total[0].text_content()) print sound_total_value if sound_total_value == 0: continue href = node.xpath("./a")[0].attrib["href"] url = urlparse.urljoin(search, href) print url # path = urlparse.urlparse(url).path # if FF == True and os.path.basename(path) != "3143": # print path # continue # if os.path.basename(path) == "3143": # FF = False common_name = node.xpath( "./a/span[@class='english']")[0].text_content() scientific_name = node.xpath( "./a/span[@class='scientific']")[0].text_content() print common_name, scientific_name data = {"scientificName": scientific_name, "commonName": common_name} get_sounds(url, data) total_species_with_sound += 1 # break print "Total Species =", total_species print "Total Species with sounds =", total_species_with_sound Downloader.end() Downloader.wait_for_pending_workers()
def main_loop(): Downloader.start() total_species = 0 total_species_with_sound = 0 result = urllib2.urlopen(search).read() root = lxml.html.fromstring(result) nodes = root.xpath("//ul[@id='checklist-list']/li") #nodes = root.xpath("//div[@class='view view-sounds-in-species']/div/ul/li") #nodes = root.xpath("//div/div/ul/li") FF = True for node in nodes: # **************** Fast Forward to 7271 # ***************** total_species += 1 sound_total = node.xpath("./span/span[@class='sound-total']") sound_total_value = 0 if sound_total != None and len(sound_total) == 1: sound_total_value = int(sound_total[0].text_content()) print sound_total_value if sound_total_value == 0: continue href = node.xpath("./a")[0].attrib["href"] url = urlparse.urljoin(search, href) print url # path = urlparse.urlparse(url).path # if FF == True and os.path.basename(path) != "3143": # print path # continue # if os.path.basename(path) == "3143": # FF = False common_name = node.xpath("./a/span[@class='english']")[0].text_content() scientific_name = node.xpath("./a/span[@class='scientific']")[0].text_content() print common_name, scientific_name data = {"scientificName":scientific_name, "commonName":common_name} get_sounds(url, data) total_species_with_sound += 1 # break print "Total Species =", total_species print "Total Species with sounds =", total_species_with_sound Downloader.end() Downloader.wait_for_pending_workers()