コード例 #1
0
ファイル: scrape_xeno_canto.py プロジェクト: tompt/scra_pe2
def main_loop():
    """Crawl the paginated search results and fetch sounds for each entry.

    Calls ``Downloader.start()``, then requests ``search + str(page)`` for
    page = 1, 2, ... and parses every response with ``lxml.html``. For each
    ``<span class="common-name">`` the ``href`` of its first ``<a>`` child
    is resolved against ``search`` and passed to ``get_sounds``. The loop
    ends at the first page that contains no such span; afterwards
    ``Downloader.end()`` and ``Downloader.wait_for_pending_workers()`` are
    called.

    Raises:
        IndexError: if a ``common-name`` span has no ``<a>`` child.
        KeyError: if that anchor carries no ``href`` attribute.
    """
    Downloader.start()

    page = 1
    while True:
        # Build one string: with the Python 2 print statement (urllib2 and
        # urlparse are Python 2 modules) a parenthesised pair would be
        # printed as a tuple repr instead of "page: N".
        print("page: %s" % page)
        result = urllib2.urlopen(search + str(page)).read()
        root = lxml.html.fromstring(result)

        nodes = root.xpath("//span[@class='common-name']")
        if not nodes:
            # No matching spans on this page: stop paging.
            break

        for node in nodes:
            anchor = node.xpath("./a")[0]
            # Resolve the link against the search URL in case it is relative.
            url = urlparse.urljoin(search, anchor.attrib["href"])
            get_sounds(url)

        page += 1

    Downloader.end()
    Downloader.wait_for_pending_workers()
コード例 #2
0
def main_loop():
	Downloader.start()

	page = 1
	while True:
		print "page:", page
		result = urllib2.urlopen(search + str(page)).read()

		root = lxml.html.fromstring(result)

		nodes = root.xpath("//span[@class='common-name']")

		if len(nodes) == 0:
			break

		for node in nodes:
			anchor = node.xpath("./a")[0]
			text = anchor.text_content()
			href = anchor.attrib["href"]
			url = urlparse.urljoin(search, href)
			get_sounds(url)
#			print "*****************"
#			break

		page += 1
#		break

	Downloader.end()
	Downloader.wait_for_pending_workers()
コード例 #3
0
ファイル: scrape-ibc.py プロジェクト: tompt/scra_pe2
def main_loop():
    Downloader.start()

    total_species = 0
    total_species_with_sound = 0

    result = urllib2.urlopen(search).read()

    root = lxml.html.fromstring(result)

    nodes = root.xpath("//ul[@id='checklist-list']/li")
    #nodes = root.xpath("//div[@class='view view-sounds-in-species']/div/ul/li")
    #nodes = root.xpath("//div/div/ul/li")

    FF = True

    for node in nodes:

        # **************** Fast Forward to 7271

        # *****************
        total_species += 1
        sound_total = node.xpath("./span/span[@class='sound-total']")
        sound_total_value = 0
        if sound_total != None and len(sound_total) == 1:
            sound_total_value = int(sound_total[0].text_content())
            print sound_total_value

        if sound_total_value == 0:
            continue

        href = node.xpath("./a")[0].attrib["href"]
        url = urlparse.urljoin(search, href)
        print url
        #		path = urlparse.urlparse(url).path
        #		if FF == True and os.path.basename(path) != "3143":
        #			print path
        #			continue

        #		if os.path.basename(path) == "3143":
        #			FF = False

        common_name = node.xpath(
            "./a/span[@class='english']")[0].text_content()
        scientific_name = node.xpath(
            "./a/span[@class='scientific']")[0].text_content()
        print common_name, scientific_name
        data = {"scientificName": scientific_name, "commonName": common_name}

        get_sounds(url, data)

        total_species_with_sound += 1


#		break

    print "Total Species =", total_species
    print "Total Species with sounds =", total_species_with_sound

    Downloader.end()
    Downloader.wait_for_pending_workers()
コード例 #4
0
def main_loop():
	Downloader.start()

	total_species = 0
	total_species_with_sound = 0

	result = urllib2.urlopen(search).read()

	root = lxml.html.fromstring(result)

	nodes = root.xpath("//ul[@id='checklist-list']/li")
	#nodes = root.xpath("//div[@class='view view-sounds-in-species']/div/ul/li")
	#nodes = root.xpath("//div/div/ul/li")



	FF = True

	for node in nodes:

		# **************** Fast Forward to 7271

		# *****************
		total_species += 1
		sound_total = node.xpath("./span/span[@class='sound-total']")
		sound_total_value = 0
		if sound_total != None and len(sound_total) == 1:
			sound_total_value = int(sound_total[0].text_content())
			print sound_total_value

		if sound_total_value == 0:
			continue

		href = node.xpath("./a")[0].attrib["href"]
		url = urlparse.urljoin(search, href)
		print url
#		path = urlparse.urlparse(url).path
#		if FF == True and os.path.basename(path) != "3143":
#			print path
#			continue

#		if os.path.basename(path) == "3143":
#			FF = False

		common_name = node.xpath("./a/span[@class='english']")[0].text_content()
		scientific_name = node.xpath("./a/span[@class='scientific']")[0].text_content()
		print common_name, scientific_name
		data = {"scientificName":scientific_name, "commonName":common_name}

		get_sounds(url, data)

		total_species_with_sound += 1

#		break


	print "Total Species =", total_species
	print "Total Species with sounds =", total_species_with_sound

	Downloader.end()
	Downloader.wait_for_pending_workers()