def login(base=BASE, passwd=PASSWD):
    # First pass: plain GET against the base URL to pick up session state.
    ic, rc = nvsoli.keepalive_init(base)
    ic['url'] = base
    ic = nvsoli.walkon(ic, records_container=rc)
    nvsoli.shutdown(ic)
    # Second pass: replay with the authorization cookie string, then append
    # the sub-tag extracted from the response to the cookie string.
    ic, rc = nvsoli.keepalive_init(base)
    ckstr = get_authorization_ckstr(passwd)
    ic['url'] = base
    ic['req_head']['Referer'] = base
    ic['req_head']['Cookie'] = ckstr
    ic = nvsoli.walkon(ic, records_container=rc)
    subtag = get_subtag(ic)
    ckstr = drone.append(ckstr, subtag)
    nvsoli.shutdown(ic)
    return ckstr
def get_EXIF(EXIF_url):
    info_container, records_container = taxonomy_init()
    info_container['url'] = EXIF_url
    # Fetch with linear backoff: sleep 10 * sleep_cnt seconds between
    # attempts, with the multiplier capped at 30 (at most a 300 s wait).
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            break
    # Parse the EXIF table into a plain dict.
    root = get_etree_root(info_container)
    eles = root.xpath('//table[@class="exif"]/tr')
    EXIF = {}
    for ele in eles:
        key = ele.xpath('td')[0].text.rstrip(':')
        EXIF[key] = ele.xpath('td')[1].text
    return EXIF
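# The fetch-with-backoff loop above reappears verbatim in most helpers
# below. A minimal sketch of how it could be factored into one shared
# wrapper (`walkon_with_retry` is a hypothetical name, not part of the
# original module):
def walkon_with_retry(info_container, records_container,
                      max_mult=30, base_delay=10):
    # Retry nvsoli.walkon + auto_redireced, sleeping base_delay * sleep_cnt
    # seconds between attempts, with the multiplier capped at max_mult.
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, max_mult)
        try:
            info_container = nvsoli.walkon(
                info_container, records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(base_delay * sleep_cnt)
        else:
            return info_container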
def get_country_urls(locs_url,
                     countries_xpath='//div[@id="content"]/div/div/a[@href]'):
    info_container, records_container = taxonomy_init()
    info_container['url'] = locs_url
    # Same linear-backoff fetch as get_EXIF.
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    eles = root.xpath(countries_xpath)
    country_urls = []
    country_names = []
    for ele in eles:
        country_urls.append(nudipix_base_url + ele.attrib['href'])
        country_names.append(ele.text)
    return (country_urls, country_names)
def get_page(ckstr, url, base=BASE):
    ic, rc = nvsoli.keepalive_init(base)
    ic['url'] = url
    ic['req_head']['Cookie'] = ckstr
    ic['req_head']['Referer'] = base
    ic = nvsoli.walkon(ic, records_container=rc)
    nvsoli.shutdown(ic)
    return ic
def req(url, ckstr, base=BASE, **kwargs):
    ic, rc = nvsoli.keepalive_init(base)
    ic['url'] = url
    ic['req_head']['Cookie'] = ckstr
    # Allow an explicit Referer override; fall back to the base URL.
    ic['req_head']['Referer'] = kwargs.get('referer', base)
    ic = nvsoli.walkon(ic, records_container=rc)
    nvsoli.shutdown(ic)
    return ic
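# A usage sketch for the session helpers above, assuming BASE and PASSWD
# are module-level constants configured elsewhere (the '/account' path is
# only an illustrative placeholder):
#
#     ckstr = login()
#     ic = req(BASE + '/account', ckstr, referer=BASE)
#     html = ic['resp_body_bytes'].decode('utf-8')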
def fishbase_init(base_url='http://www.fishbase.us/'):
    info_container = nvsoli.new_info_container()
    info_container['base_url'] = base_url
    info_container['url'] = base_url
    info_container['method'] = 'GET'
    req_head_str = ('Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\r\n'
                    'User-Agent: Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36\r\n'
                    'Accept-Encoding: gzip,deflate,sdch\r\n'
                    'Accept-Language: en;q=1.0, zh-CN;q=0.8')
    info_container['req_head'] = nvhead.build_headers_dict_from_str(
        req_head_str, '\r\n')
    info_container['req_head']['Connection'] = 'close'
    # Init the records_container and perform the first fetch.
    records_container = nvsoli.new_records_container()
    info_container = nvsoli.walkon(info_container,
                                   records_container=records_container)
    info_container = nvsoli.auto_redireced(info_container, records_container)
    return (info_container, records_container)
def search_via_country(c_code, info_container, records_container):
    # POST the country-search form, then follow the redirect as GET.
    req_body = gen_CI_post_body(Country=c_code)
    ciurl = info_container['base_url'] + 'country/CountrySearchList.php'
    info_container['req_head']['Referer'] = info_container['base_url']
    # Header values must be strings (the original assigned the int 1).
    info_container['req_head']['Upgrade-Insecure-Requests'] = '1'
    info_container['req_head']['Content-Type'] = 'application/x-www-form-urlencoded'
    info_container['url'] = ciurl
    info_container['method'] = 'POST'
    info_container['req_body'] = req_body
    info_container = nvsoli.walkon(info_container,
                                   records_container=records_container)
    info_container['method'] = 'GET'
    info_container = nvsoli.auto_redireced(info_container, records_container)
    html_text = info_container['resp_body_bytes'].decode('utf-8')
    root = etree.HTML(html_text)
    return root
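# Usage sketch: pair fishbase_init with search_via_country. The country
# code values come from the FishBase country-search form; 156 here is
# only an illustrative placeholder:
#
#     ic, rc = fishbase_init()
#     root = search_via_country(156, ic, rc)
#     rows = root.xpath("//tr[@class='t_value1']")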
def get_nav_urls(loc_url, nav_xpath='//p[@class="nav"]/a[@href]'):
    info_container, records_container = taxonomy_init()
    info_container['url'] = loc_url
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    eles = root.xpath(nav_xpath)
    if len(eles) == 0:
        nav_urls = []
    else:
        # The second-to-last pager link holds the maximum page number.
        max_page = int(eles[-2].text)
        nav_urls = [loc_url]
        tem = os.path.dirname(eles[-2].attrib['href'].rstrip('/'))
        for i in range(2, max_page + 1):
            nav_urls.append(nudipix_base_url + tem + '/' + str(i))
    return nav_urls
def get_locsp_urls(nav_url,
                   locsp_xpath='//div[@class="thumbnail"]/div/a[@href]'):
    # (parameter renamed from the misspelled `locsp_xpah`)
    info_container, records_container = taxonomy_init()
    info_container['url'] = nav_url
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    eles = root.xpath(locsp_xpath)
    locsp_urls = []
    for ele in eles:
        url = nudipix_base_url + ele.attrib['href']
        if 'location' in url:
            locsp_urls.append(url)
    return locsp_urls
def get_location_urls(
        country_url,
        locations_xpath='//ul[@class="country_dive_site_list"]/li/a[@href]'):
    info_container, records_container = taxonomy_init()
    info_container['url'] = country_url
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    eles = root.xpath(locations_xpath)
    location_urls = []
    location_names = []
    # Keep names inside the filter so the two lists stay aligned.
    for ele in eles:
        url = nudipix_base_url + ele.attrib['href']
        if 'location' in url:
            location_urls.append(url)
            location_names.append(ele.text)
    return (location_urls, location_names)
def get_country_infos(c_code, country_island_dict, info_container,
                      records_container, **kwargs):
    display = int(kwargs['display']) if ('display' in kwargs) else 0
    newdb = kwargs['new_database'] if ('new_database' in kwargs) else 0
    # Run the country search, then re-query the species list as GET.
    root = search_via_country(c_code, info_container, records_container)
    tables = get_all_tables(root, info_container['base_url'])
    qurl = get_query_url(info_container, kwargs)
    info_container['url'] = qurl
    info_container['method'] = 'GET'
    info_container['req_body'] = None
    info_container = nvsoli.walkon(info_container,
                                   records_container=records_container)
    html_text = info_container['resp_body_bytes'].decode('utf-8')
    root = etree.HTML(html_text)
    # Table headers become the keys of the per-fish record template.
    eles = root.xpath('//thead/tr/th')
    fish = {}
    for ele in eles:
        fish[ele.text] = None
    url_dict = nvurl.url_to_dict(info_container['url'])
    qd = nvurl.urldecode(url_dict['query'])
    all_country_eles = root.xpath("//tr[@class='t_value1']")
    country = country_island_dict[str(c_code)]
    # Output trees: INFOS for metadata, PICS for images, THUMBNAILS for thumbs.
    fn = ('../INFOS/COUNTRYANDISLAND/' + country + '/' + qd['cpresence'] +
          '/' + qd['vhabitat'] + '/')
    if not os.path.exists(fn):
        os.makedirs(fn)
    picfn = ('../PICS/COUNTRYANDISLAND/' + country + '/' + qd['cpresence'] +
             '/' + qd['vhabitat'] + '/')
    if not os.path.exists(picfn):
        os.makedirs(picfn)
    thumbfn = ('../THUMBNAILS/COUNTRYANDISLAND/' + country + '/' +
               qd['cpresence'] + '/' + qd['vhabitat'] + '/')
    if not os.path.exists(thumbfn):
        os.makedirs(thumbfn)
    # Load the cached fishes dict unless a fresh database was requested.
    if bool(newdb):
        fishes = {}
    else:
        fishes_dir = fn + 'fishes.dict'
        print(fishes_dir)
        if os.path.exists(fishes_dir):
            with open(fishes_dir, 'r+') as fd:
                fishes = json.loads(fd.read())
        else:
            fishes = {}
    print('--------------------')
    from xdict.jprint import paint_str
    print(paint_str('===============fishes loads completed======================',
                    single_color='yellow'))
    print(fishes.keys())
    print('----------------')
    # Scrape each species row; get_fish_info returns None for cached species.
    for i in range(0, len(all_country_eles)):
        fish_ele = all_country_eles[i]
        nfish = get_fish_info(fishes, fish_ele, fish, info_container,
                              records_container)
        print(paint_str('===============nfish load completed======================',
                        single_color='green'))
        if nfish:
            print('====handle new nfish========')
            nfish['eles-seq'] = i
            nfish['images-dir'] = picfn
            nfish['info-dir'] = fn
            fishes[nfish['Species']['name']] = nfish
            nfdir = fn + nfish['Species']['name'] + '/'
            if not os.path.exists(nfdir):
                os.makedirs(nfdir)
            nffn = nfdir + 'fish.dict'
            infofn = nfdir + 'fish.info'
            nvft.write_to_file(fn=nffn, content=json.dumps(nfish), op='w+')
            info = get_printed_str(nfish, with_color=0, display=display)
            nvft.write_to_file(fn=infofn, content=info, op='w+')
        else:
            print('===bypass existing fish====')
    print(paint_str('===============all nfishes load completed======================',
                    single_color='yellow'))
    print(fishes.keys())
    print(len(fishes))
    # Persist the aggregate dict and a human-readable lines file.
    dfn = fn + 'fishes.dict'
    if not os.path.exists(dfn):
        nvft.write_to_file(fn=dfn, content=json.dumps(fishes), op='w+')
    ldfn = fn + 'fishes.lines'
    if not os.path.exists(ldfn):
        nvft.write_to_file(fn=ldfn, content='', op='w+')
        for key in fishes:
            nfish = fishes[key]
            nvft.write_to_file(fn=ldfn,
                               content=get_printed_str(nfish, with_color=0,
                                                       display=display),
                               op='a+')
            nvft.write_to_file(fn=ldfn, content='\n', op='a+')
    print("-----get all_photos ready----")
    # Build the flat photo list, reusing the cached pics.array when present.
    apafn = fn + 'pics.array'
    if os.path.exists(apafn):
        with open(apafn, 'r+') as fd:
            all_photos = json.loads(fd.read())
    else:
        all_photos = []
        for name in fishes:
            for photo in fishes[name]['All-Photos']:
                all_photos.append(photo)
    print('all_photos collected')
    print(len(all_photos))
    # Collect the distinct non-None photo types and create their directories.
    types = []
    for each in all_photos:
        ptype = each['type']
        if (ptype is not None) and (ptype not in types):
            types.append(ptype)
    for ptype in types:
        typefn = picfn + ptype
        if not os.path.exists(typefn):
            os.makedirs(typefn)
        typefn = thumbfn + ptype
        if not os.path.exists(typefn):
            os.makedirs(typefn)
    # Attach target paths to every photo record.
    for each in all_photos:
        if each['type'] is None:
            each['img-dir'] = None
            each['thumb-dir'] = None
        else:
            each['img-dir'] = picfn + each['type'] + '/' + each['img-name']
            each['thumb-dir'] = thumbfn + each['type'] + '/' + each['img-name']
    apafn = fn + 'pics.array'
    if not os.path.exists(apafn):
        nvft.write_to_file(fn=apafn, content=json.dumps(all_photos), op='w+')
    lapafn = fn + 'pics.lines'
    if not os.path.exists(lapafn):
        nvft.write_to_file(fn=lapafn, content='', op='w+')
        for each in all_photos:
            nvft.write_to_file(fn=lapafn,
                               content=get_printed_str(each, with_color=0,
                                                       display=display),
                               op='a+')
            nvft.write_to_file(fn=lapafn, content='\n', op='a+')
    print("pics.lines and pics.array ready")
    # Two-way image-name <-> path indexes.
    imagename_dir_dict = {}
    dir_imagename_dict = {}
    for each in all_photos:
        if each['type'] is not None:
            imagename = each['img-name']
            path = each['img-dir']
        else:
            imagename = None
            path = None
        imagename_dir_dict[imagename] = path
        dir_imagename_dict[path] = imagename
    iddfn = fn + 'image_dir.dict'
    didfn = fn + 'dir_image.dict'
    if not os.path.exists(iddfn):
        nvft.write_to_file(fn=iddfn, content=json.dumps(imagename_dir_dict),
                           op='w+')
    liddfn = fn + 'image_dir.lines'
    if not os.path.exists(liddfn):
        nvft.write_to_file(fn=liddfn, content='', op='w+')
        for each in imagename_dir_dict:
            nvft.write_to_file(fn=liddfn,
                               content=get_printed_str(each, with_color=0,
                                                       display=display),
                               op='a+')
            nvft.write_to_file(fn=liddfn, content='\n', op='a+')
    if not os.path.exists(didfn):
        nvft.write_to_file(fn=didfn, content=json.dumps(dir_imagename_dict),
                           op='w+')
    ldidfn = fn + 'dir_image.lines'
    if not os.path.exists(ldidfn):
        nvft.write_to_file(fn=ldidfn,
                           content=get_printed_str(dir_imagename_dict,
                                                   with_color=0,
                                                   display=display),
                           op='w+')
    print("==dir_image.dict and dir_image.lines ready==")
    # Same two-way indexes for the thumbnails.
    thumb_dir_dict = {}
    dir_thumb_dict = {}
    for each in all_photos:
        if each['type'] is not None:
            imagename = each['img-name']
            path = each['thumb-dir']
        else:
            imagename = None
            path = None
        thumb_dir_dict[imagename] = path
        dir_thumb_dict[path] = imagename
    iddfn = fn + 'thumb_dir.dict'
    didfn = fn + 'dir_thumb.dict'
    if not os.path.exists(iddfn):
        nvft.write_to_file(fn=iddfn, content=json.dumps(thumb_dir_dict),
                           op='w+')
    liddfn = fn + 'thumb_dir.lines'
    if not os.path.exists(liddfn):
        nvft.write_to_file(fn=liddfn, content='', op='w+')
        for each in thumb_dir_dict:
            nvft.write_to_file(fn=liddfn,
                               content=get_printed_str(each, with_color=0,
                                                       display=display),
                               op='a+')
            nvft.write_to_file(fn=liddfn, content='\n', op='a+')
    if not os.path.exists(didfn):
        nvft.write_to_file(fn=didfn, content=json.dumps(dir_thumb_dict),
                           op='w+')
    ldidfn = fn + 'dir_thumb.lines'
    if not os.path.exists(ldidfn):
        nvft.write_to_file(fn=ldidfn,
                           content=get_printed_str(dir_thumb_dict,
                                                   with_color=0,
                                                   display=display),
                           op='w+')
    print("===dir_thumb.lines and thumb_dir.dict ready===")
    print("begin download images")
    # Download every internal photo and thumbnail not already on disk.
    for each in all_photos:
        if each['type'] is not None:
            img_dir = each['img-dir']
            img_url = each['img-url']
            thumb_dir = each['thumb-dir']
            thumb_url = each['thumbnail-url']
            if os.path.exists(img_dir):
                print(paint_str("pass_by_pic", single_color="red"))
            else:
                info_container['url'] = img_url
                info_container = nvsoli.walkon(
                    info_container, records_container=records_container)
                info_container = nvsoli.auto_redireced(info_container,
                                                       records_container)
                nvft.write_to_file(fn=img_dir,
                                   content=info_container['resp_body_bytes'],
                                   op='wb+')
                print("downloaded one pic")
            if os.path.exists(thumb_dir):
                print(paint_str("pass_by_thumb", single_color="red"))
            else:
                info_container['url'] = thumb_url
                info_container = nvsoli.walkon(
                    info_container, records_container=records_container)
                info_container = nvsoli.auto_redireced(info_container,
                                                       records_container)
                nvft.write_to_file(fn=thumb_dir,
                                   content=info_container['resp_body_bytes'],
                                   op='wb+')
                print("downloaded one thumb")
        else:
            print("---external pics not downloaded in this version, pass---")
    return (info_container, records_container)
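# Usage sketch for the full country scrape, assuming country_island_dict
# maps country-code strings to country names and that get_query_url reads
# any extra filter kwargs (156 is only an illustrative placeholder):
#
#     ic, rc = fishbase_init()
#     ic, rc = get_country_infos(156, country_island_dict, ic, rc,
#                                display=0, new_database=1)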
def get_fish_info(fishes, ele_fish, fish, info_container, records_container):
    new_fish = copy.deepcopy(fish)
    eles = ele_fish.getchildren()
    new_fish['Family'] = eles[0].text
    new_fish['Species'] = {}
    new_fish['Species']['name'] = eles[1].xpath('i/a')[0].text
    nfs_name = new_fish['Species']['name'].replace(' ', '_')
    print(new_fish['Species']['name'])
    # Species already cached: signal the caller to skip it.
    if new_fish['Species']['name'] in fishes:
        return None
    new_fish['Species']['id'] = int(
        nvurl.urldecode(eles[1].xpath('i/a')[0].get('href'))['id'])
    new_fish['Species']['url'] = (info_container['base_url'] + 'country/' +
                                  eles[1].xpath('i/a')[0].get('href'))
    new_fish['Author'] = eles[2].text
    new_fish['Info'] = eles[3].text
    regex = re.compile('[\r\n\t]+')
    new_fish['Info'] = regex.sub('', new_fish['Info'])
    # '\xa0' is the non-breaking space FishBase pads its cells with.
    new_fish['Occurrence'] = eles[4].text.strip(' \xa0')
    names = eles[5].text.split(',')
    new_fish['Common names'] = []
    for name in names:
        new_fish['Common names'].append(name.strip(' \xa0'))
    new_fish['Abundance'] = eles[6].text.strip(' \xa0')
    new_fish['Max length'] = eles[7].text.strip(' \xa0')
    new_fish['Maturity'] = eles[8].text.strip(' \xa0')
    new_fish['Remark'] = eles[9].text.strip(' \xa0')
    new_fish['Photo'] = {}
    # The photo cell is eles[10], so the row needs at least 11 children
    # (the original `>= 10` guard would raise IndexError on 10-cell rows).
    if len(eles) > 10:
        temp = eles[10].xpath('a')
        if len(temp) > 0:
            new_fish['Photo']['url'] = (info_container['base_url'] +
                                        temp[0].get('href'))
        else:
            new_fish['Photo']['url'] = None
        new_fish['Presenting-Photo'] = {}
        try:
            new_fish['Presenting-Photo']['ID'] = int(
                nvurl.urldecode(eles[10].xpath('a')[0].get('href'))
                ['/photos/ThumbnailsSummary.php?ID'])
            new_fish['Presenting-Photo']['thumbnail-url'] = (
                info_container['base_url'].rstrip('/') +
                eles[10].xpath('a/img')[0].get('src'))
        except Exception:
            new_fish['Presenting-Photo']['ID'] = None
            new_fish['Presenting-Photo']['thumbnail-url'] = None
    else:
        new_fish['Photo']['url'] = None
        new_fish['Presenting-Photo'] = {}
        new_fish['Presenting-Photo']['ID'] = None
        new_fish['Presenting-Photo']['thumbnail-url'] = None
    # Walk the species photo page (when there is one) and collect every photo.
    new_fish['All-Photos'] = []
    info_container['url'] = new_fish['Photo']['url']
    if info_container['url'] is not None:
        info_container = nvsoli.walkon(info_container,
                                       records_container=records_container)
        html_text = info_container['resp_body_bytes'].decode('utf-8', 'ignore')
        root = etree.HTML(html_text)
        eles = root.xpath('//td[(@align) and (@width)]')
        for photo_ele in eles:
            photo = {}
            tooltip = photo_ele.xpath("a[@class='tooltip']")
            if len(tooltip) > 0:
                rel_sum_url = tooltip[0].get('href')
                if rel_sum_url.strip(' \t\r\n') == '#':
                    photo['summary-url'] = '#'
                elif 'http' in rel_sum_url:
                    photo['summary-url'] = rel_sum_url.strip('.').strip('/').strip('.')
                else:
                    photo['summary-url'] = (
                        info_container['base_url'] + 'photos/' +
                        rel_sum_url.strip('.').strip('/').strip('.'))
            else:
                rel_sum_url = None
                photo['summary-url'] = None
            if photo['summary-url'] is None:
                # No tooltip link at all: record an empty photo entry.
                photo['type'] = None
                photo['external-url'] = None
                photo['colaborator-url'] = None
                photo['thumbnail-url'] = None
                photo['img-url'] = None
                photo['photographer'] = None
            elif photo['summary-url'] == '#':
                # Uploaded photo served inline; the urls sit inside the tooltip.
                photo['external-url'] = None
                photo['colaborator-url'] = None
                photo['thumbnail-url'] = (info_container['base_url'] +
                                          'photos/' +
                                          tooltip[0].xpath('img')[0].get('src'))
                photo['img-url'] = tooltip[0].xpath('span/img')[0].get('src')
                photo['type'] = 'uploads'
                text = itertext(tooltip[0].xpath('span')[0])
                regex = re.compile('<.*>')
                photo['photographer'] = regex.sub('', text).strip('\r\n\t ')
                regex = re.compile('.*/(.*)')
                m = regex.search(photo['img-url'])
                img_name = nfs_name + '__' + m.group(1)
                regex = re.compile(r'.*\.(.*)')
                try:
                    img_type = regex.search(m.group(1)).group(1)
                except Exception:
                    img_type = None
                    img_name = None
                    photo['img-url'] = None
                    photo['thumbnail-url'] = None
                photo['img-type'] = img_type
                photo['img-name'] = img_name
            else:
                # Regular summary page: classify the photo, then resolve the
                # collaborator, thumbnail, and full-image urls.
                if 'http' in rel_sum_url:
                    photo['external-url'] = photo['summary-url']
                    photo['type'] = None
                else:
                    photo['external-url'] = None
                    if '/Diseases/' in photo['summary-url']:
                        photo['type'] = 'Diseases'
                    else:
                        photo['type'] = nvurl.urldecode(
                            photo['summary-url'])['what']
                if 'http' in photo_ele.xpath('a[not(@class)]')[0].get('href'):
                    photo['colaborator-url'] = photo_ele.xpath(
                        'a[not(@class)]')[0].get('href')
                else:
                    photo['colaborator-url'] = (
                        info_container['base_url'].strip('/') +
                        photo_ele.xpath('a[not(@class)]')[0].get('href'))
                if 'http' in photo_ele.xpath('a/img')[0].get('src'):
                    photo['thumbnail-url'] = photo_ele.xpath(
                        'a/img')[0].get('src').strip('.')
                else:
                    photo['thumbnail-url'] = (
                        info_container['base_url'].strip('/') +
                        photo_ele.xpath('a/img')[0].get('src').strip('.'))
                if 'http' in photo_ele.xpath('a/span/img')[0].get('src'):
                    photo['img-url'] = photo_ele.xpath(
                        'a/span/img')[0].get('src').strip('.')
                else:
                    photo['img-url'] = (
                        info_container['base_url'].strip('/') +
                        photo_ele.xpath('a/span/img')[0].get('src').strip('.'))
                regex = re.compile('.*/(.*)')
                m = regex.search(photo['img-url'])
                img_name = nfs_name + '__' + m.group(1)
                regex = re.compile(r'.*\.(.*)')
                try:
                    img_type = regex.search(m.group(1)).group(1)
                except Exception:
                    img_type = None
                    img_name = None
                    photo['img-url'] = None
                    photo['thumbnail-url'] = None
                photo['img-type'] = img_type
                photo['img-name'] = img_name
                photo['photographer'] = itertext(photo_ele.xpath('a/span')[0])
                regex = re.compile('[\r\n\t]+')
                photo['photographer'] = regex.sub('', photo['photographer'])
            new_fish['All-Photos'].append(photo)
    return new_fish
def get_img_info(img_url, thumbnail_url, country_abbrev, location,
                 base_url=nudipix_base_url):
    info_container, records_container = taxonomy_init()
    info_container['url'] = img_url
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            break
    img_root = get_etree_root(info_container)
    tbodys = img_root.xpath('//table')
    sp = img_root.xpath('//div/div/h2/a')[0].attrib['href'].rstrip('/')
    sp_name = os.path.basename(sp)
    # The first table holds the photo metadata rows.
    info_raw = tbodys[0].getchildren()
    info = {}
    for row in info_raw:
        key = row.xpath('td')[0].text.rstrip(':')
        if key == 'Camera':
            info[key] = row.xpath('td')[1].text
            EXIF_url = nudipix_base_url + row.xpath('td/span/a')[0].attrib['href']
            info['EXIF'] = get_EXIF(EXIF_url)
        elif key in ('Taken on', 'Viewed', 'Posted', 'Updated'):
            info[key] = row.xpath('td')[1].text
        else:
            info[key] = row.xpath('td/a')[0].text
    kpcofgs = get_KPCOFGS(tbodys, rsltin='dict')
    info['kpcofgs'] = kpcofgs
    img_real_url = nudipix_base_url + img_root.xpath('//div/img')[0].attrib['src']
    try:
        img_verifier = img_root.xpath('//div/img')[1].attrib['title']
    except Exception:
        img_verifier = ''
    # Name files by species plus a SHA-1 of the image URL to avoid collisions.
    sha1 = hashlib.sha1(img_real_url.encode('utf-8')).hexdigest()
    img_suffix = os.path.basename(img_real_url).split('.')[-1]
    img_name = sp_name + '_' + sha1 + '.' + img_suffix
    thumbnail_suffix = os.path.basename(thumbnail_url).split('.')[-1]
    thumbnail_name = sp_name + '_' + sha1 + '.thumbnail.' + thumbnail_suffix
    info_name = sp_name + '_' + sha1 + '.dict'
    info['img_url'] = img_real_url
    info['verifier'] = img_verifier
    info['img_name'] = images_dir + img_name
    info['index'] = sha1
    info['thumbnail_url'] = thumbnail_url
    info['thumbnail_name'] = thumbs_dir + thumbnail_name
    info['info_name'] = infos_dir + info_name
    info['country'] = country_abbrev
    info['location'] = location
    try:
        info['seq'] = int(os.path.basename(img_real_url).split('.')[0])
    except Exception:
        info['seq'] = -1
    return info
def get_img_urls(locsp_url,
                 img_xpath='//div[@class="thumbnail"]/div/a[@href]'):
    info_container, records_container = taxonomy_init()
    info_container['url'] = locsp_url
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    eles = root.xpath(img_xpath)
    img_urls = []
    thumbnail_urls = []
    for ele in eles:
        url = nudipix_base_url + ele.attrib['href']
        if ('photo' in url) and ('photographer' not in url):
            img_urls.append(url)
            thumbnail_urls.append(nudipix_base_url +
                                  ele.xpath('img')[0].attrib['src'])
    # Walk the pager: the second-to-last nav link holds the max page number.
    nav_xpath = '//p[@class="nav"]/a[@href]'
    eles = root.xpath(nav_xpath)
    if len(eles) > 0:
        max_page = int(eles[-2].text)
        tem = os.path.dirname(eles[-2].attrib['href'].rstrip('/'))
        for i in range(2, max_page + 1):
            nav_url = nudipix_base_url + tem + '/' + str(i)
            info_container, records_container = taxonomy_init()
            info_container['url'] = nav_url
            sleep_cnt = 0
            while True:
                sleep_cnt = min(sleep_cnt + 1, 30)
                try:
                    info_container = nvsoli.walkon(
                        info_container, records_container=records_container)
                    info_container = nvsoli.auto_redireced(
                        info_container, records_container)
                except Exception:
                    time.sleep(10 * sleep_cnt)
                else:
                    break
            root = get_etree_root(info_container)
            page_eles = root.xpath(img_xpath)
            for ele in page_eles:
                url = nudipix_base_url + ele.attrib['href']
                if ('photo' in url) and ('photographer' not in url):
                    img_urls.append(url)
                    thumbnail_urls.append(nudipix_base_url +
                                          ele.xpath('img')[0].attrib['src'])
    return (img_urls, thumbnail_urls)
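# How the nudipixel helpers above compose into one crawl pass; a sketch
# under the assumption that the caller supplies the country page URL and
# its abbreviation (`crawl_location_photos` is a hypothetical driver, not
# part of the original module):
def crawl_location_photos(country_url, country_abbrev):
    location_urls, location_names = get_location_urls(country_url)
    for loc_url, loc_name in zip(location_urls, location_names):
        # Each location may be paginated; get_nav_urls returns [] when
        # there is no pager, so fall back to the location page itself.
        for nav_url in get_nav_urls(loc_url) or [loc_url]:
            for locsp_url in get_locsp_urls(nav_url):
                img_urls, thumb_urls = get_img_urls(locsp_url)
                for img_url, thumb_url in zip(img_urls, thumb_urls):
                    info = get_img_info(img_url, thumb_url,
                                        country_abbrev, loc_name)
                    nvft.write_to_file(fn=info['info_name'],
                                       content=json.dumps(info), op='w+')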
# Persist the image metadata, then fetch the full-resolution image with
# the same linear-backoff retry loop used by the helpers above.
nvft.write_to_file(fn=info['info_name'], content=json.dumps(info), op='w+')
info_container, records_container = taxonomy_init()
info_container['url'] = info['img_url']
sleep_cnt = 0
while True:
    sleep_cnt = min(sleep_cnt + 1, 30)
    try:
        info_container = nvsoli.walkon(info_container,
                                       records_container=records_container)
        info_container = nvsoli.auto_redireced(info_container,
                                               records_container)
    except Exception:
        time.sleep(10 * sleep_cnt)
    else:
        break
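# The block above reads like the body of a per-image download step; a
# hypothetical wrapper (name and final write assumed, not in the original)
# that pairs it with the retry sketch defined near get_EXIF:
def download_img(info):
    # Save the metadata dict next to the image it describes.
    nvft.write_to_file(fn=info['info_name'], content=json.dumps(info), op='w+')
    ic, rc = taxonomy_init()
    ic['url'] = info['img_url']
    ic = walkon_with_retry(ic, rc)
    # Write the image bytes to the path chosen by get_img_info.
    nvft.write_to_file(fn=info['img_name'],
                       content=ic['resp_body_bytes'], op='wb+')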
def get_species(root):
    # Note: relies on module-level info_container / records_container
    # (and ryan_base_url, photosdir) globals.
    # Species links appear under several different inline-markup nestings.
    eles_sps = root.xpath(
        '//tr/td/span/a | //tr/td/em/strong/a | //tr/td/a'
        ' | //tr/td/strong/a | //tr/td/em/a | //tr/td/strong/em/a')
    new_eles_sps = []
    for ele in eles_sps:
        href = ele.attrib['href']
        if ('#' in href) or ('strombidae' in href) or ('images' in href):
            new_eles_sps.append(ele)
    del new_eles_sps[-1]
    # Common-name cells sit in the <td> following each species link's <td>.
    ele_cnames = []
    for ele in new_eles_sps:
        td_parent = ele.getparent()
        while td_parent.tag != 'td':
            td_parent = td_parent.getparent()
        ele_cnames.append(td_parent.getnext())
    urls = [ryan_base_url + ele.attrib['href'] for ele in new_eles_sps]
    dir_names = [ele.attrib['href'].replace('.htm', '').replace('#', ' ')
                 for ele in new_eles_sps]
    # Deduplicate page URLs by dropping fragment identifiers.
    new_urls_set = set()
    for url in urls:
        new_urls_set.add(url.split('#')[0])
    # Collect every image URL from each species page.
    image_urls = []
    for url in new_urls_set:
        info_container['url'] = url
        info_container = nvsoli.walkon(info_container,
                                       records_container=records_container)
        root = get_etree_root(info_container)
        eles = root.xpath('//tr/td/div/img')
        for ele in eles:
            image_urls.append(
                (ryan_base_url + ele.attrib['src']).replace(' ', '%20'))
    # Download each image under a collision-free name (species plus a SHA-1
    # of the URL) and keep a two-way name <-> URL index.
    mirror_indexes = {}
    image_names = []
    info_names = []
    infos = []
    for i in range(0, len(image_urls)):
        suffix = image_urls[i].split('.')[-1]
        arr = os.path.basename(image_urls[i]).split('%20')
        name = arr[0] + ' ' + arr[1].rstrip(',').rstrip('.').rstrip(' ') + '_'
        name = name + hashlib.sha1(image_urls[i].encode('utf-8')).hexdigest()
        name = name + '.' + suffix
        image_names.append(name)
        info_names.append(name + '.info')
        info = {'origin': image_urls[i], 'path': '', 'details': {}}
        infos.append(info)
        mirror_indexes[name] = image_urls[i]
        mirror_indexes[image_urls[i]] = name
        info_container['url'] = image_urls[i]
        info_container = nvsoli.walkon(info_container,
                                       records_container=records_container)
        nvft.write_to_file(fn=photosdir + '/' + image_names[i], op='wb',
                           content=info_container['resp_body_bytes'])
    nvft.write_to_file(fn=photosdir + '/' + 'indexes.dict', op='w',
                       content=json.dumps(mirror_indexes))
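# Comment-only driver sketch for get_species, assuming taxonomy_init (or a
# similar initializer) sets up the module-level containers it reads:
#
#     info_container, records_container = taxonomy_init()
#     info_container['url'] = ryan_base_url
#     info_container = nvsoli.walkon(info_container,
#                                    records_container=records_container)
#     get_species(get_etree_root(info_container))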