import json
import math

import digimorph


def get_common_data(result):
    """Normalize a raw search result into the fields the templates expect."""
    specimen_url = digimorph.get_specimen_url(result['specimen_url'])

    # Only build slice data when the specimen actually has CT slices.
    slice_data = None
    if result.get('slice_count', 0) > 0:
        slice_urls = digimorph.get_slice_urls(specimen_url,
                                              result.get('slice_count', 100),
                                              result.get('zero_padding', 1))
        slice_data = {
            'has_slices': 'slice_count' in result,
            'slice_urls': json.dumps(slice_urls),
            # The slice shown first is the middle one, not slice 0.
            'first_slice': slice_urls[int(math.floor(len(slice_urls) / 2))]
        }

    return {
        'title': result.get('scientific_name') or result.get('species'),
        'digimorph_url': specimen_url,
        'imageUrl': digimorph.get_preview_url(specimen_url),
        'slice_data': slice_data,
        'classification': [
            result.get('phylum'),
            result.get('class'),
            result.get('order'),
            result.get('family'),
            result.get('genus'),
        ]
    }
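# A minimal usage sketch, not part of the original source: the sample dict
# below is made up, and it assumes the digimorph helpers are importable. It
# only illustrates the shape get_common_data expects and returns.
if __name__ == '__main__':
    from pprint import pprint
    sample_result = {
        'specimen_url': 'Chelodina_oblonga',  # hypothetical specimen key
        'scientific_name': 'Chelodina oblonga',
        'slice_count': 300,
        'zero_padding': 4,
        'genus': 'Chelodina',
    }
    pprint(get_common_data(sample_result))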
urls.extend(val['urls'])  # (fragment) tail of the loop collecting specimen URLs


def get_page(url):
    print "getting " + url
    response = urllib2.urlopen(url)
    return response.read()


species_data = {}
date_re = re.compile('Publication Date:([^<]*)', re.DOTALL)

for u in urls:
    data = {}
    try:
        html = get_page(digimorph.get_specimen_url(u))
        soup = BeautifulSoup(html, 'html.parser')

        # Author and institution
        author_link = soup.select('.author a')[0]
        institution = soup.select('.institution')[0]
        data['author_url'] = author_link.get('href').strip()
        data['author_name'] = author_link.get_text().strip()
        data['institution'] = institution.get_text().strip()

        # Image data
        image_processing = soup.body.find_all(string=re.compile('Image processing'))
        image_processing_links = image_processing[0].parent.find_all('a')
        date_string = image_processing[0].parent.contents[-1].get_text()
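# Illustration only, not from the original file: how date_re pulls the
# publication date out of text like the date_string captured above. The
# sample markup is invented.
# >>> m = date_re.search('Publication Date: 14 Jan 2004</p>')
# >>> m.group(1).strip()
# '14 Jan 2004'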
    # (fragment) tail of a truncated helper; from the names, it appears to
    # step back to the highest slice index that actually exists.
    return max_slice - step


all_species = {}
for line in open('url_map.json'):
    all_species.update(json.loads(line))
pprint(all_species)

urls = []
for val in all_species.values():
    urls.extend(val['urls'])
pprint(urls)

slice_data = {}
for u in urls:
    specimen_url = digimorph.get_specimen_url(u)
    padding = 0
    max_slice = 0
    has_slices = True
    print "Finding images for %s" % specimen_url
    print digimorph.get_slice_url(specimen_url, 1, 3)

    # Slice filenames are zero-padded to either 3 or 4 digits; probe the
    # first slice with each width to find out which one this specimen uses.
    if has_image(digimorph.get_slice_url(specimen_url, 1, 3)):
        padding = 3
        print "Padding 3!"
    elif has_image(digimorph.get_slice_url(specimen_url, 1, 4)):
        padding = 4
        print "Padding 4!"
    else:
        print "Failed to find first image with 3 or 4 padding"
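# `has_image` is called above but isn't defined in this fragment. A plausible
# sketch, assuming it only needs to check that the slice URL resolves: treat
# any successful response as "the image exists". This is a guess at the
# helper's behavior, not the original implementation.
import urllib2


def has_image(url):
    try:
        urllib2.urlopen(url)
        return True
    except (urllib2.HTTPError, urllib2.URLError):
        return False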