def verify_id(url):
    """Check that the link derived from ``url`` resolves to the expected media.

    ``get_link_and_type(url)`` supplies a ``(link_type, link)`` pair:

    * ``'wildcard'`` -- ``link[0]`` is a URL template containing the literal
      placeholder ``(*)`` and ``link[1]`` is the value substituted for it;
      the resulting URL must answer with an ``image`` content type.
    * ``'pdf'`` -- ``link`` is the URL itself and must answer with a ``pdf``
      content type.

    Returns:
        0  -- the HEAD request answered 200 with the expected content type.
        1  -- unknown link type, template without ``(*)``, failure while
              building/requesting a wildcard URL, HTTP 404, or a missing or
              unexpected Content-Type header.
        10 -- failure while requesting a pdf URL, or any HTTP status other
              than 200/404 (presumably "could not verify right now" --
              confirm against the callers).
    """
    (link_type, link) = get_link_and_type(url)

    if link_type == 'wildcard':
        template = link[0]
        if '(*)' not in template:
            return 1
        try:
            # Literal substitution: no regex involved, so the inserted value
            # is never interpreted as a regex replacement escape.
            target = template.replace('(*)', str(link[1]))
            response = requests.head(target)
        except Exception:
            # Deliberately broad (best-effort check), but unlike a bare
            # ``except:`` it lets KeyboardInterrupt/SystemExit through.
            return 1
        expected_kind = 'image'
    elif link_type == 'pdf':
        try:
            response = requests.head(link)
        except Exception:
            # Same best-effort policy; a pdf request failure maps to 10.
            return 10
        expected_kind = 'pdf'
    else:
        return 1

    if response.status_code == 404:
        return 1
    if response.status_code != 200:
        return 10
    # A missing Content-Type header counts as a mismatch instead of raising
    # KeyError.
    if expected_kind not in response.headers.get('content-type', ''):
        return 1
    return 0
def get_pdf_link(tld, soup):
    """Find the single PDF URL for a parsed page and sanity-check its size.

    The URL comes from ``OAI_metadata_content("citation_pdf_url", soup)``
    (presumably the page's ``citation_pdf_url`` metadata -- confirm against
    that helper). When that value is empty, every ``<a>`` element in ``soup``
    is scanned for an ``href`` containing ``.pdf``; the scan succeeds only if
    all such hrefs are identical.

    Args:
        tld: Prefix prepended to the URL when it starts with ``/``.
        soup: Parsed document exposing ``findAll`` (BeautifulSoup-style).

    Returns:
        The PDF URL when a HEAD request reports a Content-Length of at least
        1000; ``False`` when no unique PDF link exists, or when the
        Content-Length header is missing, not an integer, or below 1000.

    Exceptions raised by ``requests.head`` are not caught here and propagate
    to the caller.
    """
    pdf_url = OAI_metadata_content("citation_pdf_url", soup)

    if not pdf_url:
        candidate = None
        for anchor in soup.findAll('a'):
            # ``get`` yields None for anchors without an href, which avoids
            # the deprecated ``has_key`` check.
            href = anchor.get('href')
            if not href or '.pdf' not in href:
                continue
            # Two different PDF targets make the choice ambiguous.
            if candidate is not None and href != candidate:
                return False
            candidate = href
        if candidate is None:
            return False
        pdf_url = candidate

    if pdf_url.startswith('/'):
        pdf_url = tld + pdf_url

    pdf_head = requests.head(pdf_url)
    # Look the header up through the mapping itself: membership on
    # ``headers.keys()`` can be case-sensitive (on Python 2 it is a plain
    # list of the original-cased names), which would miss "Content-Length".
    content_length = pdf_head.headers.get('content-length')
    if content_length is None:
        return False
    try:
        size = int(content_length)
    except ValueError:
        # An unparseable header is treated the same as a missing one.
        return False
    if size < 1000:
        return False
    return pdf_url