Ejemplo n.º 1
0
def verify_id(url):
    """Check that the link derived from `url` serves the expected content.

    `get_link_and_type(url)` supplies a ``(link_type, link)`` pair. Two link
    types are probed with an HTTP HEAD request:

    * ``'wildcard'``: ``link[0]`` is a URL template and every ``(*)`` in it
      is replaced by ``str(link[1])``; the response must advertise an
      ``image`` content type.
    * ``'pdf'``: ``link`` is the URL itself; the response must advertise a
      ``pdf`` content type.

    Returns:
        0  -- the HEAD request answered 200 with the expected content type.
        1  -- unknown link type, wildcard template without ``(*)``, failed
              wildcard request, 404 response, or wrong/missing content type.
        10 -- any status other than 200/404, or a failed pdf request.

    NOTE(review): the meaning callers attach to 1 vs. 10 is not visible in
    this block; the two branches return different codes when the request
    itself fails, which is preserved as found -- confirm it is intentional.
    """
    link_type, link = get_link_and_type(url)

    if link_type == 'wildcard':
        if '(*)' not in link[0]:
            return 1
        try:
            # Literal replacement: the substituted value must not be read as
            # a regex replacement template (backslashes, group references).
            target = link[0].replace('(*)', str(link[1]))
            response = requests.head(target)
        except Exception:
            # Best-effort probe: any failure counts as "not verified", but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return 1
        expected_type = 'image'
    elif link_type == 'pdf':
        try:
            response = requests.head(link)
        except Exception:
            return 10
        expected_type = 'pdf'
    else:
        return 1

    if response.status_code == 404:
        return 1
    if response.status_code != 200:
        return 10
    # A response without a Content-Type header is treated as a mismatch
    # instead of raising KeyError.
    if expected_type not in response.headers.get('content-type', ''):
        return 1
    return 0
Ejemplo n.º 2
0
def get_pdf_link(tld, soup):
    """Return the URL of the page's PDF, or False if none can be confirmed.

    The candidate URL comes from
    ``OAI_metadata_content("citation_pdf_url", soup)``. When that is an empty
    string, the ``<a>`` tags of `soup` are scanned for hrefs containing
    ``'.pdf'``; the scan only succeeds when every such href is the same URL
    (two different PDF links are treated as ambiguous).

    A candidate starting with ``'/'`` is prefixed with `tld`. The result is
    then probed with an HTTP HEAD request and accepted only if the response
    declares a Content-Length of at least 1000.

    Args:
        tld: Prefix prepended to site-relative URLs (those starting with '/').
        soup: Parsed document offering ``findAll('a')`` whose tags support
            ``.get('href')`` -- presumably a BeautifulSoup tree; confirm
            against the caller.

    Returns:
        The PDF URL string on success, otherwise ``False``.

    Exceptions raised by ``requests.head`` are not caught here.
    """
    # Responses declaring fewer bytes than this are rejected; presumably
    # this filters out stub/error pages -- TODO confirm the threshold.
    min_content_length = 1000

    pdf_url = OAI_metadata_content("citation_pdf_url", soup)
    if pdf_url == "":
        found = None
        for link in soup.findAll('a'):
            # .get() returns None for a missing attribute, which replaces the
            # legacy has_key() check and also skips valueless attributes.
            href = link.get('href')
            if href is None or '.pdf' not in href:
                continue
            if found is not None and href != found:
                # Two distinct PDF links: no way to tell which is the paper.
                return False
            found = href
        if found is None:
            return False
        pdf_url = found

    if pdf_url.startswith('/'):
        pdf_url = tld + pdf_url

    pdf_head = requests.head(pdf_url)
    # Look the header up on the mapping itself (not on .keys()) so the
    # lookup stays case-insensitive regardless of interpreter version.
    content_length = pdf_head.headers.get('content-length')
    if content_length is None:
        return False
    try:
        size = int(content_length)
    except ValueError:
        # A malformed Content-Length cannot confirm a real PDF.
        return False
    if size < min_content_length:
        return False
    return pdf_url