Exemple #1
0
def enrich_from_crossref(bib, email):
    """Enrich every entry of ``bib`` with data from its matching Crossref record.

    For each entry in ``bib.entries`` Crossref is queried with the entry's
    title (``query_bibliographic``). The returned items are handed to
    ``get_matching_item``; when that yields an item, the entry and the item
    are passed to ``enrich_entry``. A summary line with the number of matched
    entries is printed at the end.

    Args:
        bib: Object exposing an ``entries`` sequence of mappings that each
            have a ``"title"`` key.
        email: Contact address forwarded to Crossref as ``mailto``; a falsy
            value creates an anonymous client.
    """
    # Bug fix: the ``mailto`` client used to be constructed and immediately
    # discarded, so every request went out through the anonymous client.
    cr = Crossref(mailto=email) if email else Crossref()
    okcount = 0
    print("entries where no exact matching entry could be found on Crossref:")
    for entry in bib.entries:
        res = cr.works(query_bibliographic=entry["title"])
        item = get_matching_item(entry, res['message']['items'])
        if item:
            okcount += 1
            enrich_entry(entry, item)
    print(okcount, "of", len(bib.entries), "had matching titles")
def build_user_graph(graph, users, spinner, cursor):
    """Run ``process_user`` for every element of ``users`` with one shared client.

    Before processing, the module globals are rebound: ``sqlite_cursor`` to
    ``cursor`` and ``vertex_dict`` to a fresh mapping with empty "paper",
    "journal", "subject", "author" and "user" buckets.

    Args:
        graph: Passed through unchanged to ``process_user``.
        users: Sized iterable of users; its length is reported as the total.
        spinner: Object with ``start()`` and ``succeed(text)`` methods.
        cursor: Stored in the module-level ``sqlite_cursor``.
    """
    global crossref_email, vertex_dict, sqlite_cursor

    sqlite_cursor = cursor
    vertex_dict = {
        kind: {} for kind in ("paper", "journal", "subject", "author", "user")
    }

    total = len(users)

    spinner.start()
    client = Crossref(mailto=crossref_email)

    # 1-based position is forwarded together with the total.
    for position, uni in enumerate(users, start=1):
        process_user(graph, uni, client, position, total, spinner)

    spinner.succeed("All users inserted")
Exemple #3
0
def CrossRefAPIfunc(keyPhrase):
    """Return basic metadata of the first Crossref hit for a free-text query.

    Args:
        keyPhrase: Text sent to Crossref as the ``query`` parameter.

    Returns:
        A tuple ``(url, title, authors, date)`` taken from the first returned
        item: its ``"URL"``, the first element of ``"title"``, an author
        string, and the first ``"indexed"`` date-parts entry. ``authors`` is
        ``None`` when the item has no ``"author"`` key. All four values are
        ``None`` when Crossref reports zero total results.
    """
    cr = Crossref()
    x = cr.works(query=keyPhrase)

    if x["message"]["total-results"] <= 0:
        # Bug fix: this branch used to assign ``author`` (singular) and then
        # return ``authors``, which raised UnboundLocalError on empty results.
        return (None, None, None, None)

    x = x["message"]["items"][0]
    date = x["indexed"]["date-parts"][0]
    title = x["title"][0]

    if "author" in x:
        # NOTE(review): given and family name are joined without a separator;
        # kept as-is because callers may depend on the exact string.
        authors = x["author"][0]["given"] + x["author"][0]["family"]
        if len(x["author"]) > 1:
            authors += " et al."
    else:
        authors = None
    url = x["URL"]
    return (url, title, authors, date)
Exemple #4
0
    def __init__(self, doi):
        """Fetch the Crossref record for ``doi`` and build ``self.metadata``.

        When the lookup succeeds, ``self.metadata`` is a dict containing the
        DOI, every name/value pair of the record's ``assertion`` list, the
        page range (``-`` replaced by ``--``), the volume, the authors joined
        with `` and ``, the date parsed from the first license's start
        timestamp, and an ``article_name`` of the form
        ``<first author family name>_<year>``.

        When the lookup fails (or returns an empty message) ``self.metadata``
        is NOT assigned.

        Args:
            doi: DOI passed positionally to ``Crossref.works``.

        Raises:
            KeyError: If the record lacks one of the fields read below
                (``assertion``, ``page``, ``volume``, ``author``, ``license``).
        """
        cr = Crossref()
        try:
            message = cr.works(doi)['message']
        except Exception:
            # Best-effort lookup. Narrowed from a bare ``except`` so that
            # KeyboardInterrupt and SystemExit are no longer swallowed.
            message = None

        if message:
            metadata = {}

            metadata['doi'] = doi

            journal_info = {
                x['name']: x['value']
                for x in message['assertion']
            }
            metadata.update(journal_info)

            metadata['page'] = message['page'].replace('-', '--')
            metadata['volume'] = message['volume']
            metadata['author'] = ' and '.join(
                [f"{x['given']} {x['family']}" for x in message['author']])

            # The date is taken from the start timestamp of the first license.
            ts = message['license'][0]['start']['date-time']
            metadata['date'] = datetime.strptime(ts,
                                                 '%Y-%m-%dT%H:%M:%SZ').date()

            metadata[
                'article_name'] = f"{message['author'][0]['family']}_{metadata['date'].year}"

            self.metadata = metadata
Exemple #5
0
def names_from_xref(doi):
    """
    Get the names of the first and last authors for a given DOI.

    Inputs
    ------
    doi : string
        The DOI of the paper whose first and last author names you want to know. Here, it's usually a citing paper.

    Outputs
    -------
    first_author : string
        ``get_name_from_author_dict`` applied to the first author of the paper.

    last_author : string
        ``get_name_from_author_dict`` applied to the last author of the paper.

    Both values are empty strings when Crossref returns no result for the DOI
    or the returned item carries no ``"author"`` field.
    """
    cr = Crossref()
    # Bug fix: these were only assigned inside the "results found" branch, so
    # an unknown DOI raised UnboundLocalError at the return statement.
    first_author = ""
    last_author = ""
    works = cr.works(
        query="", select=["DOI", "author"], limit=1, filter={"doi": doi}
    )
    if works["message"]["total-results"] > 0:
        item = works["message"]["items"][0]
        if "author" in item:
            first_author = get_name_from_author_dict(item["author"][0])
            last_author = get_name_from_author_dict(item["author"][-1])
    return first_author, last_author
Exemple #6
0
def lookup_data(
    doi: str = None,
    in_wikipedia: bool = False,
):  # -> Dict[str, str]:
    """Look up ``doi`` on Crossref, extract its data and print it.

    Prints an error and stops when ``doi`` is None. Records whose Crossref
    ``type`` is ``"book"`` are skipped. Every path visible here returns None.
    """
    # https://www.crossref.org/education/retrieve-metadata/rest-api/
    # async client here https://github.com/izihawa/aiocrossref but only 1 contributor
    # https://github.com/sckott/habanero >6 contributors not async
    if doi is None:
        print("Error. Got None instead of DOI. Report this error please.")
        return None

    print("Looking up from Crossref")
    client = Crossref()
    response = client.works(ids=doi)
    message = response["message"]

    if message["type"] == "book":
        print("Book detected, we exclude those for now.")
        return None

    data = extract_data(message, in_wikipedia)
    print(data)
    has_publisher = data.get("publisher")
    if has_publisher and data.get("publisher_location"):
        # TODO look up publisher via sparqldataframe
        print("Found both publisher and location")
Exemple #7
0
def fetch_doi_from_crossref(item):
    """Find the DOI of a paper by querying Crossref with its title and authors.

    The query is the quoted title followed by ``flatten(item["authors"])``;
    if building that raises TypeError, the quoted title alone is used. The
    request is retried every 5 seconds until it succeeds.

    Args:
        item: Mapping with a ``"title"`` and an ``"authors"`` entry.

    Returns:
        Tuple ``(doi, title)`` of the first returned item. ``title`` is the
        string ``'None'`` when a key is missing on the way to the title.

    Raises:
        IndexError: If Crossref returns no items at all.
    """
    cr = Crossref()

    try:
        query = '"' + item["title"] + '"' + " " + flatten(item["authors"])
    except TypeError:
        # No author information available (probably)
        query = '"' + item["title"] + '"'

    print(STD_INFO + query)
    while True:
        try:
            query_result = cr.works(query=query, limit=3)
            break
        except Exception:
            # e.g. HTTPError (Service Unavailable). Narrowed from a bare
            # ``except`` so Ctrl-C can still abort this unbounded retry loop.
            print(STD_WARNING +
                  "CrossRef server unavailable. Retry in 5 seconds")
            time.sleep(5)

    try:
        title = query_result['message']['items'][0]['title'][0]
    except KeyError:
        title = 'None'

    doi = query_result['message']['items'][0]['DOI']
    return doi, title
Exemple #8
0
def crossref_api(login_data, sql_dataframe, e_mail, chunk_size):
    """Query Crossref for unmatched reference strings and store the results.

    Rows of ``sql_dataframe`` whose ``match_id`` is ``"not_match"`` or
    ``"error"`` are processed in chunks of ``chunk_size`` rows. For each row
    the ``ref_text`` column is sent to Crossref (one result requested) and a
    pair ``[first column value, returned items]`` is collected; on failure
    the items are replaced by NaN and ``"error<index>"`` is printed. After
    each chunk the collected pairs are passed to ``crossref_to_db``.

    Side effect: sets ``pd.options.mode.chained_assignment`` to None.

    Args:
        login_data: Passed through to ``crossref_to_db``.
        sql_dataframe: DataFrame with at least ``match_id`` and ``ref_text``.
        e_mail: Contact address forwarded to Crossref as ``mailto``.
        chunk_size: Positive number of rows per ``crossref_to_db`` call.

    Raises:
        ValueError: If ``chunk_size`` is 0 (the previous loop never ended).
    """
    pd.options.mode.chained_assignment = None
    match_info = sql_dataframe
    sampledata = match_info[(match_info["match_id"] == "not_match") |
                            (match_info["match_id"] == "error")]
    sampledata["crossref"] = np.nan
    cr = Crossref(mailto=e_mail)

    for start in range(0, len(sampledata), chunk_size):
        dict_cross = []

        for index, row in sampledata[start:start + chunk_size].iterrows():
            # ``row`` is the record that ``sampledata.ix[index]`` used to
            # fetch; ``.ix`` no longer exists in current pandas.
            reftext = row["ref_text"]
            # Explicit positional access replaces the deprecated ``row[0]``.
            record_id = row.iloc[0]
            try:
                x = cr.works(
                    query=reftext,
                    limit=1,
                    select=
                    "DOI,title,issued,short-container-title,ISSN,score,URL,title,page,publisher,container-title,DOI,author,volume,issued"
                )
                items = x["message"]["items"]
            except Exception:
                # Keep going on per-row failures; narrowed from a bare
                # ``except`` so KeyboardInterrupt can stop a long run.
                items = np.nan
                print("error" + str(index))
            dict_cross.append([record_id, items])
        crossref_to_db(login_data, dict_cross)
Exemple #9
0
def get_papers(issn=ISSN,
               offset=0,
               per_page=PER_PAGE,
               username=HABANERO_USERNAME):
    """Return one page of Crossref works filtered by ISSN.

    Args:
        issn: Value for the ``issn`` filter.
        offset: Forwarded as the ``offset`` of the request.
        per_page: Forwarded as the ``limit`` of the request.
        username: Forwarded to the client as ``mailto``.

    Returns:
        Whatever ``Crossref.works`` returns for that request.
    """
    client = Crossref(mailto=username)
    issn_filter = {"issn": issn}
    # get a first set of papers
    return client.works(filter=issn_filter, offset=offset, limit=per_page)
Exemple #10
0
def pre_save_article(sender, instance, **kwargs):
    """Fill an article's fields from its Crossref record.

    The signature ``(sender, instance, **kwargs)`` looks like a signal
    receiver; how it is connected is not visible here. ``instance.DOI`` is
    stripped of surrounding whitespace, then title, description, keywords
    and article URL are derived from the Crossref response through the
    ``get_*`` helpers.

    Args:
        sender: Unused.
        instance: Object with a string ``DOI`` attribute; updated in place.
        **kwargs: Unused.
    """
    # Bug fix: normalise the DOI *before* the lookup. Previously Crossref was
    # queried with the un-stripped value and the strip happened afterwards.
    instance.DOI = instance.DOI.strip()
    cr = Crossref()
    article_meta = cr.works(ids=instance.DOI)
    instance.title = get_title(article_meta)
    instance.description = get_description(article_meta)
    instance.keywords = get_keywords(article_meta)
    instance.article_url = get_url(article_meta)
def doi_valid(value):
    """Validate that ``value`` resolves to a Crossref record with a title.

    Raises:
        ValidationError: If the lookup or the access to the record's title
            fails for any reason; the original error text is embedded.
    """
    try:
        # The mailto is necessary to end up in the polite pool.
        client = Crossref(mailto="*****@*****.**")
        record = client.works(ids=value)
        # Only the presence of the title matters; the value is not used.
        record['message']['title']
    except Exception as err:
        raise ValidationError(f"Invalid DOI: {err}")
Exemple #12
0
    def __init__(self):
        """Set the trial count and create the SciHub and Crossref clients."""
        # Constant; how it is consumed is outside this block.
        self.TRIALS = 3

        # Service clients, created once per instance.
        self.sh = SciHub()
        self.cr = Crossref()
Exemple #13
0
def get_doi(dois, config):
    """Retrieve one or more articles by DOI.

    Args:
        dois: A single DOI string or a list of DOIs, forwarded as ``ids``.
        config: Mapping whose ``['settings']['email']`` is sent as ``mailto``.

    Returns:
        The result of ``DB_dict.parse_cr`` applied to the list of Crossref
        ``message`` payloads, one per DOI.
    """
    crossref = Crossref(mailto=config['settings']['email'])
    cr_result = crossref.works(ids=dois)
    # habanero returns a bare response dict for a single id and a list of
    # responses for several. Branching on the response shape (instead of
    # ``len(dois) == 1``) also handles a single DOI passed as a plain string.
    if isinstance(cr_result, dict):
        return DB_dict.parse_cr([cr_result['message']])
    return DB_dict.parse_cr([c['message'] for c in cr_result])
 def __init__(self, rsrcmgr, pageno=1, laparams: LAParams = None):
     """Initialise the aggregator base and this object's own state.

     ``rsrcmgr``, ``pageno`` and ``laparams`` are forwarded unchanged to
     ``PDFPageAggregator.__init__``.
     """
     PDFPageAggregator.__init__(
         self, rsrcmgr, pageno=pageno, laparams=laparams)

     # Own state: starts empty / unset.
     self.rows = []
     self.page_number = 0
     self.doi = None

     # Crossref client with a contact address.
     self.cr = Crossref(mailto='*****@*****.**')
 def __init__(self, doi=None, parms=None):
     """Load either a single Crossref record or a filtered result set.

     With a truthy ``doi`` the response is stored in ``raw_data`` and its
     ``message`` in ``record``. Otherwise, with truthy ``parms``, a filtered
     cursor query (limit 500) is stored in ``raw_data``, ``record`` is None
     and ``record_list`` starts empty. With neither, no attribute is set.
     """
     client = Crossref()
     if doi:
         response = client.works(ids=doi, format="json")
         self.raw_data = response
         self.record = response['message']
         return
     if parms:
         self.raw_data = client.works(
             filter=parms, cursor="*", limit=500, format="json")
         self.record = None
         self.record_list = []
Exemple #16
0
def crossref(doi):
    """Fetch and parse Crossref metadata for one DOI or a collection of DOIs.

    Returns:
        For a list/tuple/set with more than one DOI: a dict mapping each
        parsed record's ``"doi"`` value to the rest of that record.
        Otherwise: the result of ``parse_crossref`` on the single response.
    """
    client = Crossref(mailto="*****@*****.**")
    response = client.works(ids=doi)

    # habanero returns a list if doi is a list of len > 1,
    # otherwise a single dict
    several = isinstance(doi, (list, tuple, set)) and len(doi) > 1
    if not several:
        return parse_crossref(response)

    parsed_records = [parse_crossref(raw) for raw in response]
    by_doi = {}
    for record in parsed_records:
        # The DOI becomes the key and is removed from the stored record.
        key = record.pop("doi")
        by_doi[key] = record
    return by_doi
Exemple #17
0
def get_crossref_metadata(title, path):
    """
    Gets an APA citation from Crossref for an article title and puts it, together
    with the source path, on the clipboard.

    NOTE: this function is written in Python 2 syntax (``print`` statements,
    ``unicode``).

    :param title: Title to search for
    :param path: PDF path; stored under the ``'source'`` key of the second clipboard payload
    """

    print "getting crossref"

    # Searches the Crossref API for the given title, asking for a single (best) result
    cr = Crossref()
    query = cr.works(query=title, limit=1)

    # Stays empty when Crossref returns no items
    doi = ''

    # Extract DOI out of Crossref answer (the last item wins; at most one is requested)
    for item in query['message']['items']:
        doi = item['DOI']

    # Not used, but useful. Gets metadata from isbnlib, given DOI
    # print isbnlib.doi2tex(doi)

    # Gets APA citation, given DOI
    apa_citation = cn.content_negotiation(ids=doi, format="text", style="apa")

    # We could get more formats this way, but this is not used at the moment, better performance without getting these formats
    # rdf_citation = cn.content_negotiation(ids=doi, format="rdf-xml")
    # json_citation = cn.content_negotiation(ids=doi, format="citeproc-json")
    # bib_entry = cn.content_negotiation(ids=doi, format="bibentry")

    # Prettify APA citation and drop surrounding newlines
    apa_citation = prettify_UTF8_Strings(apa_citation).strip('\n')
    print apa_citation

    # The clipboard stays open until CloseClipboard() at the end of the function
    clp.OpenClipboard(None)
    citations = {}
    citations['APA'] = apa_citation
    try:
        # Keep whatever text is currently on the clipboard alongside the citation
        citations['content'] = unicode(clp.GetClipboardData(clp.CF_TEXT),
                                       errors='replace')
    except:
        # Any failure to read text falls back to a placeholder
        citations['content'] = 'no text content available'
    # Puts the citations on the clipboard (citation_format is defined outside this function)
    clp.SetClipboardData(citation_format, json.dumps(citations))

    sources = {}
    sources['source'] = path
    try:
        sources['content'] = unicode(clp.GetClipboardData(clp.CF_TEXT),
                                     errors='replace')
    except:
        sources['content'] = 'no text content available'
    # Puts the sources on the clipboard (src_format is defined outside this function)
    clp.SetClipboardData(src_format, json.dumps(sources))
    clp.CloseClipboard()
Exemple #18
0
def title_from_DOI(DOI):
    """Return the first title Crossref lists for ``DOI``, or None.

    None is returned when the request raises ``HTTPError``, when the response
    has no ``"message"``, or when the message has no non-empty ``"title"``.
    """
    client = Crossref(mailto=crossref_email)
    try:
        response = client.works(ids=DOI)
    except HTTPError:
        return None

    if "message" not in response:
        return None

    message = response["message"]
    if "title" in message and message["title"]:
        return message["title"][0]
    return None
def build_network_graph(graph, DOIs):
	"""Call ``process_paper`` for every DOI, sharing one Crossref client.

	The module-level ``spinner`` is started before and stopped after the
	loop; a confirmation line is printed at the end.
	"""
	global spinner

	spinner.start()
	client = Crossref(mailto="*****@*****.**")
	for doi in DOIs:
		process_paper(graph, doi, client)
	spinner.stop()

	print("Network built.")
def build_author_graph(graph, DOIs):
	"""Call ``process_author_paper`` for every DOI, sharing one Crossref client.

	The module-level ``spinner`` is started first and marked as succeeded
	once all DOIs are processed.
	"""
	global vertex_dict
	global spinner

	spinner.start()
	client = Crossref(mailto="*****@*****.**")

	for doi in DOIs:
		process_author_paper(graph, doi, client)

	spinner.succeed("Author network built.")
Exemple #21
0
def update_from_cr(config):
    """Retrieve records from Crossref for the configured authors.

    The keys of ``config['authors']`` are used as the ``orcid`` filter
    values; article type, sort field, order and record limit come from
    ``config['settings']``.

    Returns:
        ``DB_dict.parse_cr`` applied to the returned items.
    """
    settings = config['settings']
    crossref = Crossref(mailto=settings['email'])
    orcid_ids = list(config['authors'])
    record_filter = {
        'orcid': orcid_ids,
        'type': [settings['article_type']],
    }
    cr_results = crossref.works(
        filter=record_filter,
        sort=settings['sort_field'],
        order=settings['order'],
        limit=settings['num_records'],
    )
    return DB_dict.parse_cr(cr_results['message']['items'])
def build_user_graph(graph, users):
	"""Call ``process_user`` for every user, sharing one Crossref client.

	The module-level ``spinner`` is started first; after the loop it is
	marked as succeeded and then stopped.
	"""
	global vertex_dict
	global spinner

	spinner.start()
	client = Crossref(mailto="*****@*****.**")

	for user in users:
		process_user(graph, user, client)

	spinner.succeed("All users inserted")
	spinner.stop()
Exemple #23
0
def make_references(publications, output_dir):
    """
    Write ``references.bib`` for a list of publications.

    A publication whose link contains ``doi.org`` contributes the DOI taken
    from the link's path. Any other publication is searched on Crossref by
    title and is skipped (with a warning) unless the best hit's title equals
    the publication title ignoring case.

    Args:
        publications: the list of publications
        output_dir: the output directory

    Returns:
        A list of reference identifiers
    """
    log = Logger()
    client = Crossref()
    bib_entries = []
    identifiers = []

    for position, publication in enumerate(publications, start=1):
        log.notice(
            f"Querying and formatting {position} out of {len(publications)} publications"
        )
        link, title = publication[LINK], publication[TITLE]

        if not link or "doi.org" not in link:
            # No DOI link available: extract the DOI using the title
            results = client.works(query_bibliographic=title, limit=1)
            message = results["message"]
            exact_hit = (
                message["total-results"] != 0
                and message["items"][0]["title"][0].lower() == title.lower()
            )
            if not exact_hit:
                log.warn(f'Could not find the doi for "{title}"')
                continue
            doi = message["items"][0]["DOI"]
        else:
            # The link is a DOI url: the DOI is its path
            doi = urlparse(link).path.strip("/")

        try:
            reference = cn.content_negotiation(doi)
            bib_entries.append(reference)
            # The identifier is what follows "@type{" on the entry's first line
            first_line = reference.split("\n")[0]
            identifiers.append(re.sub("^@.*{", "", first_line).strip(","))
        except HTTPError:
            log.warn(f'Could not Create reference for "{title}"')

    bib_path = os.path.join(output_dir, "references.bib")
    with open(bib_path, "w") as f:
        f.write("\n\n".join(bib_entries))

    return identifiers
Exemple #24
0
def get_crossref_results(query, index=10):
    """Return the best-scoring journal articles Crossref finds for ``query``.

    Args:
        query: Bibliographic text sent as ``query_bibliographic``.
        index: Maximum number of items to return. The request itself asks
            for 10 items, so values above 10 have no further effect.

    Returns:
        A list with at most ``index`` items, sorted by descending score.
    """
    cr = Crossref()
    results = cr.works(
        query_bibliographic=query,
        # Bug fix: habanero's keyword is ``filter``. The previous ``filters=``
        # is not a parameter of ``works``, so the journal-article
        # restriction never reached the ``filter`` argument.
        filter={"type": "journal-article"},
        limit=10,
        sort="score",
        order="desc",
    )
    return results["message"]["items"][:index]
Exemple #25
0
 def __init__(self, email=""):
     """Set up API endpoints, the DOI pattern and an empty metadata record.

     ``email`` is stored as ``mailto`` and passed to the Crossref client.
     """
     self.mailto = email
     self.arXivApi = "http://export.arxiv.org/api/query?id_list="
     self.crossrefApi = Crossref(mailto=self.mailto)
     # Pattern string for DOI-like tokens (starts with "10." and a slash part).
     self.regString = r"\b(10\.[0-9]{4,}(?:\.[0-9]+)*\/(?:(?![\"&\'])\S)+)\b"
     # Every metadata field starts out as an empty string.
     self.metadata = dict.fromkeys(
         ("doi", "url", "year", "journal", "author", "title", "abstract"),
         "",
     )
     self.page0_text = ""
Exemple #26
0
    def __init__(self, pmidsListOrCSVfile, email):
        """
        Constructor. Stores the PMIDs and registers the contact email.

        :param pmidsListOrCSVfile: [list, set or string] a list (or a set) of PMIDs, or the name of a CSV file containing the list of PMIDs under the header 'PMID'.
        :param email: [string] an email address for Entrez and Crossref, used if they need to contact you (e.g. to send warnings about download limits).

        Usage::
        from pubMedArticleGrabber import PubMedArticleGrabber
        wanted = PubMedArticleGrabber('name of csv file', '*****@*****.**')
        others = PubMedArticleGrabber(['1047458', '1050021'], '*****@*****.**')
        someOthers = PubMedArticleGrabber({'1047458', '1047458', '1050021'}, '*****@*****.**')
        """
        self.pmids = pmidsListOrCSVfile
        # Entrez takes the contact address as a module-level setting.
        Entrez.email = email
        # NOTE(review): this client is bound to a local name only and is not
        # stored on the instance within the lines visible here.
        cr = Crossref(mailto=email)
def build_combined_graph(graph, DOIs):
	"""Insert all papers, then build citation edges, with one Crossref client.

	First ``combined_process_paper`` is called for every DOI in ``DOIs``;
	afterwards ``process_citations`` is called for every key of the
	module-level ``vertex_dict["paper"]``. Progress is reported through the
	module-level ``spinner``.
	"""
	global vertex_dict
	global spinner

	spinner.start()
	client = Crossref(mailto="*****@*****.**")

	# Pass 1: papers
	for doi in DOIs:
		combined_process_paper(graph, doi, client)
	spinner.succeed("All papers inserted")

	# Pass 2: citation edges between the papers registered in vertex_dict
	spinner.start("Building citation edges...")
	for paper_doi in vertex_dict["paper"]:
		process_citations(graph, paper_doi, client)

	spinner.stop()
Exemple #28
0
def async_post_save_article_info(self, doi):
    """Fill in abstract and image URL of the article with the given DOI.

    Nothing happens when the article already has an image URL or an
    abstract. Otherwise both are derived from the Crossref record, an image
    URL that fails ``validators.url`` is replaced by an empty string, and
    the article is saved.
    """
    # Imported here rather than at module level, as in the original.
    from .models import Article
    from .signals import get_abstract, get_image_url

    article = Article.objects.get(DOI=doi)

    if article.image_url or article.abstract:
        return

    client = Crossref()
    article_meta = client.works(ids=article.DOI)
    article.abstract = get_abstract(article_meta)
    article.image_url = get_image_url(article_meta)

    if not validators.url(article.image_url):
        article.image_url = str()

    article.save()
Exemple #29
0
def make_references(publications, output_dir):
    """
    Create reference bib file

    A publication whose link contains ``doi.org`` contributes the DOI taken
    from the link's path. Any other publication is searched on Crossref by
    title and is skipped (with a warning) unless the best hit's title equals
    the publication title ignoring case.

    Args:
        publications: the list of publications
        output_dir: the output directory

    Returns:
        A list of reference identifiers
    """
    log = Logger()
    cr = Crossref()
    lines = []
    references = []

    for i, publication in enumerate(publications):
        log.notice(f'Querying and formatting {i + 1} out of {len(publications)} publications')
        link = publication[LINK]
        title = publication[TITLE]

        # Check if it is a DOI url
        if link and 'doi.org' in link:
            doi = urlparse(link).path.strip('/')

        # Extract the DOI using the title
        else:
            # Fix: ``query_title`` maps to Crossref's deprecated ``query.title``
            # field query; ``query_bibliographic`` is its replacement.
            results = cr.works(query_bibliographic=title, limit=1)
            if results['message']['total-results'] == 0 or \
                    results['message']['items'][0]['title'][0].lower() != title.lower():
                log.warn(f'Could not find the doi for "{title}"')

                continue

            doi = results['message']['items'][0]['DOI']

        try:
            reference = cn.content_negotiation(doi)
            lines.append(reference)
            # The identifier is what follows "@type{" on the entry's first line
            references.append(re.sub('^@.*{', '', reference.split('\n')[0]).strip(','))
        except HTTPError:
            log.warn(f'Could not Create reference for "{title}"')

    with open(os.path.join(output_dir, 'references.bib'), 'w') as f:
        f.write('\n\n'.join(lines))

    return references
def my_view(request):
    """List the references of the work whose DOI is given in the request.

    When ``request.GET`` contains ``'DOI'``, the work is fetched from
    Crossref and each of its references that carries a DOI is looked up in
    turn. Every such reference yields a ``[url, title, year]`` entry, where
    ``url`` is ``SCIHUB_URL/<reference DOI>``, ``title`` is the joined title
    followed by a line of author family names, and ``year`` is the first
    ``issued`` date part. If the per-reference lookup fails, ``title`` falls
    back to the reference's ``unstructured`` text when present.

    Args:
        request: Object with a mapping-like ``GET`` attribute.

    Returns:
        ``{'refs': references}``; the list is empty without a DOI parameter
        or when the work has no references.
    """
    references = []
    if 'DOI' in request.GET:
        doi = request.GET['DOI']
        cr = Crossref()
        logging.info(request.GET)
        x = cr.works(doi)
        if 'message' in x and 'reference' in x['message']:
            if 'title' in x['message']:
                logging.info(u"Evaluating references for {}".format(
                    x['message']['title']))
            for r in x['message']['reference']:
                if 'DOI' not in r:
                    continue

                title = ""
                year = ""

                try:
                    y = cr.works(u'{}'.format(r['DOI']))
                    logging.info(r['DOI'])

                    if 'message' in y and 'title' in y['message']:
                        title = u"{}".format(u''.join(y['message']['title']))
                    if 'message' in y and 'author' in y['message']:
                        title += u"\n" + u", ".join(
                            [a['family'] for a in y['message']['author']])
                    if 'message' in y and 'issued' in y['message']:
                        year = y['message']['issued']['date-parts'][0][0]
                except Exception:
                    # Best-effort per reference: fall back to the raw citation
                    # text. Narrowed from a bare ``except`` so that
                    # KeyboardInterrupt/SystemExit are not swallowed.
                    if 'unstructured' in r:
                        title = u"{}".format(r['unstructured'])

                logging.info(title)
                references.append([
                    u"{}/{}".format(SCIHUB_URL, r['DOI']),
                    title,
                    year
                ])
    logging.info(references)
    return {'refs': references}