Ejemplo n.º 1
0
 def __init__(self, context):
     """Set up the DOI-metadata resource and its Crossref client."""
     super().__init__(context, DataResource.data_dir / 'doi_metadata.db')
     # Per-run bookkeeping for processed DOIs and failures.
     self._handled = set()  # every doi checked during this run
     self._added = set()  # every doi newly added during this run
     self._errors = {}
     # Identify this project politely to the Crossref REST API.
     contact = Etiquette(
         'SYNTH transform', '0.1',
         'https://github.com/NaturalHistoryMuseum/synth_transform',
         '*****@*****.**')
     self.works = Works(etiquette=contact)
Ejemplo n.º 2
0
def write_bibtex_v1(bibtex_file, dois):
    """Fetch BibTeX records for *dois* from Crossref and write them to a file.

    Parameters
    ----------
    bibtex_file : str
        Path of the BibTeX file to create (overwritten if it exists).
    dois : iterable of str
        DOIs to resolve via the Crossref content-negotiation endpoint.
    """
    # Fix: version must be a string ('0.1', not the float 0.1), matching the
    # Etiquette signature used everywhere else in this codebase.
    my_etiquette = Etiquette('VTLibraries', '0.1', 'https://lib.vt.edu/', '*****@*****.**')
    work = Works(etiquette=my_etiquette)
    # BibTeX entries may contain non-ASCII author names; pin the encoding
    # instead of relying on the platform default.
    with open(bibtex_file, 'w', encoding='utf-8') as bib:
        for doi in dois:
            url = "http://api.crossref.org/works/" + doi + "/transform/application/x-bibtex"
            jsontxt = work.do_http_request('get', url, custom_header=str(work.etiquette)).text
            # Crossref answers "Resource not found." for unknown DOIs.
            if not jsontxt.startswith('Resource'):
                # write(), not writelines(): the latter iterated the string
                # character by character.
                bib.write(jsontxt)
                bib.write('\n')
Ejemplo n.º 3
0
 def __init__(self, context):
     """Initialise the output-DOIs resource backed by a local SQLite db."""
     super().__init__(context, DataResource.data_dir / 'output_dois.db')
     # Etiquette identifies this project to the Crossref REST API.
     etiquette = Etiquette(
         'SYNTH transform', '0.1',
         'https://github.com/NaturalHistoryMuseum/synth_transform',
         '*****@*****.**')
     self.works = Works(etiquette=etiquette)
     # Per-run bookkeeping, reset on each instantiation.
     self._handled = set()  # dois checked during this run
     self._added = set()  # dois added during this run
     self._errors = {}  # failures recorded during this run
     self._methods = {}  # NOTE(review): populated elsewhere — purpose not visible here
Ejemplo n.º 4
0
def main():
    """Entry point: verify DB tables, then scrape Crossref newest-first."""
    sql = connect()
    cursor = sql.cursor()

    # Fix: Python-2 print statements and `== False` replaced with the
    # Python-3 forms used by the rest of the codebase.
    if not checkTables(cursor):
        print("fatal error: tables not verified")
        return
    print("tables verified")

    # Start scraping and populating data
    # 1) Scraping Entry point: random doi using "sample" - done
    # 2) Back-propagation occurs through citations found in the paper - might have bugs with duplicates
    # 3) When no more unique papers are found, return to step 1 - done
    project = Etiquette('ResearchSub', 'Pre-alpha', 'localhost', '*****@*****.**')
    works = Works(etiquette=project)
    for item in works.sort('published').order('desc'):
        recursiveReferenceAdd(sql, cursor, item)

    # Commit any changes after all transactions completed
    sql.commit()
Ejemplo n.º 5
0
def check_status_by_doi_batch_id(doi_batch_id,
                                 prefix,
                                 username,
                                 password,
                                 use_test_server=False,
                                 data_type="result"):
    """Query Crossref for the state of a previously submitted batch.

    Parameters
    ----------
    doi_batch_id : str or int
        Identifier of the registration batch to look up.
    prefix : str
        DOI prefix owned by your organization.
    username : str
        Crossref account name.
    password : str
        Crossref account password.
    use_test_server : bool
        When True, talk to the Crossref test server rather than the
        production one. Defaults to False.
    data_type : str
        "result" returns the submission status, "content" returns the XML
        that was originally submitted. Defaults to "result".

    Returns
    -------
    requests.Response
    """
    depositor = Depositor(prefix, username, password,
                          Etiquette(__name__, __version__, __author__, __email__),
                          use_test_server)
    return depositor.request_doi_status_by_batch_id(doi_batch_id, data_type)
Ejemplo n.º 6
0
def submit_xml(doi_batch_id,
               xml,
               prefix,
               username,
               password,
               use_test_server=False):
    """Send deposit XML to Crossref to register DOIs under a batch ID.

    Parameters
    ----------
    doi_batch_id : str or int
        Identifier for this registration batch.
    xml : str
        The Crossref deposit XML to upload.
    prefix : str
        DOI prefix owned by your organization.
    username : str
        Crossref account name.
    password : str
        Crossref account password.
    use_test_server : bool
        When True, talk to the Crossref test server instead of registering
        DOIs for real. Defaults to False.

    Returns
    -------
    requests.Response
    """
    depositor = Depositor(prefix, username, password,
                          Etiquette(__name__, __version__, __author__, __email__),
                          use_test_server)
    return depositor.register_doi(doi_batch_id, xml)
Ejemplo n.º 7
0
def is_crossref_work(doi):
    """Return True when Crossref knows a work registered under *doi*."""
    etiquette = Etiquette(constants.PROJECT_NAME, constants.VERSION,
                          constants.URL, constants.EMAIL)
    works = Works(etiquette=etiquette)
    return works.doi_exists(doi)
Ejemplo n.º 8
0
import subprocess as sp
import six.moves.urllib.request
import re
import shutil
import tempfile

from crossref.restful import Works, Etiquette
import bibtexparser

import papers
from papers.config import cached
from papers import logger
from papers.encoding import family_names, latex_to_unicode

# Module-wide Crossref etiquette: identifies this client (name, version,
# project URL, contact email) to the REST API on every request.
my_etiquette = Etiquette('papers', papers.__version__,
                         'https://github.com/perrette/papers',
                         '*****@*****.**')


class DOIParsingError(ValueError):
    """Raised when a DOI cannot be parsed out of the input text."""
    pass


class DOIRequestError(ValueError):
    """Raised when a DOI lookup request fails."""
    pass


# PDF parsing / crossref requests
# ===============================

Ejemplo n.º 9
0
from crossref.restful import Works, Etiquette
import logging
from datetime import datetime
import time
import pandas as pd
import sys
import os
import traceback

# Log the harvesting run's progress and errors to a local file.
logging.basicConfig(filename='get_papers.log', level=logging.INFO)

# First CLI argument is the first publication year to harvest (inclusive);
# the upper bound is hard-coded to 2018 (exclusive, per range semantics).
start_year = sys.argv[1]
years_range = range(int(start_year), 2018)

# Identify this client politely to the Crossref REST API.
e = Etiquette('Authorship research project', 'V1.0', 'https://github.com/ilyaaltshteyn', '*****@*****.**')

w = Works(etiquette=e)
# Crossref metadata fields retained for each harvested work.
keys = [
    'DOI', 'ISSN', 'URL', 'author', 'container-title',
    'is-referenced-by-count', 'issn-type', 'issued', 'license',
    'link', 'published-online', 'published-print', 'publisher',
    'references-count', 'source', 'subject', 'title', 'type',
    ]

def run_q(year, data, starttime, counter, batch_size):
    """ year -- year to run query for. """

    backoff = 0

    q = w.query().filter(
        from_print_pub_date=str(year),
Ejemplo n.º 10
0
import urllib
import requests
from crossref.restful import Works, Etiquette
# Identify this client (bachelor-thesis project) to the Crossref REST API.
my_etiquette = Etiquette('Wikipedia quality bachelor thesis', '1.0', 'null',
                         '*****@*****.**')
# Single shared client.  The original's bare `Works()` (immediately
# overwritten) and discarded `str(my_etiquette)` were dead code — removed.
works = Works(etiquette=my_etiquette)


def fetch_issns():
    """Read 'doien_fv.tsv' and return one {'issn': ...} record per data row.

    The first line of the file is assumed to be a header and is skipped; the
    ISSN is taken from the first tab-separated column of each row.
    """
    with open('doien_fv.tsv') as tsv:
        data_rows = tsv.readlines()[1:]  # skip the header row
        return [{'issn': row.split('\t')[0].strip()} for row in data_rows]


def retrieve_data(doi_encoded, article):
    """Pair the article's ISSN with the first ISSN in the Crossref record.

    Returns a dict with the original 'issn' and 'enc' set to the first entry
    of the record's ISSN list, or the string 'null' when the key is absent.
    """
    if 'ISSN' in doi_encoded:
        encoded_issn = doi_encoded['ISSN'][0]
    else:
        encoded_issn = 'null'
    return {'issn': article['issn'], 'enc': encoded_issn}


def fetch_results():
Ejemplo n.º 11
0
VERSION = '0.1.0'
CONFIG = {}
ETIQUETTE = None

# Optional configuration: when PyYAML and crossrefapi are installed and
# config.yml exists, load the config and build the Crossref etiquette;
# otherwise silently keep the defaults above.
try:
    import yaml
    from crossref.restful import Etiquette
    with open('config.yml', 'r') as fp:
        CONFIG = yaml.safe_load(fp)
        ETIQUETTE = Etiquette('Monograph', VERSION,
                              CONFIG['monograph']['base_url'],
                              CONFIG['monograph']['contact_email'])
except (FileNotFoundError, ModuleNotFoundError):
    pass
Ejemplo n.º 12
0
def getDoiWithCrossRef(entry, my_etiquette):
    """ Get the doi of a bibtex entry thanks to crossref.

    Queries the Crossref REST API with the entry's first author and title,
    restricted to journal articles from the entry's year, and accepts the
    first result whose title similarity exceeds TOL_MATCH.

    Parameters
    ----------
    entry : BibDatabase
        The bibtex record with  missing doi.
    my_etiquette : tuple
        A record that contains all required fields to create an Etiquette
        object, or a falsy value to query anonymously.

    Returns
    -------
    doi : string
        the doi code, or None when no sufficiently similar match is found.

    """
    # tries counter for each entry
    count = 0
    # store if a match has been found
    match = False
    # if provided, create the Etiquette object (identifies us to Crossref)
    if my_etiquette:
        etiquette = Etiquette(*my_etiquette)
        print(etiquette)
    else:
        etiquette = None

    # create crossref api instance for request
    works = Works(etiquette=etiquette)
    # convert entry to unicode for searching
    entry_unicode = bp.customization.convert_to_unicode(entry.copy())

    # Check for mandatory field
    try:
        # extract basic fields
        author1 = entry_unicode['author'].split(',')[0].strip()
        title = entry_unicode['title'].strip()
        year = entry_unicode['year'].strip()
    except Exception:
        # NOTE(review): broad except looks deliberate (best-effort) — any
        # missing or malformed field means the query cannot be built.
        warnings.warn("author, title and year fields are missing in entry {}\
                      ".format(entry_unicode))
        doi = None
        return doi

    # best-scored journal articles by this author/title from that exact year
    w1 = works.query(author=author1,
                     bibliographic=title).filter(
                         until_pub_date=year,
                         from_pub_date=year,
                         type='journal-article').sort('score').order('desc')
    # parse the crossref record to find the "best" match
    for item in w1:
        count += 1
        # fuzzy compare of the two titles
        ratio = SM(None, title, item['title'][0]).ratio()
        if ratio > TOL_MATCH:
            match = True
            break
        # limit the number of queries sent to the API
        if count > COUNT:
            print('  Reach maximal number of tries ({}) \
for this record  {}'.format(COUNT, entry_unicode))
            break

    if match:
        # `item` is the candidate that triggered the break above
        doi = item['DOI']
    else:
        print("  MISSING : {}, {}".format(entry_unicode['author'],
                                          entry_unicode['title']))
        doi = None

    return doi
 def set_etiquette(self, projectName, projectVersion, projectURL,
                   emailAddress):
     """Replace the stored Crossref etiquette with the given project details
     and rebuild the retriever so subsequent requests use it."""
     new_etiquette = Etiquette(projectName, projectVersion, projectURL,
                               emailAddress)
     self.etiquette = new_etiquette
     self.set_retreiver()
 def __init__(self):
     """Create the default etiquette and a Works retriever bound to it."""
     # NOTE: 'retreiver' spelling kept — other code references this attribute.
     default_etiquette = Etiquette('Voth Group Readings', '0.0alpha',
                                   'My Project URL', '*****@*****.**')
     self.etiquette = default_etiquette
     self.retreiver = Works(etiquette=self.etiquette)
Ejemplo n.º 15
0
def get_doi(entry, config):
    """Resolve and validate the DOI of a BibTeX entry via Crossref.

    Three cases are handled:
    1. no DOI field but a DOI-bearing URL: extract the DOI and keep it when
       Crossref's record is similar enough to the entry;
    2. still no DOI: query Crossref by first author + title and take the
       first similar result (at most 10 candidates inspected);
    3. an existing (or just-extracted) DOI: verify it against Crossref and
       drop it (plus any DOI-based URL) when the record does not match.

    Parameters
    ----------
    entry : dict
        BibTeX entry as handled by bib_parser; may be modified in place.
    config : object
        Supplies the maximum Levenshtein distance and the update-URL flag.

    Returns
    -------
    tuple
        (entry, has_doi) — the possibly updated entry and whether it now
        carries a validated DOI.
    """
    has_doi = bib_parser.has_doi(entry)
    my_etiquette = Etiquette(constants.PROJECT_NAME, constants.VERSION,
                             constants.URL, constants.EMAIL)
    max_levenshtein_distance = config.get_max_levenshtein_distance()
    update_URL = config.get_update_URL()

    works = Works(etiquette=my_etiquette)

    # Case 1: no DOI field, but the entry's URL embeds a DOI.
    if not has_doi and bib_parser.has_url(entry):
        entry_url = bib_parser.get_url(entry)
        if "doi" in entry_url:
            doi = cleaner.clean_doi(entry_url)

            if is_crossref_work(doi):
                crossref_info = works.doi(doi)
                if crossref_is_similar(crossref_info, entry,
                                       max_levenshtein_distance):
                    entry = set_doi(entry, doi, update_URL)
                    has_doi = True

    if not has_doi:
        # Case 2: we try to find the doi for the title
        entry_title = bib_parser.get_title(entry)
        entry_title = cleaner.clean_braces(entry_title)
        author = bib_parser.get_author(entry)
        first_author = splitname(author[0], strict_mode=False)
        first_author_last_name = first_author["last"][0]

        query_parameters = {
            "author": first_author_last_name,
            "bibliographic": entry_title
        }

        # Best-scored candidates first; fetch only the fields we need.
        works_query = works.query(**query_parameters)
        works_query = works_query.sort("score").order("desc").select(
            ["title", "DOI"])
        i_i_item = 0
        max_items = min(works_query.count(), 10)
        works_results = iter(works_query)
        while i_i_item < max_items and not has_doi:
            i_item = next(works_results)
            if crossref_is_similar(i_item, entry, max_levenshtein_distance):
                doi = cr_parser.get_doi(i_item)
                entry = set_doi(entry, doi, update_URL)
                has_doi = True
            i_i_item += 1
    else:
        # Case 3: we check to see if the existing doi is correct
        doi = bib_parser.get_doi(entry)
        doi = cleaner.clean_doi(doi)
        if is_crossref_work(doi):
            crossref_info = works.doi(doi)

            if crossref_is_similar(crossref_info, entry,
                                   max_levenshtein_distance):
                entry = set_doi(entry, doi, update_URL)
            else:
                # Mismatch: discard the wrong DOI and any DOI-based URL.
                entry.pop("doi", None)
                if "doi" in bib_parser.get_url(entry):
                    entry.pop("url", None)
                has_doi = False

        else:
            # DOI unknown to Crossref; keep the cleaned value as-is.
            entry = set_doi(entry, doi, update_URL)

    return entry, has_doi
Ejemplo n.º 16
0
import feedparser

import time
from selenium import webdriver
import random

import re

import requests
import json
from bs4 import BeautifulSoup
import pandas as pd

# my_etiquette = Etiquette('My Project Name', 'My Project version', 'My Project URL', 'My contact email')
# NOTE(review): Etiquette/Works are not imported anywhere in this chunk —
# presumably `from crossref.restful import Works, Etiquette` appears
# elsewhere in the file; verify before running stand-alone.
# Placeholder etiquette until the project has a public name/URL.
xEtiquette = Etiquette(application_name='not yet set',
                       application_version='not yet set',
                       application_url='not yet public',
                       contact_email='*****@*****.**')

x_works = Works(etiquette=xEtiquette)


def crossref_get_records_bydoi(xDOIStr):
    '''
    get data on a given article with a doi
    # LATER : TO DO
    # ADD AFFILIATION INFO, WHERE WE HAVE IT
    '''

    xDOI_Data = x_works.doi(xDOIStr)

    # ---- Results:
Ejemplo n.º 17
0
You may use, distribute and modify this code under the
terms of the GPLv3 license.
'''

import logging
from crossref.restful import Works, Etiquette
#from habanero import Crossref, cn
# Support running both as a package module and as a stand-alone script.
try:
    from . import sqlitedb
except ImportError:  # fix: bare except narrowed — only import failure is expected
    import sqlitedb
#from ..gui import __version__

__version__ = 'v0.1alpha'

# Identify this client (MeiTing-Trunk) to the Crossref REST API.
ETIQUETTE = Etiquette('MeiTing-Trunk', __version__, 'github',
                      '*****@*****.**')

LOGGER = logging.getLogger(__name__)


def fetchMetaByDOI(doi):

    works = Works(etiquette=ETIQUETTE)
    try:
        data = works.doi(doi)
    except:
        rec = 1

    if data is None:
        rec = 1
    else:
Ejemplo n.º 18
0
class Prompt(JournalAdapter):
    """Journal adapter for *Prompt*; article metadata comes from Crossref."""

    name = "Prompt"
    issn = '2578-9430'
    doi_prefix = '10.31719'
    uses_jats = False
    metadata_source = 'Crossref'
    base_urls = ['https://thepromptjournal.com', 'http://thepromptjournal.com']
    archive_basename = 'prompt'
    # Shared etiquette identifying PubArchiver to the Crossref REST API.
    etiquette = Etiquette('PubArchiver', pubarchiver.__version__,
                          'https://github.com/caltechlibrary/pubarchiver',
                          '*****@*****.**')

    def all_articles(self):
        """Return Article tuples for every work under this journal's prefix.

        Works lacking a usable 'published-online' date are skipped.  Raises
        ServerError when the Crossref API cannot be reached.
        """
        articles = []
        try:
            works = Works(etiquette=Prompt.etiquette)
            if __debug__:
                log(f'asking Crossref for all works by {self.doi_prefix}')
            for item in works.filter(prefix=self.doi_prefix):
                doi = item.get('DOI', '')
                title = item.get('title', [''])[0]
                online = item.get('published-online', None)
                if not online or 'date-parts' not in online:
                    if __debug__:
                        log(f'skipping {doi} lacking published-online')
                    continue
                else:
                    # Zero-pad each component: [2021, 3, 5] -> '2021-03-05'.
                    date = '-'.join(
                        format(x, '02') for x in online['date-parts'][0])
                    if __debug__:
                        log(f'keeping publication {doi} dated {date}')
                pdf = pdf_link(item.get('link', []))
                jats = ''
                image = ''
                basename = tail_of_doi(doi)
                # An article is complete only when all key pieces are present.
                status = 'complete' if all([pdf, doi, title, date
                                            ]) else 'incomplete'
                articles.append(
                    Article(self.issn, doi, date, title, basename, pdf, jats,
                            image, status))
        except Exception as ex:
            if __debug__: log(f'crossref API exception: {str(ex)}')
            raise ServerError(f'Failed to get data from Crossref: {str(ex)}')
        return articles

    def articles_from(self, doi_file):
        '''Returns a list of `Article` tuples from a file of DOIs.'''
        if __debug__: log(f'reading {doi_file}')
        requested_dois = []
        with open(doi_file, 'r') as file:
            requested_dois = [line.strip() for line in file]

        num = len(requested_dois)
        # I'd use pluralized() here, but it matches case when it adds a 's',
        # and is confused by DOI which is an acronym.  Must add 's' ourselves.
        inform(f'Found {num} DOI{"s" if num > 1 else ""} in {doi_file}.')
        if not requested_dois:
            if __debug__: log(f'could not read any lines from {doi_file}')
            return []

        all_articles = self.all_articles()
        all_dois = [article.doi for article in all_articles]
        skipped = 0
        for doi in requested_dois:
            if doi not in all_dois:
                warn(
                    f'Skipping "{doi}" because it is unknown for this journal.'
                )
                skipped += 1
        if skipped:
            kept = num - skipped
            inform(
                f'Using {kept} DOI{"s" if kept > 1 else ""} from {doi_file}.')
        return [
            article for article in all_articles
            if article.doi in requested_dois
        ]

    def article_metadata(self, article):
        """Return a dict (ready for XML serialization) describing *article*.

        Raises ServerError when Crossref metadata cannot be fetched,
        mirroring the error behavior of all_articles().
        """
        try:
            works = Works(etiquette=Prompt.etiquette)
            if __debug__: log(f'asking Crossref for data about {article.doi}')
            data = works.doi(article.doi)
            year = article.date.split('-')[0]
            file = tail_of_doi(article.doi) + '.pdf'
            field = lambda key: data.get(key, '')
            # NOTE(review): `> 1` means a single-element license list falls
            # back to the CC BY-NC URL — possibly `>= 1` was intended; confirm.
            if isinstance(field('license'),
                          list) and len(field('license')) > 1:
                rights_link = field('license')[0]['URL']
            else:
                rights_link = 'https://creativecommons.org/licenses/by-nc/4.0/'
            xmldict = {
                'resource': {
                    '@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
                    'identifier': {
                        '@identifierType': 'DOI',
                        '#text': article.doi
                    },
                    'journal': {
                        '#text': self.name,
                    },
                    'volume': {
                        '#text': field('volume')
                    },
                    'issue': {
                        '#text': field('issue')
                    },
                    'publisher': {
                        '#text': field('publisher')
                    },
                    'publicationYear': {
                        '#text': year
                    },
                    'e-issn': {
                        '#text': self.issn
                    },
                    'file': {
                        '#text': file
                    },
                    'dates': {
                        'date': {
                            '#text': article.date
                        },
                    },
                    'titles': {
                        'title': {
                            '#text': article.title
                        }
                    },
                    'creators': {
                        'creator': creator_list(field('author')),
                    },
                    'descriptions': {
                        'description': {
                            '@descriptionType': 'Abstract',
                            '#text': strip_tags(field('abstract'))
                        }
                    },
                    'rightsList': {
                        'rights': {
                            '#text': copyright_text(field('author'), year),
                        },
                        'rightsURI': {
                            '#text': rights_link
                        }
                    },
                }
            }
            return xmldict
        except Exception as ex:
            if __debug__: log(f'crossref API exception: {str(ex)}')
            # Fix: the original dropped into pdb.set_trace() here (debugging
            # leftovers) and then fell off the end returning None.  Fail
            # loudly the same way all_articles() does instead.
            raise ServerError(f'Failed to get data from Crossref: {str(ex)}')
Ejemplo n.º 19
0
import six.moves.urllib.request
import re
import shutil
import tempfile
import uuid

from crossref.restful import Works, Etiquette
import bibtexparser

import papers
from papers.config import cached
from papers import logger
from papers.encoding import family_names, latex_to_unicode

# Module-wide Crossref etiquette: identifies this client (name, version,
# project URL, contact email) to the REST API on every request.
my_etiquette = Etiquette('papers', papers.__version__,
                         'https://github.com/rchg/papers',
                         '*****@*****.**')


class DOIParsingError(ValueError):
    """Raised when a DOI cannot be parsed out of the input text."""
    pass


class DOIRequestError(ValueError):
    """Raised when a DOI lookup request fails."""
    pass


# PDF parsing / crossref requests
# ===============================

Ejemplo n.º 20
0
from crossref.restful import Members, Etiquette, Works
import bibclean.config.constants as constants
import bibclean.crossref_tools.parser as parser

# Shared Crossref clients for this module; both send the project etiquette
# (name, version, URL, contact email) with every API request.
my_etiquette = Etiquette(constants.PROJECT_NAME, constants.VERSION,
                         constants.URL, constants.EMAIL)
works = Works(etiquette=my_etiquette)
members = Members(etiquette=my_etiquette)


def get_editors_from_ISBN(isbn, work_type):
    """Look up a work on Crossref by ISBN and return its editors, or None.

    Only 'book-chapter' and 'proceedings-article' are supported work types
    (mapped to their container types); anything else raises KeyError.
    """
    if work_type == "book-chapter":
        container_type = "book"
    elif work_type == "proceedings-article":
        container_type = "proceedings"
    else:
        raise KeyError("Unknown type!")

    # Best-scored candidates first.
    candidates = works.query(isbn).filter(type=container_type)
    candidates = candidates.sort("score").order("desc")

    for candidate in candidates:
        if not (parser.has_ISBN(candidate) and parser.has_editor(candidate)):
            continue
        if isbn == parser.get_ISBN(candidate):
            return parser.get_editor(candidate)

    return None


def get_member_ID_from_publisher(publisher_name):
    members_query = members.query(publisher_name).select("id")