def __init__(self, context):
    super().__init__(context, DataResource.data_dir / 'doi_metadata.db')
    etiquette = Etiquette('SYNTH transform', '0.1',
                          'https://github.com/NaturalHistoryMuseum/synth_transform',
                          '*****@*****.**')
    self.works = Works(etiquette=etiquette)
    self._handled = set()  # all the DOIs that are checked in this run
    self._added = set()    # all the DOIs that are added in this run
    self._errors = {}
def write_bibtex_v1(bibtex_file, dois):
    my_etiquette = Etiquette('VTLibraries', '0.1', 'https://lib.vt.edu/', '*****@*****.**')
    work = Works(etiquette=my_etiquette)
    with open(bibtex_file, 'w') as bib:
        for doi in dois:
            # ask Crossref's content-negotiation endpoint for a BibTeX rendering of the DOI
            url = "http://api.crossref.org/works/" + doi + "/transform/application/x-bibtex"
            bibtex = work.do_http_request('get', url, custom_header=str(work.etiquette)).text
            # unresolvable DOIs come back as a "Resource not found" message; skip those
            if not bibtex.startswith('Resource'):
                bib.write(bibtex)
                bib.write('\n')
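# A minimal usage sketch for write_bibtex_v1 defined above; the output filename
# and DOIs are placeholders, not values from the original project.
if __name__ == '__main__':
    example_dois = ['10.0000/example.doi.1', '10.0000/example.doi.2']
    write_bibtex_v1('references.bib', example_dois)  # one BibTeX record per resolvable DOI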
def __init__(self, context):
    super().__init__(context, DataResource.data_dir / 'output_dois.db')
    etiquette = Etiquette('SYNTH transform', '0.1',
                          'https://github.com/NaturalHistoryMuseum/synth_transform',
                          '*****@*****.**')
    self.works = Works(etiquette=etiquette)
    self._handled = set()
    self._added = set()
    self._errors = {}
    self._methods = {}
def main():
    sql = connect()
    cursor = sql.cursor()
    if not checkTables(cursor):
        print("fatal error: tables not verified")
        return
    else:
        print("tables verified")
    # Start scraping and populating data
    # 1) Scraping entry point: random DOI using "sample" - done
    # 2) Back-propagation occurs through citations found in the paper - might have bugs with duplicates
    # 3) When no more unique papers are found, return to step 1 - done
    project = Etiquette('ResearchSub', 'Pre-alpha', 'localhost', '*****@*****.**')
    works = Works(etiquette=project)
    for item in works.sort('published').order('desc'):
        recursiveReferenceAdd(sql, cursor, item)
    # Commit any changes after all transactions have completed
    sql.commit()
def check_status_by_doi_batch_id(doi_batch_id, prefix, username, password,
                                 use_test_server=False, data_type="result"):
    """Get the status of a submission by DOI batch ID.

    Parameters
    ----------
    doi_batch_id : str or int
        Batch ID of the registration submission you wish to check on.
    prefix : str
        Your organization's DOI prefix.
    username : str
        Crossref username.
    password : str
        Crossref password.
    use_test_server : bool
        If True, submit to the test server instead of actually attempting to
        register DOIs. Defaults to False.
    data_type : str
        The data type you want in the response. "result" returns the status of
        your submission; "content" returns the XML submitted. Defaults to "result".

    Returns
    -------
    requests.Response
    """
    etiquette = Etiquette(__name__, __version__, __author__, __email__)
    depositor = Depositor(prefix, username, password, etiquette, use_test_server)
    response = depositor.request_doi_status_by_batch_id(doi_batch_id, data_type)
    return response
def submit_xml(doi_batch_id, xml, prefix, username, password, use_test_server=False):
    """Register DOIs with Crossref.

    Submit XML to the Crossref API to register DOIs.

    Parameters
    ----------
    doi_batch_id : str or int
        Batch ID for the registration submission.
    xml : str
        XML to submit to the Crossref API.
    prefix : str
        Your organization's DOI prefix.
    username : str
        Crossref username.
    password : str
        Crossref password.
    use_test_server : bool
        If True, submit to the test server instead of actually attempting to
        register DOIs. Defaults to False.

    Returns
    -------
    requests.Response
    """
    etiquette = Etiquette(__name__, __version__, __author__, __email__)
    depositor = Depositor(prefix, username, password, etiquette, use_test_server)
    response = depositor.register_doi(doi_batch_id, xml)
    return response
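# A hedged sketch of how the two helpers above might be chained: register DOIs
# with submit_xml, then poll the batch status. Every value here (XML path,
# batch id, prefix, credentials) is a placeholder, not real configuration.
def example_registration_flow():
    with open('deposit.xml') as fp:  # hypothetical Crossref deposit XML
        xml = fp.read()
    submit_xml('batch-0001', xml, '10.0000', 'crossref-user', 'crossref-password',
               use_test_server=True)  # stay on the test server in a sketch
    status = check_status_by_doi_batch_id('batch-0001', '10.0000',
                                          'crossref-user', 'crossref-password',
                                          use_test_server=True)
    return status  # a requests.Response, per the docstrings above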
def is_crossref_work(doi):
    my_etiquette = Etiquette(constants.PROJECT_NAME, constants.VERSION,
                             constants.URL, constants.EMAIL)
    return Works(etiquette=my_etiquette).doi_exists(doi)
import subprocess as sp
import six.moves.urllib.request
import re
import shutil
import tempfile

from crossref.restful import Works, Etiquette
import bibtexparser

import papers
from papers.config import cached
from papers import logger
from papers.encoding import family_names, latex_to_unicode

my_etiquette = Etiquette('papers', papers.__version__,
                         'https://github.com/perrette/papers', '*****@*****.**')


class DOIParsingError(ValueError):
    pass


class DOIRequestError(ValueError):
    pass


# PDF parsing / crossref requests
# ===============================
from crossref.restful import Works, Etiquette
import logging
from datetime import datetime
import time
import pandas as pd
import sys
import os
import traceback

logging.basicConfig(filename='get_papers.log', level=logging.INFO)

start_year = sys.argv[1]
years_range = range(int(start_year), 2018)

e = Etiquette('Authorship research project', 'V1.0',
              'https://github.com/ilyaaltshteyn', '*****@*****.**')
w = Works(etiquette=e)

keys = [
    'DOI', 'ISSN', 'URL', 'author', 'container-title', 'is-referenced-by-count',
    'issn-type', 'issued', 'license', 'link', 'published-online', 'published-print',
    'publisher', 'references-count', 'source', 'subject', 'title', 'type',
]


def run_q(year, data, starttime, counter, batch_size):
    """
    year -- year to run query for.
    """
    backoff = 0
    q = w.query().filter(
        from_print_pub_date=str(year),
import urllib
import requests
from crossref.restful import Works, Etiquette

my_etiquette = Etiquette('Wikipedia quality bachelor thesis', '1.0', 'null', '*****@*****.**')
works = Works(etiquette=my_etiquette)


def fetch_issns():
    with open('doien_fv.tsv') as f:
        lines = f.readlines()[1:]  # skip line 1 (table headers)
    articles = []
    for line in lines:
        articles.append({
            'issn': line.split('\t')[0].strip(),
        })
    return articles


def retrieve_data(doi_encoded, article):
    return {
        'issn': article['issn'],
        'enc': doi_encoded['ISSN'][0] if 'ISSN' in doi_encoded else 'null',
    }


def fetch_results():
VERSION = '0.1.0'
CONFIG = {}
ETIQUETTE = None

try:
    import yaml
    from crossref.restful import Etiquette

    with open('config.yml', 'r') as fp:
        CONFIG = yaml.safe_load(fp)
    ETIQUETTE = Etiquette('Monograph', VERSION,
                          CONFIG['monograph']['base_url'],
                          CONFIG['monograph']['contact_email'])
except FileNotFoundError:
    pass
except ModuleNotFoundError:
    pass
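# A minimal sketch (not part of the original module) of how ETIQUETTE might be
# handed to the crossrefapi client afterwards; Works accepts etiquette=None, so
# the call still works when config.yml or PyYAML is missing, just without the
# polite-pool identification.
from crossref.restful import Works

def get_works_client():
    return Works(etiquette=ETIQUETTE)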
def getDoiWithCrossRef(entry, my_etiquette):
    """
    Get the doi of a bibtex entry thanks to crossref.

    Parameters
    ----------
    entry : BibDatabase
        The bibtex record with missing doi.
    my_etiquette : tuple
        A record that contains all required fields to create an Etiquette object.

    Returns
    -------
    doi : string
        the doi code.
    """
    # tries counter for each entry
    count = 0
    # store if a match has been found
    match = False
    # if provided, create the Etiquette object
    if my_etiquette:
        etiquette = Etiquette(*my_etiquette)
        print(etiquette)
    else:
        etiquette = None
    # create crossref api instance for requests
    works = Works(etiquette=etiquette)
    # convert entry to unicode for searching
    entry_unicode = bp.customization.convert_to_unicode(entry.copy())
    # check for mandatory fields
    try:
        # extract basic fields
        author1 = entry_unicode['author'].split(',')[0].strip()
        title = entry_unicode['title'].strip()
        year = entry_unicode['year'].strip()
    except Exception:
        warnings.warn("author, title and year fields are missing in entry {}".format(entry_unicode))
        doi = None
        return doi

    w1 = works.query(author=author1, bibliographic=title).filter(
        until_pub_date=year, from_pub_date=year,
        type='journal-article').sort('score').order('desc')
    # parse the crossref records to find the "best" match
    for item in w1:
        count += 1
        # fuzzy compare
        ratio = SM(None, title, item['title'][0]).ratio()
        if ratio > TOL_MATCH:
            match = True
            break
        # limit the number of queries
        if count > COUNT:
            print('  Reached maximal number of tries ({}) for this record {}'.format(COUNT, entry_unicode))
            break
    if match:
        doi = item['DOI']
    else:
        print("  MISSING : {}, {}".format(entry_unicode['author'], entry_unicode['title']))
        doi = None
    return doi
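# A hedged calling example for getDoiWithCrossRef above: per its docstring,
# my_etiquette is a plain tuple that is unpacked into Etiquette(name, version,
# url, email). The entry and etiquette values below are illustrative
# placeholders, and the call assumes the module-level names (bp, SM, TOL_MATCH,
# COUNT, warnings) defined alongside the original function.
example_etiquette = ('MyBibTool', '0.1', 'https://example.org/mybibtool', 'me@example.org')
example_entry = {
    'author': 'Doe, Jane',
    'title': 'An Example Title About Nothing In Particular',
    'year': '2015',
}
doi = getDoiWithCrossRef(example_entry, example_etiquette)  # None when no confident match is found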
def set_etiquette(self, projectName, projectVersion, projectURL, emailAddress):
    self.etiquette = Etiquette(projectName, projectVersion, projectURL, emailAddress)
    self.set_retreiver()
def __init__(self):
    self.etiquette = Etiquette('Voth Group Readings', '0.0alpha', 'My Project URL', '*****@*****.**')
    self.retreiver = Works(etiquette=self.etiquette)
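# A short usage sketch for the class the two methods above belong to. The class
# name CrossrefRetriever is hypothetical; only set_etiquette, etiquette and
# retreiver come from the original code, and the URL, email and DOI below are
# placeholders.
retriever = CrossrefRetriever()
retriever.set_etiquette('Voth Group Readings', '0.0alpha',
                        'https://example.org/voth-readings', 'me@example.org')
record = retriever.retreiver.doi('10.0000/example.doi')  # Works.doi() returns the work's metadata dict, or None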
def get_doi(entry, config):
    has_doi = bib_parser.has_doi(entry)
    my_etiquette = Etiquette(constants.PROJECT_NAME, constants.VERSION,
                             constants.URL, constants.EMAIL)
    max_levenshtein_distance = config.get_max_levenshtein_distance()
    update_URL = config.get_update_URL()
    works = Works(etiquette=my_etiquette)

    if not has_doi and bib_parser.has_url(entry):
        entry_url = bib_parser.get_url(entry)
        if "doi" in entry_url:
            doi = cleaner.clean_doi(entry_url)
            if is_crossref_work(doi):
                crossref_info = works.doi(doi)
                if crossref_is_similar(crossref_info, entry, max_levenshtein_distance):
                    entry = set_doi(entry, doi, update_URL)
                    has_doi = True

    if not has_doi:
        # we try to find the doi for the title
        entry_title = bib_parser.get_title(entry)
        entry_title = cleaner.clean_braces(entry_title)
        author = bib_parser.get_author(entry)
        first_author = splitname(author[0], strict_mode=False)
        first_author_last_name = first_author["last"][0]
        query_parameters = {
            "author": first_author_last_name,
            "bibliographic": entry_title
        }
        works_query = works.query(**query_parameters)
        works_query = works_query.sort("score").order("desc").select(["title", "DOI"])
        i_i_item = 0
        max_items = min(works_query.count(), 10)
        works_results = iter(works_query)
        while i_i_item < max_items and not has_doi:
            i_item = next(works_results)
            if crossref_is_similar(i_item, entry, max_levenshtein_distance):
                doi = cr_parser.get_doi(i_item)
                entry = set_doi(entry, doi, update_URL)
                has_doi = True
            i_i_item += 1
    else:
        # We check to see if the doi is correct
        doi = bib_parser.get_doi(entry)
        doi = cleaner.clean_doi(doi)
        if is_crossref_work(doi):
            crossref_info = works.doi(doi)
            if crossref_is_similar(crossref_info, entry, max_levenshtein_distance):
                entry = set_doi(entry, doi, update_URL)
            else:
                entry.pop("doi", None)
                if "doi" in bib_parser.get_url(entry):
                    entry.pop("url", None)
                has_doi = False
        else:
            entry = set_doi(entry, doi, update_URL)

    return entry, has_doi
import feedparser
import time
from selenium import webdriver
import random
import re
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
from crossref.restful import Works, Etiquette

# my_etiquette = Etiquette('My Project Name', 'My Project version', 'My Project URL', 'My contact email')
xEtiquette = Etiquette(application_name='not yet set',
                       application_version='not yet set',
                       application_url='not yet public',
                       contact_email='*****@*****.**')
x_works = Works(etiquette=xEtiquette)


def crossref_get_records_bydoi(xDOIStr):
    '''
    get data on a given article with a doi
    # LATER : TO DO
    # ADD AFFILIATION INFO, WHERE WE HAVE IT
    '''
    xDOI_Data = x_works.doi(xDOIStr)
    # ---- Results:
You may use, distribute and modify this code under the terms of the GPLv3 license.
'''

import logging
from crossref.restful import Works, Etiquette
#from habanero import Crossref, cn

try:
    from . import sqlitedb
except ImportError:
    import sqlitedb

#from ..gui import __version__
__version__ = 'v0.1alpha'

ETIQUETTE = Etiquette('MeiTing-Trunk', __version__, 'github', '*****@*****.**')
LOGGER = logging.getLogger(__name__)


def fetchMetaByDOI(doi):
    works = Works(etiquette=ETIQUETTE)
    data = None  # ensure data is defined even if the request raises
    try:
        data = works.doi(doi)
    except Exception:
        rec = 1
    if data is None:
        rec = 1
    else:
class Prompt(JournalAdapter):
    name = "Prompt"
    issn = '2578-9430'
    doi_prefix = '10.31719'
    uses_jats = False
    metadata_source = 'Crossref'
    base_urls = ['https://thepromptjournal.com', 'http://thepromptjournal.com']
    archive_basename = 'prompt'
    etiquette = Etiquette('PubArchiver', pubarchiver.__version__,
                          'https://github.com/caltechlibrary/pubarchiver',
                          '*****@*****.**')

    def all_articles(self):
        articles = []
        try:
            works = Works(etiquette=Prompt.etiquette)
            if __debug__: log(f'asking Crossref for all works by {self.doi_prefix}')
            for item in works.filter(prefix=self.doi_prefix):
                doi = item.get('DOI', '')
                title = item.get('title', [''])[0]
                online = item.get('published-online', None)
                if not online or 'date-parts' not in online:
                    if __debug__: log(f'skipping {doi} lacking published-online')
                    continue
                else:
                    date = '-'.join(format(x, '02') for x in online['date-parts'][0])
                    if __debug__: log(f'keeping publication {doi} dated {date}')
                pdf = pdf_link(item.get('link', []))
                jats = ''
                image = ''
                basename = tail_of_doi(doi)
                status = 'complete' if all([pdf, doi, title, date]) else 'incomplete'
                articles.append(Article(self.issn, doi, date, title, basename,
                                        pdf, jats, image, status))
        except Exception as ex:
            if __debug__: log(f'crossref API exception: {str(ex)}')
            raise ServerError(f'Failed to get data from Crossref: {str(ex)}')
        return articles

    def articles_from(self, doi_file):
        '''Returns a list of `Article` tuples from a file of DOIs.'''
        if __debug__: log(f'reading {doi_file}')
        requested_dois = []
        with open(doi_file, 'r') as file:
            requested_dois = [line.strip() for line in file]
        num = len(requested_dois)
        # I'd use pluralized() here, but it matches case when it adds a 's',
        # and is confused by DOI which is an acronym. Must add 's' ourselves.
        inform(f'Found {num} DOI{"s" if num > 1 else ""} in {doi_file}.')
        if not requested_dois:
            if __debug__: log(f'could not read any lines from {doi_file}')
            return []

        all_articles = self.all_articles()
        all_dois = [article.doi for article in all_articles]
        skipped = 0
        for doi in requested_dois:
            if doi not in all_dois:
                warn(f'Skipping "{doi}" because it is unknown for this journal.')
                skipped += 1
        if skipped:
            kept = num - skipped
            inform(f'Using {kept} DOI{"s" if kept > 1 else ""} from {doi_file}.')
        return [article for article in all_articles if article.doi in requested_dois]

    def article_metadata(self, article):
        try:
            works = Works(etiquette=Prompt.etiquette)
            if __debug__: log(f'asking Crossref for data about {article.doi}')
            data = works.doi(article.doi)
            year = article.date.split('-')[0]
            file = tail_of_doi(article.doi) + '.pdf'
            field = lambda key: data.get(key, '')
            if isinstance(field('license'), list) and len(field('license')) > 1:
                rights_link = field('license')[0]['URL']
            else:
                rights_link = 'https://creativecommons.org/licenses/by-nc/4.0/'
            xmldict = {
                'resource': {
                    '@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
                    'identifier': {'@identifierType': 'DOI', '#text': article.doi},
                    'journal': {'#text': self.name},
                    'volume': {'#text': field('volume')},
                    'issue': {'#text': field('issue')},
                    'publisher': {'#text': field('publisher')},
                    'publicationYear': {'#text': year},
                    'e-issn': {'#text': self.issn},
                    'file': {'#text': file},
                    'dates': {'date': {'#text': article.date}},
                    'titles': {'title': {'#text': article.title}},
                    'creators': {'creator': creator_list(field('author'))},
                    'descriptions': {
                        'description': {
                            '@descriptionType': 'Abstract',
                            '#text': strip_tags(field('abstract'))
                        }
                    },
                    'rightsList': {
                        'rights': {'#text': copyright_text(field('author'), year)},
                        'rightsURI': {'#text': rights_link}
                    },
                }
            }
            return xmldict
        except Exception as ex:
            if __debug__: log(f'crossref API exception: {str(ex)}')
            foo = ex
            import pdb
            pdb.set_trace()
import six.moves.urllib.request
import re
import shutil
import tempfile
import uuid

from crossref.restful import Works, Etiquette
import bibtexparser

import papers
from papers.config import cached
from papers import logger
from papers.encoding import family_names, latex_to_unicode

my_etiquette = Etiquette('papers', papers.__version__,
                         'https://github.com/rchg/papers', '*****@*****.**')


class DOIParsingError(ValueError):
    pass


class DOIRequestError(ValueError):
    pass


# PDF parsing / crossref requests
# ===============================
from crossref.restful import Members, Etiquette, Works

import bibclean.config.constants as constants
import bibclean.crossref_tools.parser as parser

my_etiquette = Etiquette(constants.PROJECT_NAME, constants.VERSION,
                         constants.URL, constants.EMAIL)
works = Works(etiquette=my_etiquette)
members = Members(etiquette=my_etiquette)


def get_editors_from_ISBN(isbn, work_type):
    if work_type == "book-chapter":
        work_type = "book"
    elif work_type == "proceedings-article":
        work_type = "proceedings"
    else:
        raise KeyError("Unknown type!")

    works_results = works.query(isbn).filter(type=work_type)
    works_results = works_results.sort("score").order("desc")
    for i_work in works_results:
        if parser.has_ISBN(i_work) and parser.has_editor(i_work):
            if isbn == parser.get_ISBN(i_work):
                return parser.get_editor(i_work)
    return None


def get_member_ID_from_publisher(publisher_name):
    members_query = members.query(publisher_name).select("id")