def recursiveReferenceAdd(sql, cursor, item):
    # Recursively insert a Crossref work (`item`) and all of its references
    # into the research table, recording parent->child links in the reference
    # table. Returns the row id of the inserted work, or -1 when the work was
    # a duplicate / not inserted.
    # NOTE(review): Python 2 syntax (print statement, xrange).
    works = Works()
    returnId = -1
    # Check for duplicates in the database
    tableEntry = isInTable(cursor,item)
    if tableEntry[0] == False:
        returnId = addToResearchTable(cursor,item)
    else:
        print "Got duplicate"
        returnId = tableEntry[1]
        # Deliberate early exit: returning -1 stops the recursion on works we
        # have already seen (this also prevents infinite loops on citation
        # cycles), at the cost of never refreshing a duplicate's references.
        return -1#returnId #TODO: Allow a duplicate to check the references and update them
    # Continue checking references recursively; Crossref items may carry the
    # DOI of a reference under either a lowercase 'doi' or uppercase 'DOI' key.
    if 'reference' in item:
        for i in xrange(len(item['reference'])):
            if 'doi' in item['reference'][i]:
                if item['reference'][i]['doi'] != None:
                    returnId2 = recursiveReferenceAdd(sql,cursor,works.doi(item['reference'][i]['doi']))
                    # Add relations between research papers
                    if (returnId != -1 and returnId2 != -1):
                        addToReferenceTable(cursor, returnId, returnId2)
            elif 'DOI' in item['reference'][i]:
                if item['reference'][i]['DOI'] != None:
                    returnId2 = recursiveReferenceAdd(sql,cursor,works.doi(item['reference'][i]['DOI']))
                    #Add relations between research papers
                    if (returnId != -1 and returnId2 != -1):
                        addToReferenceTable(cursor, returnId, returnId2)
    # Commit once per completed subtree.
    sql.commit()
    return returnId
def get_xref(doi):
    # Fetch the Crossref record for `doi` and return it (a dict, or whatever
    # falsy value the API returned on a miss). Returns None implicitly when
    # the Crossref client raises ValueError.
    # NOTE(review): Python 2 syntax (print statements).
    xrefapi = Works()
    try:
        xref_data = xrefapi.doi(doi)
        if not xref_data:
            print "Could not get data for DOI:", doi
            # NOTE(review): this issues a second, redundant network request
            # just to print the (empty) result; printing xref_data would do.
            print xrefapi.doi(doi)
        return xref_data
    except ValueError as e:
        print "Error: {}".format(e)
def summarydo(doi):
    # Build a summary JSON file for `doi` with three sections:
    #   'info' - parsed Crossref metadata for the work itself,
    #   'ref'  - parsed metadata for each reference that carries a DOI,
    #   'cit'  - per-year citation counts scraped from Baidu Xueshu.
    # The result is written to <cwd>\media\sum\<doi>.json (Windows-style path).
    summary_file = {'info': {}, 'cit': {}, 'ref': {}}
    wk = Works()
    item = wk.doi(doi)
    info = parse_info(doi, item)
    summary_file['info'] = info
    print('info is ok')
    # NOTE(review): this re-fetches the same DOI a second time; `item` from
    # the first request could be reused.
    works = Works()
    item = works.doi(doi)
    if ('reference' in item.keys()):
        references = item['reference']
        for reference in references:
            # Only references that expose a DOI can be resolved.
            if ('DOI' in reference.keys()):
                ref_doi = reference['DOI']
                summary_file['ref'][ref_doi] = {}
                wk = Works()
                ref_item = wk.doi(ref_doi)
                ref_info = parse_info(ref_doi, ref_item)
                summary_file['ref'][ref_doi] = ref_info
    else:
        print("无引用信息" + doi)
    print('ref is ok')
    # Scrape Baidu Xueshu's search page for the embedded citation chart data.
    web_url = r'http://xueshu.baidu.com/s?wd=' + doi \
              + r'&rsv_bp=0&tn=SE_baiduxueshu_c1gjeupa&rsv_spt=3&ie=utf-8&f=8&rsv_sug2=1&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D&rsv_n=2'
    res = urllib.request.urlopen(web_url)
    html = res.read().decode('utf-8')
    # The page embeds a JS array `lineMapCitedData = [...]` with the counts.
    matchObj = re.findall('lineMapCitedData = \[(.*)\]', html, re.M | re.I)
    try:
        cited = re.findall('"cited":([^,]+)', matchObj[0], re.M | re.I)
        year = re.findall('"year":([^\}]+)', matchObj[0], re.M | re.I)
        total = re.findall('"total":([^,]+)', matchObj[0], re.M | re.I)
        length = len(cited)
        for i in range(0, length):
            # Values come back as quoted strings, e.g. "12"; strip and convert.
            ctmp = cited[i].replace('"', '')
            ctmp = int(ctmp)
            ttmp = total[i].replace('"', '')
            ttmp = int(ttmp)
            summary_file['cit'][year[i]] = {'cited': ctmp, 'total': ttmp}
        print('cit is ok')
    except:
        # IndexError when Baidu has no chart data for this DOI (matchObj empty).
        print('baidu未收录cit信息')
    # Write the summary next to the app under media\sum; '/' in DOIs is not a
    # legal filename character, so replace it.
    currentPath = os.getcwd() + '\\media\\sum\\'
    json_output = currentPath + doi.replace("/", "_") + ".json"
    with open(json_output, "w", encoding='utf-8') as f:
        f.write(json.dumps(summary_file, ensure_ascii=False))
        # json.dump(json_file, f)
    print("加载入json文件完成...")
def vis_cit(request):
    """Django view: resolve the POSTed keyword to a DOI and return the id of
    its cached citation file, generating the file via Crossref on a miss.

    Responds with the body 'TitleError' when the keyword cannot be resolved
    to a DOI, and 'DoiError' when Crossref has no record for the DOI.
    """
    keyword = request.POST.get('keyword')
    cit_doi = doi_trans(keyword)
    if cit_doi is None:
        return HttpResponse('TitleError')
    # Look up the cached file id (e.g. 1, 2, 3) for this DOI.
    cit_file_id = file_find(cit_doi)
    print(cit_file_id)
    # Check whether the cit database already has a record for this DOI.
    cit_flag = cit_find(cit_doi)
    if cit_flag == 'existed':
        print("cit数据库中找到记录")
        return HttpResponse(cit_file_id)
    print("cit数据库中无记录")
    wk = Works()
    item = wk.doi(cit_doi)
    if item is None:
        # Bug fix: the original set signal = 'DoiError' but never returned
        # it, so the client received a file id for a DOI Crossref does not
        # know. Report the error instead.
        return HttpResponse('DoiError')
    # Generate the citation file and record the DOI in the cit table.
    citDo.citdo(cit_doi, cit_file_id)
    cit_record = Cit(cit_doi=cit_doi, create_ctime=datetime.now())
    cit_record.save()
    return HttpResponse(cit_file_id)
def get_crossref_metadata(file_name):
    """Fetch Crossref metadata for every research-article DOI listed in
    *file_name* and write one JSON array per journal under
    ../data/json_files/crossref_api/.

    :param file_name: path to a JSON file containing a list of journal
        objects, each with "journal_title" and "research_articles" (a list
        of DOIs) keys.
    """
    # A single Works instance is reused for all Crossref requests.
    works = Works()
    # Load the full list of DOIs grouped by journal.
    with open(file_name, "r", encoding="utf-8") as journals_articles:
        json_data = json.load(journals_articles)
    # One output file per journal. Renamed loop variables: the original
    # shadowed the builtins `object` and `id`.
    for journal in json_data:
        doi_list = journal["research_articles"]
        # Sanitize the journal title into a usable file name.
        new_file_name = (f'{journal["journal_title"]}_crossref_metadata.json').replace(
            " ", "_").replace("/", "").replace(":", "")
        # Collect every record, then dump once as a single valid JSON array.
        # Bug fix: the original opened the file in append mode and wrote
        # "["..."]" manually, which produced concatenated, invalid JSON
        # whenever the file already existed from a previous run.
        records = [works.doi(doi) for doi in doi_list]
        with open(f'../data/json_files/crossref_api/{new_file_name}', "w",
                  encoding="utf-8") as fd:
            json.dump(records, fd)
def parse_reference(cls, reference, bibtex_parser=None):
    """Resolve a raw reference string against Crossref.

    Searches *reference* for a DOI; on a match, fetches the Crossref record
    and returns a dict with raw_reference, doi, author, title and, when
    available, journal, volume and year. Returns None when no DOI is found,
    the lookup yields nothing, or the record carries no title.
    """
    ret = None
    crossref_match = const.DOI_RE.search(reference)
    if crossref_match:
        works = Works(etiquette=const.CROSSREF_ETIQUETTE)
        doi = works.doi(crossref_match.group(0))
        if doi:
            ret = {'raw_reference': reference}
            ret['doi'] = crossref_match.group(0)
            # Join all authors as "Given Family", tolerating missing parts.
            ret['author'] = ''
            if 'author' in doi:
                ret['author'] = ', '.join([
                    f'{author.get("given", "")} {author.get("family", "")}'
                    for author in doi['author']
                ])
            if 'title' in doi and doi["title"]:
                ret['title'] = doi['title'][0]
            else:
                logger.warning(f"No Title available for {crossref_match} ")
                return None
            if 'container-title' in doi and doi['container-title']:
                ret['journal'] = doi['container-title'][0]
            if 'volume' in doi:
                # Bug fix: Crossref's 'volume' field is a plain string (unlike
                # 'title'/'container-title', which are lists), so the old
                # doi['volume'][0] kept only its first character ('1' out of
                # '123').
                ret['volume'] = doi['volume']
            if 'published-online' in doi:
                ret['year'] = doi['published-online']['date-parts'][0][0]
    return ret
def cite_it(bot, chat_id, doi):
    """Send the BibTeX citation for *doi* to the Telegram chat.

    Checks CrossRef coverage first, fetches the BibTeX record via get_bib,
    stores it under downloads/<doi>.bib and sends the file to the user.
    Always returns the SEARCH_RESULTLS conversation state.
    """
    # headers = {"content-type":"application/x-bibtex"}
    # resp = requests.get("https://doi.org/" + DOI, headers=headers)
    # return resp.content
    works = Works()
    if not works.agency(doi):
        bot.send_message(
            chat_id=chat_id,
            text="Этот документ не входит в базу цитирования CrossRef...")
        return SEARCH_RESULTLS
    # Fix: dropped the unused `record = works.doi(doi)` lookup — it cost a
    # network round-trip and its result was never read.
    found, meta_bib = get_bib(doi)
    if not found:
        bot.send_message(chat_id=chat_id, text="Документ не найден...")
        return SEARCH_RESULTLS
    bot.send_message(chat_id=chat_id, text="Цитирование по CrossRef:")
    # '/' is not a legal filename character.
    filename = doi.replace('/', '-')
    bib_path = os.path.join('downloads', filename + '.bib')
    with open(bib_path, 'w+') as downloaded_file:
        downloaded_file.write(meta_bib)
    # Bug fix: the handle passed to send_document was never closed; the
    # context manager guarantees it is released after the upload.
    with open(bib_path, 'rb') as bib_file:
        bot.send_document(chat_id=chat_id, document=bib_file)
    return SEARCH_RESULTLS
def get_info_by_DOI(DOI: str) -> Dict:
    '''This function takes a DOI str, requests information about the corresponding
    article via metapub or crossref and checks if all necessary information has
    been retrieved.

    :param DOI: the DOI to look up.
    :return: a dict of article attributes tagged with retrieval provenance
        ('MetaPub' or 'Crossref'); may be empty if both backends fail.
    '''
    article_dict = {}
    fetch = PubMedFetcher()
    try:
        article = fetch.article_by_doi(DOI)
        # Copy every public attribute of the article into the dict.
        # Fix: use getattr() instead of the original eval('article.' + info) —
        # equivalent here, but safe and faster.
        for info in dir(article):
            if not info.startswith('_'):
                article_dict[info] = getattr(article, info)
        # Add data retrieval info to the dict
        article_dict = add_retrieval_information(article_dict, 'MetaPub',
                                                 'DOI', DOI)
    except MetaPubError:
        # If MetaPub fails, fall back to the Crossref API.
        # If there is a timeout, try again (up to 5 times).
        for _ in range(5):
            try:
                works = Works()
                article_dict = works.doi(DOI)
                break
            except Exception:
                # Deliberate best-effort retry; narrowed from a bare except
                # so Ctrl-C still interrupts the loop.
                pass
        # Add data retrieval info to the dict
        article_dict = add_retrieval_information(article_dict, 'Crossref',
                                                 'DOI', DOI)
    return article_dict
def checkDoiJournalArticle(doi):
    """Query Crossref for *doi* and report whether it is a journal article.

    Returns a tuple (isJournal, publicationDate, doi, author):
      - isJournal: the DOI itself when the work's type is 'journal-article',
        otherwise ''.
      - publicationDate: the earlier of the print/online publication years
        (9999 when neither is present, 0 when the lookup fails).
      - author: the Crossref author list ([] when absent or on failure).
    """
    isJournal = ""
    publicationDate = 0
    printDate = 9999
    onlineDate = 9999
    works = Works()
    author = []
    try:
        data = works.doi(doi)
        if 'type' in data:
            if data['type'] == 'journal-article':
                isJournal = doi
        if 'author' in data:
            author = data['author']
        if 'published-print' in data:
            printDate = data['published-print']['date-parts'][0][0]
        if 'published-online' in data:
            onlineDate = data['published-online']['date-parts'][0][0]
        publicationDate = min(printDate, onlineDate)
        return isJournal, publicationDate, doi, author
    except KeyboardInterrupt:
        exit()
    except Exception:
        # Bug fix: was a bare `except:`, which also swallowed SystemExit
        # (KeyboardInterrupt is already handled above).
        print('DOI NOT FOUND: ', doi)
        return isJournal, publicationDate, doi, author
def getRefByDOI(doi, debug=False):
    """Fetch the Crossref record for *doi* and render it as a BibTeX string.

    Returns None when Crossref has no record for the DOI. (The original
    crashed with AttributeError on a failed lookup, because it called
    .items() on the None returned by works.doi().)
    """
    works = Works()
    record = works.doi(doi)
    if not record:
        return None
    # Keep only the keys BibTeX understands.
    ref = {key: value for key, value in record.items() if key in ALL_BIBTEX_ATTR}
    ref['newrender'] = renderBibFromDict(ref)
    # The original's trailing `if ref:` was always true at this point (the
    # dict just gained a 'newrender' key), so return unconditionally.
    return convertDictToBibStr(ref)
def getTitle(eventdoi):
    """Return the title of the work identified by *eventdoi*, flattening the
    Crossref title list into a single string."""
    metadata = Works().doi(eventdoi)
    return ''.join(metadata['title'])
def vis_ref(request):
    """Django view: resolve the POSTed keyword to a DOI and return the id of
    its cached reference file, generating the file via Crossref on a miss.

    Responds with the body 'TitleError' when the keyword cannot be resolved
    to a DOI, and 'DoiError' when Crossref has no record for the DOI.
    """
    keyword = request.POST.get('keyword')
    ref_doi = doi_trans(keyword)
    if ref_doi is None:
        return HttpResponse('TitleError')
    # Look up the cached file id (e.g. 1, 2, 3) for this DOI.
    ref_file_id = file_find(ref_doi)
    print(ref_file_id)
    # Check whether the ref database already has a record for this DOI.
    ref_flag = ref_find(ref_doi)
    if ref_flag == 'existed':
        print("ref数据库中找到记录")
        return HttpResponse(ref_file_id)
    print("ref数据库中无记录")
    wk = Works()
    item = wk.doi(ref_doi)
    if item is None:
        # Bug fix: the original set signal = 'DoiError' but never returned
        # it, so the client received a file id for a DOI Crossref does not
        # know. Report the error instead.
        return HttpResponse('DoiError')
    # Generate the reference file and record the DOI in the ref table.
    refDo2.refdo(ref_doi, ref_file_id)
    ref_record = Ref(ref_doi=ref_doi, create_rtime=datetime.now())
    ref_record.save()
    return HttpResponse(ref_file_id)
def form_valid(self, form):
    # Django CreateView/UpdateView hook: enrich the publication being saved
    # with metadata fetched by DOI (Crossref) or ISBN (isbnlib `meta`) before
    # persisting it. Re-renders the form with an error when the user claimed
    # a DOI/ISBN that could not be resolved.
    self.publication = form.save(commit=False)
    self.publication.created_by = self.request.user
    works = Works()
    if self.publication.DOI != "" and works.doi_exists(self.publication.DOI):
        # DOI path: pull year, title, first author, subjects and URL from
        # the Crossref record.
        paper_data_result = works.doi(self.publication.DOI)
        self.publication.publication_year = str(paper_data_result.get('created').get('date-parts')[0][0])
        self.publication.title = paper_data_result.get('title')[0]
        # Only the first author is stored, as "Given,Family".
        self.publication.author = f"{paper_data_result.get('author')[0].get('given')},{paper_data_result.get('author')[0].get('family')}"
        # Fall back to the user-entered subject when Crossref has none.
        sub = paper_data_result.get("subject", [self.publication.subject])
        self.publication.subject = ', '.join([str(elem) for elem in sub])
        self.publication.URL = paper_data_result.get('URL')
    elif self.publication.ISBN != "" and is_isbn13(self.publication.ISBN):
        # ISBN path: fetch book metadata instead.
        book_data_result = meta(self.publication.ISBN)
        self.publication.publication_year = book_data_result.get('Year')
        self.publication.title = book_data_result.get('Title')
        self.publication.author = book_data_result.get('Authors')[0]
    elif self.publication.crossref and (self.publication.DOI or self.publication.ISBN):
        # The user ticked "has DOI/ISBN" but neither identifier resolved:
        # bounce back to the form with an error message.
        messages.error(self.request, 'DOI/ISBN no encontrado. Cargar datos y desmarcar el campo "tiene DOI/ISBN"')
        return render(self.request, 'bibliography/publication_form.html', {'form': form})
    self.publication.save()
    messages.success(self.request, "Registro realizado con exito")
    return redirect('bibliography:publication_detail', pk=self.publication.pk)
def getMetadataFromDOI(id):
    """Look up metadata for a DOI, first via the crossref Works API and,
    when that yields nothing, via the CrossRefClient fallback."""
    metadata = Works().doi(id)
    if metadata is not None:
        return metadata
    # Fallback service; its payload carries an internal 'id' field that we
    # strip before returning.
    fallback = CrossRefClient().doi2json(id)
    fallback.pop('id', None)
    return fallback
class DOIMetadata(SqliteDataResource):
    """
    This resource is a cached set of DOI metadata pulled from Crossref's API for the
    DOIs saved in the OutputDOIs resource. Having this cached speeds up the ETL
    processing and reduces our calls to the Crossref API which is the nice thing
    to do.
    """

    def __init__(self, context):
        # Backing store is an on-disk sqlite database.
        super().__init__(context, DataResource.data_dir / 'doi_metadata.db')
        # Crossref "polite pool" identification for all requests.
        etiquette = Etiquette('SYNTH transform', '0.1',
                              'https://github.com/NaturalHistoryMuseum/synth_transform',
                              '*****@*****.**')
        self.works = Works(etiquette=etiquette)
        self._handled = set()  # all the dois that are checked in this run
        self._added = set()  # all the dois that are added in this run
        self._errors = {}  # doi -> exception raised while fetching it

    def _get_metadata(self, conn, doi):
        """
        Retrieve metadata for a single DOI and add it to a SqliteDataResource with
        an open SQLiteDict.

        :param conn: SqliteDataResource with an open SQLiteDict, e.g. 'self' within 'with self:'
        :param doi: the DOI to search crossref for
        """
        if doi is None:
            return
        self._handled.add(doi)
        try:
            doi_metadata = self.works.doi(doi)
            if doi_metadata:
                # Keys are normalised to uppercase DOIs.
                conn.add(doi_metadata['DOI'].upper(), doi_metadata)
                self._added.add(doi)
        except Exception as e:
            # Record per-DOI failures rather than aborting the whole run.
            self._errors[doi] = e

    def update(self, context, target, *synth_sources):
        """
        Retrieve and store metadata for each the DOIs stored in the OutputDOIs
        resource.
        """
        with self:
            super(DOIMetadata, self).update(context, target, *synth_sources)
        # Reset per-run bookkeeping before fetching.
        self._handled = set()
        self._added = set()
        self._errors = {}
        doi_cache = OutputDOIs(context)
        with doi_cache:
            # Deduplicate the cached DOI values.
            found_dois = list(set(doi_cache.data.values()))
        workers = context.config.resource_opt('doimetadata.threads', 20)
        # NOTE(review): `executor` is never used — tqdm's thread_map creates
        # its own pool sized by max_workers, so the ThreadPoolExecutor here
        # appears redundant; confirm before removing.
        with self, ThreadPoolExecutor(workers) as executor:
            thread_map(lambda x: self._get_metadata(self, x), found_dois,
                       desc='Crossref', unit=' dois', leave=False, position=1)
def cross_doi(publication):
    """Populate *publication* in place with Crossref metadata for its DOI:
    publication year, title, first author, subject and URL."""
    record = Works().doi(publication.DOI)
    # Creation date-parts give [[year, month, day]].
    created_parts = record.get('created').get('date-parts')
    publication.publication_year = str(created_parts[0][0])
    publication.title = record.get('title')[0]
    # Only the first author is kept, formatted "Given,Family".
    first_author = record.get('author')[0]
    publication.author = f"{first_author.get('given')},{first_author.get('family')}"
    # Fall back to the existing subject when Crossref supplies none.
    publication.subject = record.get("subject", [publication.subject])[0]
    publication.URL = record.get('URL')
def request_doi_api():
    """Resolve the configured source DOI via Crossref and return a QLabel
    holding a hyperlink to its URL, or None when the lookup fails."""
    doi_value = parameters["source"][prop]
    record = Works().doi(doi_value)
    if not record:
        return None
    url = record["URL"]
    return QLabel("<html><a href=\"" + url + "\">" + url + "</a></html>")
def doi_BibJSON_pipeline(dois):
    """ Creates a list of BibJSON objects from a list of DOIs passed """
    works = Works()
    # Resolve each DOI through Crossref and serialize its BibJSON form.
    return [json.dumps(create_BibJSON(works.doi(doi), doi)) for doi in dois]
def download_details(doi_list):
    """Fetch Crossref metadata for each DOI in *doi_list*.

    Entries are stripped of surrounding whitespace; empty lines and lines
    starting with '#' (comments) are skipped. Each fetched record is tagged
    with its DOI and passed through standarize().

    :return: list of standardized work records.
    """
    works = Works()
    details = []
    for doi in doi_list:
        doi = doi.strip()
        # Bug fix: check for an empty string before looking at the first
        # character — the original `doi[0] == '#'` raised IndexError on
        # blank lines. (Also dropped the unused enumerate counter.)
        if not doi or doi.startswith('#'):
            continue
        work = works.doi(doi)
        work['doi'] = doi
        details.append(standarize(work))
    return details
def get_crossref_records(dois):
    """Query Crossref for every DOI in *dois* and return a {doi: record}
    dict, printing a notice for each DOI that has no record."""
    works = Works()
    print('searching crossref for all DOIs, this might take a few minutes...')
    records = {}
    for doi in dois:
        record = works.doi(doi)
        if record is None:
            print('missing crossref record for', doi)
        else:
            records[doi] = record
    return records
def check_doi(self, doi):
    """Return True when *doi* resolves on Crossref and its title field does
    not carry the inactive-DOI marker."""
    response = Works().doi(doi)
    if response is None:
        return False
    first_title = response[self.crossref_title][0]
    return first_title != self.status_inactive_doi
def on_click_fetch(event):
    """Get metadata for DOI, and return an error if the DOI is not valid (no metadata found)."""
    import dateutil
    from crossref.restful import Works
    import json
    # turn the "Add paper" primary, to remember clicking it again!
    btn_add_paper.button_type = 'primary'
    # Input DOI: (1) if empty use test DOI (2) If Angewandte German Edition, change with International Edition
    inp_doi.value = inp_doi.value.replace("ange","anie") or "10.1021/jacs.9b01891"
    works = Works()
    print("Querying Crossref API for doi {} (this can take several seconds, depending on the server...)".format(inp_doi.value))
    metadata = works.doi(inp_doi.value)
    print("Query done!")
    #print(json.dumps(metadata,sort_keys=True, indent=4)) # Use for debug!
    if not metadata:
        # Lookup failed: flag the button and surface the error in every field.
        btn_doi.button_type = 'danger'
        inp_title.value = inp_year.value = inp_reference.value = inp_paper_id.value = "ERROR: wrong/missing DOI."
        return
    # Crossref 'title' and 'short-container-title' are lists; take the first.
    inp_title.value = str(metadata['title'][0])
    journal = str(metadata['short-container-title'][0])
    if 'volume' in metadata:
        # A volume number implies the paper has been assigned to an issue.
        already_in_issue = True
        volume = metadata['volume']
        if 'published-print' in metadata:  # ACS, wiley
            year = str(metadata['published-print']['date-parts'][0][0])
        elif 'created' in metadata:  # RSC
            year = str(metadata['created']['date-parts'][0][0])
        else:
            year = 'ERROR: year not found.'
    else:
        # not yet in an issue: assuming that it will be published at the same year of today
        already_in_issue = False
        year = str(datetime.datetime.now().year)
    inp_year.value = year
    # Assemble the short reference string "journal, year, volume[, page]".
    if already_in_issue:
        if 'page' in metadata:  # most of the journals
            inp_reference.value = "{}, {}, {}, {}".format(journal, year, volume, metadata['page'])
        else:  # NatComm or not yet in an issue
            inp_reference.value = "{}, {}, {}".format(journal, year, volume)
    else:
        inp_reference.value = "{}, {}, {}".format(journal, year, "in press")
    inp_paper_id.value = mint_paper_id(doi=inp_doi.value, year=inp_year.value)
    btn_doi.button_type = 'success'
def main():
    """Read DOIs from DOIValues.csv one row at a time and ingest their
    Crossref content-domain metadata into the database.

    Relies on the module-level `csvLineCount`, `connection` and `cursor`.
    """
    # Tracks the row that is entered in from the DOI list.
    currentRows = 0
    with open('DOIValues.csv', newline='') as csvFile:
        # A line is read in from file instead of the whole file in order to
        # be memory efficient
        lineIn = csv.reader(csvFile)
        # Loop continues while there are rows left to process in the .csv file
        while currentRows < csvLineCount:
            # Reads in the next line
            csvRow = next(lineIn)
            csvLineString = "".join(csvRow)
            try:
                # Creates a "works" object from the Crossref metadata API
                from crossref.restful import Works
                works = Works()
                # Passes DOI from the .csv file to Crossref to get all
                # metadata info that is available
                doiMetaData = works.doi(csvLineString)
                if (doiMetaData['content-domain']):
                    contentDomainInfo = doiMetaData['content-domain']
                    print("Content Domain information for DOI: " + csvLineString + " found")
                    contentDomainMetaDataIngest.contentDomainIngest(
                        connection, cursor, csvLineString, contentDomainInfo)
            except ImportError:
                print("Installation of the Crossref API is needed")
            except Exception:
                # Bug fix: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit and made the loop unstoppable.
                print("Unknown Error")
            # Increases counter to keep track of whether at the end of .csv
            # file. (Dropped the dead clamp `if currentRows > csvLineCount`:
            # the loop condition already guarantees it can never be exceeded.)
            currentRows += 1
def run(self):
    # Worker-thread loop: consume "doi+category" strings from self.doi_queue,
    # fetch each DOI's metadata from Crossref, and merge the parsed info into
    # the shared self.ref_file dict (subject statistics, root info, nodes).
    # Categories: 0 = the root work, >0 = reference depth; only cat < 2 is
    # expanded further via parse_ref (i.e. two levels of references).
    while True:
        if (self.doi_queue.empty() == False):
            # Queue items are encoded as "<doi>+<category>".
            tmp = self.doi_queue.get()
            doi = tmp.split('+')[0]
            cat = int(tmp.split('+')[1])
            try:
                wk = Works()
                item = wk.doi(doi)
            except:
                # Network failure: mark with a sentinel and carry on.
                print('网络状态不良好' + doi + '请求失败')
                item = 'network-error'
            if (item == None):
                # Crossref has no record for this DOI.
                print(doi + 'crossref未收录该doi')
                item = 'doi-error'
            if (item != 'network-error' and item != 'doi-error'):
                info = parseInfo.parse_info(doi=doi, item=item)
                # The subject statistics are shared across workers; guard the
                # read-modify-write with the lock.
                self.lock.acquire()
                if ('subject' in info.keys()):
                    subject_list = info['subject'].split(',')
                    # Record and tally per-subject info.
                    for s in subject_list:
                        if (s in self.ref_file['statistics']['subject'].keys()):
                            self.ref_file['statistics']['count'][s] += 1
                            l = len(self.ref_file['statistics']['subject'][s])
                            self.ref_file['statistics']['subject'][s][l] = info
                        else:
                            # First sighting of this subject.
                            self.ref_file['statistics']['subjectCount'] += 1
                            self.ref_file['statistics']['count'][s] = 1
                            self.ref_file['statistics']['subject'][s] = {}
                            l = len(self.ref_file['statistics']['subject'][s])
                            self.ref_file['statistics']['subject'][s][l] = info
                self.lock.release()
                if (cat == 0):
                    # Root work: store directly under 'info'.
                    self.ref_file['info'] = info
                else:
                    # Referenced work: append as the next numbered node.
                    # NOTE(review): 'nodes' is written outside the lock —
                    # len() + assignment is not atomic across threads; confirm
                    # whether this is protected elsewhere.
                    length = len(self.ref_file['nodes'])
                    info['cat'] = cat
                    self.ref_file['nodes'][length] = info
                if (cat < 2):
                    # Enqueue this work's own references for processing.
                    parse_ref(item, cat, doi, self.ref_file, self.doi_queue)
            self.doi_queue.task_done()
def fetch_pubmed_info(self, pmid, pubmed_cache_key):
    # Fetch publication metadata for an identifier of the form
    # 'PMID:<uid>' (via the NCBI eutils esummary endpoint) or 'DOI:<uid>'
    # (via Crossref), cache the JSON-serialized result in redis under
    # `pubmed_cache_key`, and return 'cached' on success or a human-readable
    # error string on failure.
    target, uid = pmid.split(':', 1)
    if target == 'PMID':
        # Actually make the request
        postUrl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
        postData = {
            "db": "pubmed",
            "id": uid,
            "version": "2.0",
            "retmode": "json"
        }
        response = requests.post(postUrl, data=postData)
        # When you get the request back
        if not response.ok:
            # logger.debug(f'Pubmed returned a bad response for {uid}, status code {response.status_code}')
            if response.status_code == 429:
                # NCBI rate limit hit.
                return 'Too many robokop pubmed requests'
            return f'Unable to complete pubmed request, pubmed request status {response.status_code}'
        pubmed_payload = response.json()
        if not pubmed_payload or 'result' not in pubmed_payload:
            # logger.debug(f'Pubmed returned a bad json response for {uid}, response json {pubmed_payload}')
            return 'Unable to complete pubmed request, bad pubmed response'
        pubmed_result = pubmed_payload['result']
        if uid not in pubmed_result:
            # logger.debug(f'Pubmed returned a bad json result for {uid}, result {pubmed_result}')
            return 'Unable to complete pubmed request, bad pubmed result'
        # Narrow to this uid's record.
        pubmed_result = pubmed_result[uid]
    elif target == 'DOI':
        # https://github.com/fabiobatalha/crossrefapi
        works = Works()
        pubmed_result = works.doi(uid)
        # logger.debug(f'DOI response {pubmed_result}')
    else:
        return 'Unsupported publication type.'
    # Cache the result; connection parameters come from the environment.
    pubmed_redis_client = redis.Redis(
        host=os.environ['PUBMED_CACHE_HOST'],
        port=os.environ['PUBMED_CACHE_PORT'],
        db=os.environ['PUBMED_CACHE_DB'],
        password=os.environ['PUBMED_CACHE_PASSWORD'])
    pubmed_redis_client.set(pubmed_cache_key, json.dumps(pubmed_result))
    # logger.debug(f'Pubmed response is now cached for pmid {pmid}')
    return 'cached'
def get_complement(self, doi):
    """Return (publication_year, cited_by_count) for *doi* from Crossref, or
    (None, None) when the record is missing or any lookup fails."""
    try:
        response = Works().doi(doi)
        if response is None:
            return None, None
        year = response[self.crossref_created][
            self.crossref_created_date_parts][0][0]
        cited_by = response[self.crossref_cited_by]
        return year, cited_by
    except Exception:
        # Any malformed record or network error degrades to "unknown".
        return None, None
def _parse_document_type(self, doc):
    """
    Returns the document type of a document as a <class 'str'>.
    e.g. 'paper', 'clinical_trial', 'patent', 'news'.
    """
    try:
        doi = self._parse_doi(doc)
        crossref_type = Works().doi(doi)['type']
    except:
        # Any failure (no DOI, lookup error, missing 'type') defaults to
        # 'paper'.
        return 'paper'
    return 'chapter' if crossref_type == 'book-chapter' else 'paper'
def resolve_doi(doi: str) -> dict:
    """Fetch the Crossref record for *doi* and map it onto CIF publication
    fields (authors, journal, title, year, first page)."""
    record = Works().doi(doi)
    return {
        '_publ_author_name': get_names_from_doi(record),
        '_journal_name_full': get_journal_name(record),
        '_publ_section_title': get_paper_title(record),
        '_journal_year': get_publication_year(record),
        '_journal_page_first': get_first_page(record),
    }
def fetchMetaByDOI(doi):
    """Query Crossref for *doi* using the configured etiquette.

    :return: (rec, data) where rec is 0 on success and 1 when the DOI is
        unknown or the request fails, in which case data is None.
    """
    works = Works(etiquette=ETIQUETTE)
    # Bug fix: `data` was unbound when works.doi() raised, so the subsequent
    # `if data is None` check crashed with UnboundLocalError. Initialize it
    # and narrow the bare except.
    data = None
    try:
        data = works.doi(doi)
    except Exception:
        data = None
    rec = 1 if data is None else 0
    LOGGER.info('DOI = %s. Rec of doi query = %s' % (doi, rec))
    return rec, data
def get_apa(doi):
    """Build an APA-style citation string for *doi* from Crossref metadata.

    Returns "Not available" when Crossref has no record for the DOI.

    Robustness fix: records without an 'author' list, or authors without a
    'given' name, no longer raise KeyError/IndexError — they simply
    contribute less to the citation.
    """
    output = Works().doi(doi)
    if output is None:
        return "Not available"
    # "Family, G., " for each author; "".join avoids quadratic concatenation.
    author_parts = []
    for author in output.get('author', []):
        family = author.get('family', '')
        given = author.get('given', '')
        initial = given[0] if given else ''
        author_parts.append(family + ", " + initial + "., ")
    citation = "".join(author_parts)
    citation = citation + "({}). ".format(
        output['published-print']['date-parts'][0][0]) + "{}. ".format(
        output['title'][0]
    ) + output['publisher'] + ", " + "{0}({1}), {2}. doi: {3}".format(
        output['volume'], output["issue"], output['page'], doi)
    return citation