def process_id(id_):
    """Clean up a raw identifier and resolve it to a reference.

    The identifier is stripped of surrounding whitespace and of one trailing
    ``,`` or ``}``. It is then reduced to a bare id depending on what its
    lower-cased form contains:

    * ``"iv:"`` -- everything after the first ``:`` (lower-cased);
    * ``"iv.org"`` -- everything from the category reported by
      ``check_categories`` onwards, or the part after the last ``/`` when no
      category was reported (lower-cased);
    * ``"doi.org"`` -- everything after ``doi.org/`` (original case kept).

    :param id_: The raw identifier string.
    :returns: ``find_file(<id> + ".pdf")`` when the bare id contains
        letters, digits and letters in sequence; otherwise the result of
        ``normalize`` applied to the BibTeX obtained from ``doi2bib`` or from
        ``arxiv2bib`` (post-processed by ``postprocess_arxiv``).
    """
    cleaned = id_.strip()
    if cleaned.endswith((",", "}")):
        cleaned = cleaned[:-1]

    lowered = cleaned.lower()
    # The category is looked up on the cleaned id *before* any prefix is cut.
    category = check_categories(cleaned)

    if "iv:" in lowered:
        cleaned = lowered[lowered.index(":") + 1:]
    elif "iv.org" in lowered:
        if category is None:
            cleaned = lowered[lowered.rindex("/") + 1:]
        else:
            cleaned = lowered[lowered.index(category):]
    elif "doi.org" in lowered:
        # Offsets are computed on the lower-cased text but applied to the
        # original one so the identifier keeps its case.
        cleaned = cleaned[lowered.index("doi.org/") + len("doi.org/"):]

    # letters + digits + letters: handled as a local file name
    if re.search(r'[a-zA-Z]+[0-9]+[a-zA-Z]+', cleaned):
        return find_file(cleaned + ".pdf")

    starts_with_category = category is not None and cleaned.startswith(category)
    if "/" in cleaned and not starts_with_category:
        bibtex = doi2bib(cleaned)
    else:
        bibtex = postprocess_arxiv(arxiv2bib([cleaned])[0])
    return normalize(bibtex)
def get_bibtex(arxiv_id):
    """
    Fetch the BibTeX entry of an arXiv paper.

    .. note::

        Using awesome https://pypi.python.org/pypi/arxiv2bib/ module.

    :param arxiv_id: The canonical arXiv id to get BibTeX from.
    :returns: The BibTeX string of the first result that is not a \
            ``ReferenceErrorInfo``, or ``None`` when there is no such \
            result or the lookup raised ``HTTPError``.

    >>> get_bibtex('1506.06690')
    "@article{1506.06690v2,\\nAuthor = {Lucas Verney and Lev Pitaevskii and Sandro Stringari},\\nTitle = {Hybridization of first and second sound in a weakly-interacting Bose gas},\\nEprint = {1506.06690v2},\\nDOI = {10.1209/0295-5075/111/40005},\\nArchivePrefix = {arXiv},\\nPrimaryClass = {cond-mat.quant-gas},\\nAbstract = {Using Landau's theory of two-fluid hydrodynamics we investigate the sound\\nmodes propagating in a uniform weakly-interacting superfluid Bose gas for\\nvalues of temperature, up to the critical point. In order to evaluate the\\nrelevant thermodynamic functions needed to solve the hydrodynamic equations,\\nincluding the temperature dependence of the superfluid density, we use\\nBogoliubov theory at low temperatures and the results of a perturbative\\napproach based on Beliaev diagrammatic technique at higher temperatures.\\nSpecial focus is given on the hybridization phenomenon between first and second\\nsound which occurs at low temperatures of the order of the interaction energy\\nand we discuss explicitly the behavior of the two sound velocities near the\\nhybridization point.},\\nYear = {2015},\\nMonth = {Jun},\\nUrl = {http://arxiv.org/abs/1506.06690v2},\\nFile = {1506.06690v2.pdf}\\n}"

    >>> get_bibtex('1506.06690v1')
    "@article{1506.06690v1,\\nAuthor = {Lucas Verney and Lev Pitaevskii and Sandro Stringari},\\nTitle = {Hybridization of first and second sound in a weakly-interacting Bose gas},\\nEprint = {1506.06690v1},\\nDOI = {10.1209/0295-5075/111/40005},\\nArchivePrefix = {arXiv},\\nPrimaryClass = {cond-mat.quant-gas},\\nAbstract = {Using Landau's theory of two-fluid hydrodynamics we investigate the sound\\nmodes propagating in a uniform weakly-interacting superfluid Bose gas for\\nvalues of temperature, up to the critical point. In order to evaluate the\\nrelevant thermodynamic functions needed to solve the hydrodynamic equations,\\nincluding the temperature dependence of the superfluid density, we use\\nBogoliubov theory at low temperatures and the results of a perturbative\\napproach based on Beliaev diagrammatic technique at higher temperatures.\\nSpecial focus is given on the hybridization phenomenon between first and second\\nsound which occurs at low temperatures of the order of the interaction energy\\nand we discuss explicitly the behavior of the two sound velocities near the\\nhybridization point.},\\nYear = {2015},\\nMonth = {Jun},\\nUrl = {http://arxiv.org/abs/1506.06690v1},\\nFile = {1506.06690v1.pdf}\\n}"
    """
    # Ask the arxiv2bib module for the reference; an HTTP failure is treated
    # the same way as an empty answer.
    try:
        references = arxiv2bib.arxiv2bib([arxiv_id])
    except HTTPError:
        references = []

    for reference in references:
        if not isinstance(reference, arxiv2bib.ReferenceErrorInfo):
            # First usable reference wins.
            return reference.bibtex()

    # Nothing usable was fetched.
    return None
def run(self):
    """Gather citation keys, fetch their references, and record the outcome.

    Each file in ``self.args.filenames`` is opened and passed to
    ``self._read``; an ``IOError`` adds a "Could not open" entry to
    ``self.messages`` instead of aborting. Keys accepted by
    ``arxiv2bib.is_valid`` and by ``mr2bib.is_valid`` are then looked up.
    For every result, either ``self.error_count`` is incremented (error
    objects) or its BibTeX text is appended to ``self.output``. Finally
    ``self.code`` is set from ``self.tally_errors``.
    """
    keys = []
    # collect all citations
    for path in self.args.filenames:
        try:
            with open(path) as handle:
                keys = keys + self._read(handle)
        except IOError:
            self.messages.append("Could not open %s" % path)

    # arxiv2bib does all keys at once
    arxiv_keys = {key for key in keys if arxiv2bib.is_valid(key)}
    references = list(arxiv2bib.arxiv2bib(arxiv_keys))

    # mr2bib is key per key
    mr_keys = {key for key in keys if mr2bib.is_valid(key)}
    try:
        references = references + list(mr2bib.mr2bib(mr_keys))
    except mr2bib.AuthenticationException:
        self.messages.append("Not authenticated to Mathematical Reviews")

    for reference in references:
        failed = isinstance(
            reference,
            (arxiv2bib.ReferenceErrorInfo, mr2bib.ReferenceErrorInfo))
        if failed:
            self.error_count += 1
        else:
            self.output.append(reference.bibtex())

    self.code = self.tally_errors(references)
def arxiv_parser(filename):
    """Build a small metadata dict for an arXiv PDF from its file name.

    The arXiv id is taken to be the last ``/``-separated component of
    ``filename``, cut at the first ``.pdf``. That id is looked up with
    ``arxiv2bib``.

    :param filename: Path of the PDF; its base name must be the arXiv id.
    :returns: A dict with the keys ``'Author'`` (last whitespace-separated
        word of the last listed author), ``'Year'`` and ``'Journal'``
        (always ``'arXiv'``).
    """
    from arxiv2bib import arxiv2bib

    id_ = filename.split("/")[-1].split(".pdf")[0]
    ref = arxiv2bib([id_])[0]
    output = {
        'Author': ref.authors[-1].split(" ")[-1],
        # arxiv2bib references expose ``year`` (singular); ``years`` does
        # not exist and raised AttributeError.
        'Year': ref.year,
        'Journal': 'arXiv',
    }
    # The dict was previously built and then dropped; hand it to the caller.
    return output
def get_arxiv_bibtex(arxiv_id):
    """
    Return the BibTeX text that the arxiv2bib package yields for a preprint.

    Exactly one result is expected back for the single id that is sent;
    unpacking fails with ``ValueError`` otherwise.

    :param arxiv_id: The arXiv identifier to look up.
    :returns: The value of ``bibtex()`` on the fetched reference.
    :raises KeyError: If arxiv2bib answers with a ``ReferenceErrorInfo``;
        the message combines its ``message`` with ``arxiv_id``.
    """
    import arxiv2bib

    [reference] = arxiv2bib.arxiv2bib([arxiv_id])
    if not isinstance(reference, arxiv2bib.ReferenceErrorInfo):
        return reference.bibtex()
    raise KeyError(f'{reference.message}: {arxiv_id}')
def pdf_scraper(filename, info):
    """Extract bibliographic information from the text of a PDF.

    The text returned by ``pdf2txt(filename)`` is searched for a series of
    DOI markers. For each marker found, up to 50 characters following its
    first occurrence are cleaned with ``doi_splitter`` and sent to
    ``requests_by_doi``; the first non-empty answer wins.

    If nothing was found and the file name contains ``"_arXiv.pdf"``, the
    10 characters following the first ``arXiv:`` marker are looked up with
    ``arxiv2bib`` and a ``'FullEntry'`` value ("<last names> <title>") is
    stored in the result.

    :param filename: Path of the PDF to analyse.
    :param info: Unused by this function; kept for interface compatibility.
    :returns: The info returned by ``requests_by_doi`` or, for the arXiv
        fallback, a dict holding ``'FullEntry'``; ``{}`` when nothing could
        be identified.
    """
    doi_pattern = [
        'DOI:', "doi:", "http://dx.doi.org/", "doi/", "DOI ",
        "https://doi.org/"
    ]
    output = pdf2txt(filename)
    parsed_info = {}
    print(filename)

    for doi_string in doi_pattern:
        marker_pos = output.find(doi_string)
        if marker_pos == -1:
            continue
        doi_position = marker_pos + len(doi_string)
        doish = output[doi_position:doi_position + 50]

        lines = doish.split("\n")
        doi = lines[0]
        # The DOI sometimes sits on the line after the marker. Only look
        # there if such a line exists (indexing it blindly raised
        # IndexError before).
        if (len(doi) == 0 or doi.find("/") == -1) and len(lines) > 1:
            doi = lines[1]
        for delimiter in [' ', ',', ')', '\t', 'http:']:
            doi = doi_splitter(doi, delimiter)
        # endswith() is safe on an empty string, unlike doi[-1].
        if doi.endswith('.'):
            doi = doi[:-1]
        print("parsed doish", filename, repr(doish), repr(doi))
        if not doi:
            # Nothing left to query with; try the next marker.
            continue
        parsed_info = requests_by_doi(doi)
        if parsed_info != {}:
            break

    if "_arXiv.pdf" in filename and parsed_info == {}:
        from arxiv2bib import ReferenceErrorInfo, arxiv2bib

        marker = 'arXiv:'
        marker_pos = output.find(marker)
        # Test the raw find() result: adding the marker length first made
        # the "not found" check unreachable.
        if marker_pos == -1:
            return {}
        # The id starts right after the marker (the marker length was
        # previously added twice, skipping the first characters of the id).
        start = marker_pos + len(marker)
        id_ = output[start:start + 10]
        ref = arxiv2bib([id_])[0]
        if isinstance(ref, ReferenceErrorInfo):
            # arxiv2bib could not resolve the id; error objects carry no
            # authors/title.
            return {}
        author = ",".join([a.split(" ")[-1] for a in ref.authors])
        parsed_info['FullEntry'] = author + " " + ref.title
    return parsed_info
def get_bib(doi, filename=None):
    """
    Look up a reference on arXiv.org or crossref.org.

    Identifiers whose lower-cased form starts with ``arxiv`` are resolved
    through ``arxiv2bib`` (using the text after the first six characters);
    anything else is requested from the Crossref BibTeX transform endpoint.

    :param doi: The identifier; anything that is not a ``str`` (including
        ``None``) is rejected.
    :param filename: Optional path; when given, the BibTeX text received
        from Crossref is written to it.
    :returns: A ``(found, bib)`` pair. ``bib`` is the result of
        ``bib_to_dict`` when an entry was obtained.
    """
    # ``None`` is not a str either, so a single check covers both rejections.
    if not isinstance(doi, str):
        return False, None

    # for arXiv:XXXX case
    if doi.lower().startswith("arxiv"):
        arxiv_id = doi[6:]
        entry_text = arxiv2bib([arxiv_id])[0].bibtex()
        if len(entry_text) > 0:
            return True, bib_to_dict(entry_text)
        return False, entry_text

    # for crossref
    bare_url = "http://api.crossref.org/"
    url = "{}works/{}/transform/application/x-bibtex".format(bare_url, doi)
    response = requests.get(url)
    bibtex_str = str(response.content, "utf-8")
    if "Resource not found" in bibtex_str:
        return False, None

    if filename is not None:
        with open(filename, "w") as f:
            f.write(bibtex_str)
    # ``found`` reflects the HTTP status even when the body was parsed.
    return response.status_code == 200, bib_to_dict(bibtex_str)
def arXiv2Bib(arxiv):
    """Return the BibTeX metadata for a given arXiv id.

    The entry fetched through ``arxiv_metadata.arxiv2bib`` is parsed with
    ``BibTexParser``, stripped of its ``file`` field and re-serialised with
    ``tools.parsed2Bibtex``.

    arxiv is an arxiv id

    Returns the value of ``tools.parsed2Bibtex`` for the first result that
    is not a ``ReferenceErrorInfo``, or ``''`` when there is none.
    """
    for reference in arxiv_metadata.arxiv2bib([arxiv]):
        if isinstance(reference, arxiv_metadata.ReferenceErrorInfo):
            continue

        entries = BibTexParser(reference.bibtex()).get_entry_dict()
        first_key = list(entries.keys())[0]
        entry = entries[first_key]
        # Drop the "file" field if the entry has one.
        if 'file' in entry:
            del entry['file']
        return tools.parsed2Bibtex(entry)
    return ''
def arXiv2Bib(arxiv):
    """Return the BibTeX metadata for a given arXiv id.

    The entry fetched through ``arxiv_metadata.arxiv2bib`` is loaded with
    ``bibtexparser.loads``; the first item of its ``entries_dict`` has its
    ``file`` field removed and is passed to ``tools.parsed2Bibtex``.

    arxiv is an arxiv id

    Returns the value of ``tools.parsed2Bibtex`` for the first result that
    is not a ``ReferenceErrorInfo``, or ``''`` when there is none.
    """
    results = arxiv_metadata.arxiv2bib([arxiv])
    for candidate in results:
        if not isinstance(candidate, arxiv_metadata.ReferenceErrorInfo):
            parsed = bibtexparser.loads(candidate.bibtex()).entries_dict
            record = parsed[list(parsed.keys())[0]]
            # The "file" field is optional; remove it only when present.
            record.pop('file', None)
            return tools.parsed2Bibtex(record)
    return ''
def setUp(self):
    """Call ``fakedata.start()`` and fetch four fixed ids via ``a2b.arxiv2bib``.

    The four results are stored, in request order, on ``self.not_found``,
    ``self.judge``, ``self.invalid`` and ``self.frv``.
    """
    fakedata.start()
    requested_ids = ['1011.9999', '1001.1001v1', 'x', '1205.1001']
    (self.not_found,
     self.judge,
     self.invalid,
     self.frv) = a2b.arxiv2bib(requested_ids)
elif 'published-online' in entry: dp = entry['published-online']['date-parts'][0] year, month, *a = dp doi = entry['DOI'] url = entry['URL'] journal = entry['container-title'] if isinstance(journal, list): journal = journal[0] yearmonth = '%04i-%02i' % (year, month) return {'title': title, 'authors': authors, 'year': year, 'month': month, 'doi': doi, 'url': url, 'journal': journal, 'yearmonth': yearmonth} for ref in pubs['dois']: found, val = doi2bib.get_json(ref) if found: bibs.append(convert_doientry(val['message'])) arxivs = arxiv2bib.arxiv2bib(pubs['arxiv']) for ref in arxivs: month = m.index(ref.month) + 1 yearmonth = '%04i-%02i' % (int(ref.year), month) bibs.append( {'title': ref.title, 'authors': ref.authors, 'year': ref.year, 'month': month, 'yearmonth': yearmonth, 'url': ref.url, 'journal': "arXiv"} ) with open("_data/citations.yml", "w") as f: yaml.dump(bibs, f, default_flow_style=False)
def try_get_bib(url, acl_anthology_by_id, acl_anthology_by_title):
    """
    Gets a URL to a publication and tries to extract a bib entry for it
    Returns None if it fails (e.g. if it's not a publication)

    The strategies are tried in this order: a direct ``.bib`` download,
    TACL, arXiv, the ACL anthology (by ID), Semantic Scholar, and finally
    downloading the PDF and matching it against the ACL anthology by title.

    :param url: the URL to extract a publication from
    :param acl_anthology_by_id: a dictionary of publications by ID (from ACL anthology)
    :param acl_anthology_by_title: a dictionary of publications by title (from ACL anthology)
    :return: a tuple of (bib entry, title) or None if not found / error occurred
    """
    lowercased_url = url.lower()
    filename = lowercased_url.split('/')[-1]

    # Only try to open papers with extension pdf or bib or without extension
    # NOTE(review): this also rejects extension-less arXiv URLs whose id
    # contains a dot (e.g. .../abs/1506.06690) -- confirm that is intended.
    if '.' in filename and not filename.endswith(('.pdf', '.bib')):
        return None

    # If ends with bib, read it
    if filename.endswith('.bib'):
        with urllib.request.urlopen(url) as response:
            bib_entry = response.read().decode('utf-8')
        title = get_title_from_bib_entry(bib_entry)
        return (bib_entry, title)

    paper_id = filename.replace('.pdf', '')

    # Paper from TACL
    if 'transacl.org' in lowercased_url or 'tacl' in lowercased_url:
        result = get_bib_from_tacl(paper_id)

        if result is not None:
            try:
                bib_entry, title = result
                title = normalize_title(title)
                return (bib_entry, title)
            except Exception:
                # Best effort: fall through to the remaining strategies.
                pass

    # If arXiv URL, get paper details from arXiv
    if 'arxiv.org' in lowercased_url:
        results = arxiv2bib([paper_id])

        if len(results) > 0:
            # Start from a known state so that a failure below can neither
            # leave ``bib_entry`` unbound nor reuse a value assigned by an
            # earlier strategy.
            bib_entry = None
            title = None
            try:
                result = results[0]
                title = normalize_title(result.title)

                # First, try searching for the title in the ACL anthology. If the paper
                # was published in a *CL conference, it should be cited from there and not from arXiv
                bib_entry = acl_anthology_by_title.get(title, None)

                # Not found in ACL - take from arXiv
                if bib_entry is None:
                    bib_entry = result.bibtex()
            except Exception:
                # e.g. arxiv2bib returned an error object without a title.
                bib_entry = None

            if bib_entry:
                return (bib_entry, title)

    # If the URL is from the ACL anthology, take it by ID
    if 'aclanthology' in lowercased_url or 'aclweb.org' in lowercased_url:
        bib_entry = acl_anthology_by_id.get(paper_id.upper(), None)

        if bib_entry:
            title = get_title_from_bib_entry(bib_entry)
            return (bib_entry, title)

    # If the URL is from Semantic Scholar
    if ('semanticscholar.org' in lowercased_url
            and not lowercased_url.endswith('pdf')):
        result = get_bib_from_semantic_scholar(url)

        if result is not None:
            try:
                semantic_scholar_bib_entry, title = result
                title = normalize_title(title)

                # First, try searching for the title in the ACL anthology. If the paper
                # was published in a *CL conference, it should be cited from there and not from Semantic Scholar
                bib_entry = acl_anthology_by_title.get(title, None)

                # Not found in ACL - take from Semantic Scholar
                if bib_entry is None:
                    bib_entry = semantic_scholar_bib_entry

                return (bib_entry, title)
            except Exception:
                # Best effort: fall through to the remaining strategies.
                pass

    # Else: try to read the pdf and find it in the acl anthology by the title
    if lowercased_url.endswith('pdf'):
        # Download the file to a temporary file
        with urllib.request.urlopen(url) as response:
            data = response.read()
        with open('temp.pdf', 'wb') as f_out:
            f_out.write(data)

        result = get_from_pdf('temp.pdf', acl_anthology_by_title)

        if result is not None:
            bib_entry, title = result
            title = normalize_title(title)
            return (bib_entry, title)

    # Didn't find
    logger.warning('Could not find {}'.format(url))
    return None
citationkeylist.append(citationkey) citationkeylist = list(set(citationkeylist)) # ============================================================================= # Classement # ============================================================================= # Liste de reference provenant d'arXiv list_arxiv_id = [] for doi in citationkeylist: if arxiv2bib.is_valid(doi): list_arxiv_id.append(doi) refs_arxiv = arxiv2bib.arxiv2bib(list_arxiv_id) list_text_refs_arxiv = [arxref.bibtex() for arxref in refs_arxiv] # liste de reference au format doi list_doi_refs = [] doi_pattern = '(10[.][0-9]{4,}[^\s"/<>]*/[^\s"<>]+)' for doi in citationkeylist: if re.search(doi_pattern, doi): list_doi_refs.append(doi) list_text_refs_doi = [pydoi2bib(doi) for doi in list_doi_refs] # Ajout des references doi et arxiv au fichier texte final if len(list_text_refs_arxiv) > 0: text_complet += ' \n \n ' + ' \n \n '.join(
new_bibkey = list(new_bibkey) doi = ones_like(biblist) print bib j = 0 bibfile = codecs.open('../biblio/' + name + '.bib', 'w', 'utf-8') if len(bib) == 0: bibfile.close() # bibfile=open('../biblio/'+name+'.bib','w') for i, ref in enumerate(biblist): old_bibkey[i] = ref[ref.find("[") + 1:ref.find("]")] doi[i] = ref.split(' ')[-1] doi[i] = doi[i].replace('https://doi.org/', '') doi[i] = doi[i].replace('https://journals.aps.org/prb/abstract/', '') if doi[i][:5] == 'arXiv': try: out = a2b.arxiv2bib([doi[i].split(':')[-1]]) bibitem = out[0].bibtex() except urllib2.HTTPError: bibitem = '' else: try: print 'searching DOI:', doi[i] bibitem = doi2bib(doi[i]) except urllib2.HTTPError: bibitem = '' y, z = bibitem.split(",", 1) x, key = y.split("{", 1) if key in new_bibkey: new_bibkey[i] = key + alphabet[j] j += 1 bibitem = bibitem.replace(key, new_bibkey[i])