def post(self, request, *args, **kwargs):
    unique_id = request.POST.get('unique_id')
    project_pk = self.kwargs.get('project_pk')
    project_obj = get_object_or_404(Project, pk=project_pk)

    matching_source_obj = None
    for source in PublicationSource.objects.all():
        if source.name == 'doi':
            try:
                status, bib_str = crossref.get_bib(unique_id)
                bp = BibTexParser(interpolate_strings=False)
                bib_database = bp.parse(bib_str)
                bib_json = bib_database.entries[0]
                matching_source_obj = source
                break
            except Exception:
                continue
        elif source.name == 'adsabs':
            try:
                url = 'http://adsabs.harvard.edu/cgi-bin/nph-bib_query?bibcode={}&data_type=BIBTEX'.format(
                    unique_id)
                r = requests.get(url, timeout=5)
                bp = BibTexParser(interpolate_strings=False)
                bib_database = bp.parse(r.text)
                bib_json = bib_database.entries[0]
                matching_source_obj = source
                break
            except Exception:
                continue

    if not matching_source_obj:
        return render(request, self.template_name, {})

    year = as_text(bib_json['year'])
    author = as_text(bib_json['author']).replace(
        '{\\textquotesingle}', "'").replace('{\\textendash}', '-').replace(
        '{\\textemdash}', '-').replace('{\\textasciigrave}', ' ').replace(
        '{\\textdaggerdbl}', ' ').replace('{\\textdagger}', ' ')
    title = as_text(bib_json['title']).replace(
        '{\\textquotesingle}', "'").replace('{\\textendash}', '-').replace(
        '{\\textemdash}', '-').replace('{\\textasciigrave}', ' ').replace(
        '{\\textdaggerdbl}', ' ').replace('{\\textdagger}', ' ')
    author = re.sub("{|}", "", author)
    title = re.sub("{|}", "", title)

    context = {}
    context['author'] = author
    context['year'] = year
    context['title'] = title
    context['unique_id'] = unique_id
    context['source'] = matching_source_obj
    context['project_pk'] = project_obj.pk
    return render(request, self.template_name, context)

def _search_id(self, unique_id):
    matching_source_obj = None
    for source in PublicationSource.objects.all():
        if source.name == 'doi':
            try:
                status, bib_str = crossref.get_bib(unique_id)
                bp = BibTexParser(interpolate_strings=False)
                bib_database = bp.parse(bib_str)
                bib_json = bib_database.entries[0]
                matching_source_obj = source
                break
            except Exception:
                continue
        elif source.name == 'adsabs':
            try:
                url = 'http://adsabs.harvard.edu/cgi-bin/nph-bib_query?bibcode={}&data_type=BIBTEX'.format(
                    unique_id)
                r = requests.get(url, timeout=5)
                bp = BibTexParser(interpolate_strings=False)
                bib_database = bp.parse(r.text)
                bib_json = bib_database.entries[0]
                matching_source_obj = source
                break
            except Exception:
                continue

    if not matching_source_obj:
        return False

    year = as_text(bib_json['year'])
    author = as_text(bib_json['author']).replace(
        '{\\textquotesingle}', "'").replace('{\\textendash}', '-').replace(
        '{\\textemdash}', '-').replace('{\\textasciigrave}', ' ').replace(
        '{\\textdaggerdbl}', ' ').replace('{\\textdagger}', ' ')
    title = as_text(bib_json['title']).replace(
        '{\\textquotesingle}', "'").replace('{\\textendash}', '-').replace(
        '{\\textemdash}', '-').replace('{\\textasciigrave}', ' ').replace(
        '{\\textdaggerdbl}', ' ').replace('{\\textdagger}', ' ')
    author = re.sub("{|}", "", author)
    title = re.sub("{|}", "", title)

    # not all bibtex entries will have a journal field
    if 'journal' in bib_json:
        journal = as_text(bib_json['journal']).replace(
            '{\\textquotesingle}', "'").replace('{\\textendash}', '-').replace(
            '{\\textemdash}', '-').replace('{\\textasciigrave}', ' ').replace(
            '{\\textdaggerdbl}', ' ').replace('{\\textdagger}', ' ')
        journal = re.sub("{|}", "", journal)
    else:
        # fallback: clearly indicate that data was absent
        source_name = matching_source_obj.name
        journal = '[no journal info from {}]'.format(source_name.upper())

    pub_dict = {}
    pub_dict['author'] = author
    pub_dict['year'] = year
    pub_dict['title'] = title
    pub_dict['journal'] = journal
    pub_dict['unique_id'] = unique_id
    pub_dict['source_pk'] = matching_source_obj.pk
    return pub_dict

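# A consumption sketch for _search_id above, hedged: `view` stands in for an
# instance of the class that defines it, and the DOI is a placeholder, not a
# real identifier.
pub_dict = view._search_id('10.1000/example')
if pub_dict:
    # pub_dict carries author/year/title/journal plus the raw unique_id and
    # the pk of the PublicationSource that resolved it
    print(pub_dict['title'], pub_dict['journal'], pub_dict['source_pk'])
else:
    print('No source could resolve this ID')
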
def pull(self):
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64)',
        'AppleWebKit/537.36 (KHTML, like Gecko)',
        'Chrome/35.0.1916.114 Safari/537.36'
    ]
    headers = {'User-Agent': " ".join(user_agents)}
    # Pre-encoded query-string values. Single '%' here: these are substituted
    # into the template via %s, so they are never %-formatted themselves.
    within = 'owners%2Eowner%3DHOSTED'
    sort = '%5Fscore'
    export_format = 'bibtex'
    url_template = 'https://dl.acm.org/exportformats_search.cfm?query=%s&within=%s&srt=%s&expformat=%s'

    result = BibDatabase()
    for query in self.queries:
        url = url_template % (query, within, sort, export_format)
        response = requests.get(url, cookies=self.cookies, headers=headers)
        self.cookies.update(response.cookies)
        bibtex_parser = BibTexParser(customization=convert_to_unicode)
        # extend, not append: get_entry_list() returns the list of entry
        # dicts, and parse() yields another such list
        result.get_entry_list().extend(
            bibtex_parser.parse(response.text).get_entry_list())
    return result

def read_bibtex(bibtex_str):
    parser = BibTexParser(common_strings=True)
    parser.ignore_nonstandard_types = False
    parser.homogenize_fields = True
    bib_database = parser.parse(bibtex_str)
    keyworded = map(bibtexparser.customization.keyword, bib_database.entries)
    converted = list(map(bibtexparser.customization.convert_to_unicode, keyworded))
    authored = map(bibtexparser.customization.author, converted)
    return list(authored)

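# A minimal usage sketch for read_bibtex above; the sample entry is invented.
# customization.keyword splits the "keyword" field on ,/; and
# customization.author splits "author" on " and ", so both come back as lists.
sample = """@article{smith2020,
  author  = {Smith, Jane and Doe, John},
  title   = {An Example Title},
  journal = {Journal of Examples},
  year    = {2020},
  keyword = {parsing, bibtex}
}"""

entries = read_bibtex(sample)
print(entries[0]['author'])   # ['Smith, Jane', 'Doe, John']
print(entries[0]['keyword'])  # ['parsing', 'bibtex']
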
def parse_bibtex(reference, bibtex_parser=None):
    if bibtex_parser is None:
        bibtex_parser = BibTexParser()
    try:
        result = bibtex_parser.parse(reference).get_entry_list()[-1]
    except IndexError:
        # unable to parse
        result = None
    return result

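# Usage sketch for parse_bibtex above (the entry is invented): it returns the
# last entry parsed from the string, or None when nothing parseable is found.
entry = parse_bibtex('@book{knuth1984, title = {The TeXbook}, year = {1984}}')
if entry is not None:
    print(entry['ID'], entry['year'])  # knuth1984 1984
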
def add_entry_by_string(self, bib_string, file_name=None,
                        skip_if_file_exists=True, skip_if_doi_exists=False,
                        parser=None):
    """
    Add a new entry corresponding to a BibTex string.

    :param bib_string: a string giving the section in a BibTex file that
     would represent this reference.

    :param file_name: the name of a local file to include in the reference
     section. Optional.

    :param skip_if_file_exists: boolean, default is True, meaning that if a
     reference pointing to the same local file already exists in the
     database, this reference will not be added. Intended to make it easy to
     update a database without worrying about overwriting existing files.

    :param skip_if_doi_exists: boolean, default is False, but if True, do not
     add this reference if another reference with the same DOI already
     exists. Intended to avoid adding duplicate files.

    :param parser: An instance of bibtexparser.bparser.BibTexParser
     customized to parse the new string. The default parser is set with:

        * parser.ignore_nonstandard_types = False
        * parser.homogenise_fields = True
        * parser.customization = lambda entry: self.format_entry(entry)

     thus, the custom parsing uses the format_entry method of this class with
     the instance of the class at the time this method was called.

    :return: none, adds entry in place.
    """
    if skip_if_file_exists and file_name is not None:
        if file_name in self.files:
            root_logger.info(
                'Not adding {}, entry for that file already in .bib file'.format(file_name))
            return

    # To ensure we get a properly formatted string, we'll parse it into a
    # standard BibDatabase, then steal the entry from it
    if parser is None:
        parser = BibTexParser()
        parser.ignore_nonstandard_types = False
        parser.homogenise_fields = True
        # Create a lambda function that knows about the current state of the database
        parser.customization = lambda entry: self.format_entry(entry)

    tmpdat = parser.parse(bib_string)

    if skip_if_doi_exists and 'doi' in tmpdat.entries[0] \
            and tmpdat.entries[0]['doi'] in self.dois:
        root_logger.info(
            'Not adding {}, entry with DOI "{}" already in bib file'.format(
                file_name, tmpdat.entries[0]['doi']))
        return

    if file_name is not None:
        tmpdat.entries[0]['file'] = file_name

    # We shouldn't need to do anything else: the other means of accessing
    # entries (e.g. the dict) seem to be properties created on the fly from
    # the entries list.
    self.entries.append(tmpdat.entries[0])

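# A hypothetical call, assuming `db` is an instance of the class that defines
# add_entry_by_string; the entry fields and file name are invented:
db.add_entry_by_string(
    '@article{doe2021, author = {Doe, Jane}, title = {A Placeholder Title}, '
    'year = {2021}, doi = {10.1000/xyz123}}',
    file_name='doe2021.pdf',
    skip_if_doi_exists=True,
)
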
def test_does_not_fail_on_non_bibtex_with_partial(self):
    bibraw = '''@misc{this looks,
 like = a = bibtex file but
 , is not a real one!

'''
    parser = BibTexParser()
    bib = parser.parse(bibraw, partial=False)
    self.assertEqual(bib.entries, [])
    self.assertEqual(bib.preambles, [])
    self.assertEqual(bib.strings, {})
    self.assertEqual(bib.comments, [
        '@misc{this looks,\n'
        ' like = a = bibtex file but\n'
        ' , is not a real one!'])

def post(self, request, *args, **kwargs):
    project_obj = get_object_or_404(Project, pk=self.kwargs.get('project_pk'))
    publications_to_export = self.get_publications_to_export(project_obj)

    formset = formset_factory(PublicationExportForm,
                              max_num=len(publications_to_export))
    formset = formset(request.POST, initial=publications_to_export,
                      prefix='publicationform')

    bib_text = ''
    if formset.is_valid():
        for form in formset:
            publication_form_data = form.cleaned_data
            if publication_form_data['selected']:
                publication_obj = Publication.objects.get(
                    project=project_obj,
                    title=publication_form_data.get('title'),
                    year=publication_form_data.get('year'),
                    unique_id=publication_form_data.get('unique_id'),
                )
                status, bib_str = crossref.get_bib(
                    publication_obj.display_uid())
                # Parse to validate the string before exporting it
                bp = BibTexParser(interpolate_strings=False)
                bib_database = bp.parse(bib_str)
                bib_text += bib_str

        response = HttpResponse(content_type='text/plain')
        response['Content-Disposition'] = 'attachment; filename=refs.bib'
        buffer = io.StringIO()
        buffer.write(bib_text)
        output = buffer.getvalue()
        buffer.close()
        response.write(output)
        return response
    else:
        for error in formset.errors:
            messages.error(request, error)
        return HttpResponseRedirect(
            reverse('project-detail', kwargs={'pk': project_obj.pk}))

def new_publication():
    form = PublicationForm()
    bibForm = BibtexPublicationForm()
    if form.validate_on_submit():
        publication = Publication(title=form.title.data,
                                  doi=form.doi.data,
                                  year=form.year.data,
                                  journal=form.journal.data,
                                  type=form.type.data,
                                  status=form.status.data,
                                  primary_user=current_user.id)
        db.session.add(publication)
        db.session.commit()
        return redirect(url_for("view_publication", id=publication.id))
    elif bibForm.validate_on_submit():
        bp = BibTexParser(interpolate_strings=False)
        bib_database = bp.parse(bibForm.parse.data)

        def value(key):
            return bib_database.entries[0][key]

        keys = ("author", "title", "doi", "year", "ID", "journal", "status")
        # Only create the publication if the entry has every required field
        if set(keys) <= set(bib_database.entries[0]):
            publication = Publication(title=value("title"),
                                      doi=value("doi"),
                                      year=value("year"),
                                      journal=value("journal"),
                                      type=value("ENTRYTYPE"),
                                      status=value("status"),
                                      primary_user=current_user.id)
            db.session.add(publication)
            db.session.commit()
            return redirect(url_for("view_publication", id=publication.id))
    return render_template("publication.html", form=form, bibForm=bibForm)

class LazyBibDatabase(BibDatabase):
    """Lazy-loading subclass of bibtexparser.bibdatabase.BibDatabase.

    To improve performance on large files, this class indexes (:func:`_index`)
    the start and end locations of each entry in the file, but does not read
    or parse them. When :func:`get_entry` is called, only the single entry is
    read and parsed.

    This functionality should be pushed upstream to bibtexparser.
    """
    entry_re = re.compile(rb"^\s*@([^{]*){([^,}]*)", re.MULTILINE)

    def __init__(self, path, config):
        super(LazyBibDatabase, self).__init__()

        # Database file
        self._file = open(path, "rb")

        # Keywords index
        self.keywords = set()

        # Index the database
        self._all_loaded = False
        self._index()

        # Set up a parser to be used by _read_entry
        self._parser = BibTexParser(
            homogenize_fields=False,
            ignore_nonstandard_types=False,
            customization=lambda r: BibItem(r, self.keywords.update, config),
        )

    def _index(self):
        """Index the database."""
        # Use a mmap to avoid loading the entire file into memory
        m = mmap.mmap(self._file.fileno(), 0, access=mmap.ACCESS_READ)

        # Iterate through matches of the regular expression for entries
        breaks = []
        for match in self.entry_re.finditer(m):
            # Store (start, entry type, entry ID)
            info = [match.start()] + list(map(bytes.decode, match.groups()))
            if info[2] == "":
                info[2] = "<entry without ID at {0}>".format(*info)
            breaks.append(tuple(info))
        del m

        # Convert the breaks to an index
        self._entries_index = {}
        for idx, (start, entrytype, id) in enumerate(breaks):
            if entrytype == "comment":
                # Don't index comments
                continue
            try:
                # Current entry extends to the start of the next
                self._entries_index[id] = (start, breaks[idx + 1][0] - start)
            except IndexError:
                # Last entry in file; length of -1 will make read() gobble
                self._entries_index[id] = (start, -1)

    def _read_entry(self, key):
        """Actually read and parse the entry with ID *key*."""
        # Locate the start of the entry
        start, length = self._entries_index[key]
        self._file.seek(start)

        # Parse the entry
        self._parser.parse(self._file.read(length))

        # bibtexparser.bparser.BibTexParser uses an internal BibDatabase that
        # is not emptied for successive calls to parse(). Empty it.
        entry = self._parser.bib_database.entries.pop()
        assert len(self._parser.bib_database.entries) == 0

        # Store for later access
        self._entries_dict[entry["ID"]] = entry

        return entry

    def iter_entries(self, progress=False):
        if progress:
            return tqdm(self._generate_entries(),
                        total=len(self._entries_index), leave=False)
        else:
            return iter(self._generate_entries())

    def _generate_entries(self):
        if self._all_loaded:
            yield from self._entries_dict.values()
        else:
            for key in self.iter_entry_keys():
                yield self.get_entry(key)
            self._all_loaded = True

    def get_entry(self, key):
        """Retrieve the entry with ID *key*."""
        try:
            return self._entries_dict[key]
        except KeyError:
            return self._read_entry(key)

    def iter_entry_keys(self):
        """Return an iterator over entry IDs.

        This is much faster than in BibDatabase, because the entries are not
        fully parsed. Use :func:`get_entry` to retrieve the actual entry.
        """
        return self._entries_index.keys()

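# A usage sketch for LazyBibDatabase; the path is invented, and `config` is
# whatever the project-specific BibItem customization expects (an empty dict
# here purely for illustration).
db = LazyBibDatabase('library.bib', config={})
keys = list(db.iter_entry_keys())   # cheap: nothing is parsed yet
entry = db.get_entry(keys[0])       # parses this one entry on demand
print(entry['ID'], entry['ENTRYTYPE'])
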
def test_parse_bom_bytes(self):
    parser = BibTexParser()
    with open(self.input_bom_file_path, 'rb') as bibtex_file:
        bibtex_str = bibtex_file.read()
    bibtex_database = parser.parse(bibtex_str)
    self.assertEqual(bibtex_database.entries, self.entries_expected)

def pull(self):
    bibtex_parser = BibTexParser(common_strings=True,
                                 interpolate_strings=False)
    self.input_file.seek(0)
    return bibtex_parser.parse(self.input_file.read())

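# What pull() above buys you, in isolation: with interpolate_strings=False,
# @string references survive as BibDataStringExpression objects rather than
# being expanded, which is why other snippets here wrap field access in
# as_text(). The sample data is invented; io.StringIO stands in for
# self.input_file.
import io
from bibtexparser.bparser import BibTexParser
from bibtexparser.bibdatabase import as_text

input_file = io.StringIO('@string{jod = "Journal of Data"}\n'
                         '@article{a1, journal = jod, year = {2020}}')
input_file.seek(0)
db = BibTexParser(common_strings=True,
                  interpolate_strings=False).parse(input_file.read())
print(as_text(db.entries[0]['journal']))  # Journal of Data
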
def import_command(ctx, paths):
    """Read new entries into the database.

    PATHS may be zero or more .bib files or directories containing .bib files
    to import.

    Configuration file keys

    \b
    import:
      path: a default path to check for .bib files to import, if no PATHS
            are given.
    """
    # If no files were given, use the directory from the configuration
    if len(paths) == 0:
        paths = [ctx.cmd_config("import").get("path", ".")]

    paths = [os.path.join(p, "*.bib") if os.path.isdir(p) else p
             for p in paths]

    # A parser for reading entries
    parser = BibTexParser()
    parser.homogenise_fields = False
    parser.customization = _add_clean

    # Iterate over files in the given paths
    for fn in chain(*map(iglob, paths)):
        os.system("clear")
        print("Importing", fn, end=":\n\n")

        # Read and parse the file
        with open(fn, "r") as f:
            s = f.read()

        print(s, end="\n\n")

        try:
            e = parser.parse(clean_str(s)).entries[-1]
        except ParseException:
            print(clean_str(s))
            raise

        abstract = e.pop("abstract", None)

        print("Parsed entry:", to_string(e), sep="\n\n")

        if abstract is not None:
            print("Abstract:", abstract, sep="\n\n")

        # Ask the user for a key
        while True:
            key = input_with_prefill(
                "\nEnter key for imported entry "
                "([] Skip, [D]elete without importing, [Q]uit): ",
                guess_key(e),
            )
            try:
                ctx.db.get_entry(key)
                print("Key already exists.")
            except KeyError:
                break

        if key == "":
            continue
        elif key.lower() == "d":
            os.remove(fn)
            continue
        elif key.lower() == "q":
            break
        else:
            # Change the entry key
            e["ID"] = key

            # Add a custom field with the current date
            e["entrydate"] = datetime.now().isoformat(timespec="minutes")

            # Select a full-text file to go with the entry
            fn_local = _select_file(e["ID"],
                                    ctx.cmd_config("import").get("path", "."))
            if fn_local:
                e["localfile"] = os.path.basename(fn_local)

            # Append the entry to the database
            with open(ctx.config["database"], "a") as f_db:
                f_db.write("\n")
                f_db.write(to_string(e))

            # Write the abstract
            if abstract:
                fn_abstract = ctx.config["path"] / "abstracts" / ("%s.tex" % key)
                with open(fn_abstract, "x") as f_abstract:
                    f_abstract.write(abstract)

            # Move the full-text file
            if fn_local:
                os.system('mv -n "{}" "{}"'.format(
                    fn_local, ctx.config["path"] / e["localfile"]))

            # Remove the imported entry file
            remove = input("\nRemove imported file %s ([Y]es, [enter] to "
                           "keep)? " % fn)
            if remove.lower() == "y":
                os.remove(fn)

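# A plausible configuration stanza for import_command above, inferred from its
# docstring; the file format and the example directory are assumptions:
#
#   import:
#     path: ~/bib-inbox    # default directory scanned when no PATHS are given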