def test_convert_to_unicode(self):
    """Check that LaTeX accent commands are replaced by unicode characters."""
    # The backslash is escaped explicitly: '\`' is not a valid escape
    # sequence and triggers a warning on recent Python versions.
    record = {'toto': '{\\`a} \\`{a}'}
    result = convert_to_unicode(record)
    expected = {'toto': 'à à'}
    self.assertEqual(result, expected)

    record = {'toto': '{\\"u} \\"{u}'}
    result = convert_to_unicode(record)
    expected = {'toto': 'ü ü'}
    self.assertEqual(result, expected)
def extract_all_entries(bibfile, unicode_conversion=False):
    """
    Return dict: {citekey: {title, authors, editors, year}}

    The file is read line by line; each line is tested against the
    ``Autobib`` citekey/author/editor/title/year matchers in turn and the
    first matcher that hits decides how the line is handled.

    :param bibfile: path of the .bib file to scan (read as UTF-8)
    :param unicode_conversion: when true, author, editor and title values are
        passed through ``convert_to_unicode`` before further processing
    :returns: mapping of citekey to its fields, or an empty dict when
        ``bibfile`` does not exist
    """
    entries = defaultdict(lambda: defaultdict(str))
    if not os.path.exists(bibfile):
        print('bibfile not found:', bibfile)
        return {}

    def to_unicode(field, value):
        # convert_to_unicode works on whole records, so wrap the single value.
        if unicode_conversion:
            return convert_to_unicode({field: value})[field]
        return value

    current_citekey = None
    with open(bibfile, mode='r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line.endswith(','):
                line = line[:-1]

            match = Autobib.citekey_matcher.findall(line)
            if match:
                current_citekey = match[0]
                continue

            # A field seen before any citekey has no entry to belong to.
            if current_citekey is None:
                continue

            match = Autobib.author_matcher.findall(line)
            if match:
                authors = to_unicode('author', match[0])
                entries[current_citekey]['authors'] = Autobib.parse_authors(authors)
                continue

            match = Autobib.editor_matcher.findall(line)
            if match:
                editors = to_unicode('editor', match[0])
                # Store the parsed editors (the previous code stored the
                # authors of the last author line here).
                entries[current_citekey]['editors'] = Autobib.parse_authors(editors)
                continue

            match = Autobib.title_matcher.findall(line)
            if match:
                title = to_unicode('title', match[0])
                entries[current_citekey]['title'] = Autobib.remove_latex_commands(title)
                continue

            match = Autobib.year_matcher.findall(line)
            if match:
                entries[current_citekey]['year'] = Autobib.remove_latex_commands(match[0])
                continue
    return entries
def test_convert_to_unicode(self):
    """Check that LaTeX accent commands are replaced by unicode characters."""
    # The backslash is escaped explicitly: '\`' is not a valid escape
    # sequence and triggers a warning on recent Python versions.
    record = {'toto': '{\\`a} \\`{a}'}
    result = convert_to_unicode(record)
    expected = {'toto': 'à à'}
    self.assertEqual(result, expected)

    record = {'toto': '{\\"u} \\"{u}'}
    result = convert_to_unicode(record)
    expected = {'toto': 'ü ü'}
    self.assertEqual(result, expected)

    # From issue 121
    record = {'title': '{Two Gedenk\\"uberlieferung der Angelsachsen}'}
    result = convert_to_unicode(record)
    expected = {'title': '{Two Gedenküberlieferung der Angelsachsen}'}
    self.assertEqual(result, expected)
def maybe_unicode(record):
    """Run ``c.convert_to_unicode`` on ``record``, flagging failures on it.

    On a ``TypeError`` a warning is logged, the message is stored under
    ``'unicode_error'`` and ``"unicode_error"`` is appended to the record's
    ``'error'`` list.

    :param record: bibtex record (dict-like) with at least an ``'ID'`` key
    :returns: None
    """
    # NOTE(review): nothing is returned, so callers presumably rely on the
    # conversion mutating ``record`` in place -- confirm.
    try:
        record = c.convert_to_unicode(record)
    except TypeError as e:
        logging.warning("Unicode Error on: {}".format(record['ID']))
        record['unicode_error'] = str(e)
        # Create the list on demand so a record without one does not turn
        # the handled conversion error into a KeyError.
        record.setdefault('error', []).append("unicode_error")
def __call__(self, rec):
    """Normalise one bibtex record and return it.

    Steps: unicode conversion and ``type`` customization, quote and
    ``{\\nbsp}`` replacement in every field, journal lookup in
    ``_bib_journals``, author reformatting to ``"rest Surname"`` joined by
    ``"; "``, and finally the ``wl_cite`` field from ``_bib_cite``.
    ``wl_mypos`` records the 1-based position of the author whose lowercased
    surname equals ``self.mylsurname``.
    """
    from bibtexparser.customization import author, type, convert_to_unicode

    rec = convert_to_unicode(rec)
    rec = type(rec)

    for key in rec.keys():
        cleaned = rec.get(key)
        for old, new in (("{\\nbsp}", nbsp), ("``", u"“"), ("''", u"”")):
            cleaned = cleaned.replace(old, new)
        rec[key] = cleaned

    if "journal" in rec:
        journal = rec["journal"]
        rec["journal"] = _bib_journals.get(journal.lower(), journal)

    rec = author(rec)

    if "author" in rec:
        formatted = []
        for position, raw_name in enumerate(rec["author"], 1):
            name = raw_name.replace("{", "").replace("}", "").replace("~", " ")
            surname, rest = name.split(",", 1)
            if surname.lower() == self.mylsurname:
                rec["wl_mypos"] = unicode(position)
            formatted.append(rest + " " + surname.replace(" ", "_"))
        rec["author"] = "; ".join(formatted)

    rec["wl_cite"] = _bib_cite(rec)
    return rec
def __call__(self, rec):
    """Clean up a single bibtex record in place and hand it back.

    The record is converted to unicode and passed through the ``type``
    customization; LaTeX quotes and ``{\\nbsp}`` are replaced in every
    field; the journal is mapped through ``_bib_journals``; authors are
    rewritten as ``"rest Surname"`` and joined with ``"; "``; ``wl_cite`` is
    filled from ``_bib_cite``.
    """
    from bibtexparser.customization import author, type, convert_to_unicode

    replacements = (
        ('{\\nbsp}', nbsp),
        ('``', u'“'),
        ("''", u'”'),
    )

    rec = type(convert_to_unicode(rec))
    for field in rec.keys():
        text = rec.get(field)
        for needle, substitute in replacements:
            text = text.replace(needle, substitute)
        rec[field] = text

    if 'journal' in rec:
        name = rec['journal']
        rec['journal'] = _bib_journals.get(name.lower(), name)

    rec = author(rec)

    if 'author' in rec:
        rewritten = []
        for index, entry in enumerate(rec['author']):
            entry = entry.replace('{', '').replace('}', '').replace('~', ' ')
            surname, rest = entry.split(',', 1)
            if surname.lower() == self.mylsurname:
                # Positions are reported 1-based.
                rec['wl_mypos'] = unicode(index + 1)
            rewritten.append(rest + ' ' + surname.replace(' ', '_'))
        rec['author'] = '; '.join(rewritten)

    rec['wl_cite'] = _bib_cite(rec)
    return rec
def customizations(entry):
    """Pass ``entry`` through the customization steps, in order, and return it."""
    steps = (clear_empty, author, page_endash, convert_to_unicode, clean_latex)
    for step in steps:
        entry = step(entry)
    return entry
def _customizations_unicode(record):
    """
    Customize a record by applying, in order, the bibtexparser
    ``page_double_hyphen``, ``convert_to_unicode`` and ``author``
    customizations. See bibtexparser lib for more info.
    """
    return customization.author(
        customization.convert_to_unicode(
            customization.page_double_hyphen(record)
        )
    )
def customizations(record):
    """Convert a record with the library's ``convert_to_unicode``.

    :param record: a record
    :returns: -- customized record
    """
    return convert_to_unicode(record)
def _customizations(record):
    """
    Bibtexparser customizations that are applied to every entry found in the
    .bib files
    """
    pipeline = (
        convert_to_unicode,
        type,    # make the entry types lower-case
        author,  # split the authors into a list
        editor,  # split the editors into a list
    )
    for step in pipeline:
        record = step(record)
    return record
def test_convert_to_unicode(self):
    """Check LaTeX-to-unicode conversion on a table of (input, expected) records."""
    cases = [
        # The backslash is escaped explicitly: '\`' is not a valid escape
        # sequence and triggers a warning on recent Python versions.
        ({'toto': '{\\`a} \\`{a}'}, {'toto': 'à à'}),
        ({'toto': '{\\"u} \\"{u}'}, {'toto': 'ü ü'}),
        # From issue 121
        ({'title': '{Two Gedenk\\"uberlieferung der Angelsachsen}'},
         {'title': 'Two Gedenküberlieferung der Angelsachsen'}),
        # From issue 161
        ({'title': r"p\^{a}t\'{e}"}, {'title': "pâté"}),
        ({'title': r"\^{i}le"}, {'title': "île"}),
        ({'title': r"\texttimes{}{\texttimes}\texttimes"}, {'title': "×××"}),
    ]
    for record, expected in cases:
        result = convert_to_unicode(record)
        self.assertEqual(result, expected)
def __init__(self, data_: dict):
    """Build the entry from a raw bibtex record.

    The record is converted to unicode, ``<br>`` markers and surrounding
    whitespace are removed from string values, and the author, page, type
    and doi handlers are applied. The cleaned record is kept as
    ``self.data`` and every required/optional field is exposed through item
    access (``None`` when absent).
    """
    data_ = bib_custom.convert_to_unicode(data_)
    for key in data_:
        value = data_[key]
        if not isinstance(value, str):
            continue
        data_[key] = value.replace('<br>', '').strip()

    for transform in (handle_authors, handle_pages, bib_custom.type, bib_custom.doi):
        data_ = transform(data_)

    super().__init__(self)
    self.data = data_

    known_fields = set.union(self.required_fields, self.optional_fields)
    for field in known_fields:
        self[field] = self.data.get(field, None)
def customizations(record):
    """Use some functions delivered by the library.

    Applies ``author`` and ``convert_to_unicode``, then runs ``strip_chars``
    over the ``annote`` field when the record has one.

    :param record: a record
    :returns: the customized record
    """
    record = author(record)
    record = convert_to_unicode(record)
    # Not every entry carries an annotation; skip instead of raising KeyError.
    if 'annote' in record:
        record['annote'] = strip_chars(record['annote'])
    return record
def test_convert_to_unicode(self):
    """Check LaTeX-to-unicode conversion on a table of (input, expected) records."""
    cases = [
        # The backslash is escaped explicitly: '\`' is not a valid escape
        # sequence and triggers a warning on recent Python versions.
        ({'toto': '{\\`a} \\`{a}'}, {'toto': 'à à'}),
        ({'toto': '{\\"u} \\"{u}'}, {'toto': 'ü ü'}),
        # From issue 121
        ({'title': '{Two Gedenk\\"uberlieferung der Angelsachsen}'},
         {'title': 'Two Gedenküberlieferung der Angelsachsen'}),
        # From issue 161
        ({'title': r"p\^{a}t\'{e}"}, {'title': "pâté"}),
        ({'title': r"\^{i}le"}, {'title': "île"}),
    ]
    for record, expected in cases:
        result = convert_to_unicode(record)
        self.assertEqual(result, expected)
def bibtex_cleaner(entry):
    """Clean one bibtex entry and return it.

    After the ``keyword`` customization a non-empty ``keyword`` value is
    joined with commas and lowercased; the entry then goes through
    ``page_double_hyphen``, ``convert_to_unicode``, ``link`` and ``doi``.
    """
    entry = clean.keyword(entry)
    keywords = entry.get('keyword')
    if keywords:
        entry['keyword'] = ','.join(keywords).lower()

    remaining_steps = (
        clean.page_double_hyphen,
        clean.convert_to_unicode,
        clean.link,
        clean.doi,
    )
    for step in remaining_steps:
        entry = step(entry)
    return entry
def td_biblio_customization(record):
    """
    Customize BibTex records parsing
    """
    pipeline = (
        to_latex,  # first normalise problematic input to LaTeX ...
        bp_customization.convert_to_unicode,  # ... then convert it to unicode
        bp_customization.type,
        bp_customization.author,
        bp_customization.editor,
        bp_customization.page_double_hyphen,
    )
    for step in pipeline:
        record = step(record)
    return record
def _parse_bib_entry(entry):
    """
    Customization function for bibtexparser.

    Unicode conversion runs first, and only when ``CONVERT_TO_UNICODE`` is
    set; the author, editor, keyword and page_double_hyphen customizations
    always run.

    :param entry: bibtex record to modify
    :return bibtex record
    """
    steps = [
        bib_custom.author,
        bib_custom.editor,
        bib_custom.keyword,
        bib_custom.page_double_hyphen,
    ]
    if CONVERT_TO_UNICODE:
        steps.insert(0, bib_custom.convert_to_unicode)

    for step in steps:
        entry = step(entry)
    return entry
def customizations(record):
    """Use some functions delivered by the library

    Applied in order: convert_to_unicode, author, editor,
    page_double_hyphen. The type, journal, keyword, link and doi
    customizations are not applied here.

    :param record: a record
    :returns: -- customized record
    """
    return page_double_hyphen(editor(author(convert_to_unicode(record))))
def customize(record):
    """
    Customise bibtexparser records
    """
    record = customization.convert_to_unicode(record)

    # Fields that are absent are simply left out.
    for field_name in ('author', 'title', 'journal'):
        try:
            record[field_name] = tex_to_html(record[field_name])
        except KeyError:
            continue

    # Split the author field into a list of authors, then turn each author
    # into a (last, first) pair.
    record = customization.author(record)
    record = split_authors(record)
    return pages_endash(record)
def customizations(record):
    """Use some functions delivered by the library

    :param record: a record
    :returns: -- customized record
    """
    pipeline = (
        bc.convert_to_unicode,
        bc.type,  # lowercase
        bc.author,
        bc.editor,
        bc.journal,
        bc.keyword,
        bc.link,
        bc.page_double_hyphen,
        bc.doi,
    )
    for step in pipeline:
        record = step(record)
    return record
def process(self, record):
    """Turn one bibtex record into a package dict and create or update it.

    The record is converted to unicode, mapped onto the package fields, and
    sent to ``package_update`` when ``model.Package.get`` finds a package
    with the record's ID, otherwise to ``package_create``. The ``ID``,
    ``title``, ``abstract``, ``author`` and ``keywords`` fields are read
    unconditionally.
    """
    record = convert_to_unicode(record)

    record_id = record['ID']
    title = record['title']
    data_dict = {
        'id': record_id,
        'title': title.strip('{}'),
        'name': munge_title_to_name(record_id + title),
        'notes': record['abstract'],
        'harvest_source': 'MENDELEY',
        'creator': record['author'].replace(',', '').split(' and '),
        'tag_string': ','.join(map(munge_tag, record['keywords'].split(','))),
        'owner_org': tk.config.get(
            'ckanext.ingestor.config.mendeley_bib.owner_org', 'iaea'),
        'type': 'publications',
    }

    data_dict['identifier'] = [
        scheme + ':' + record[scheme]
        for scheme in ('doi', 'isbn', 'pmid')
        if scheme in record
    ]

    # Optional fields that are stored as single-element lists.
    optional_fields = (
        ('editor', 'contributor'),
        ('publisher', 'publisher'),
        ('language', 'language'),
    )
    for source_key, target_key in optional_fields:
        if source_key in record:
            data_dict[target_key] = [record[source_key]]

    data_dict['source'] = record.get('url')

    site_user = tk.get_action('get_site_user')({'ignore_auth': True})
    if model.Package.get(data_dict['id']):
        action_name = 'package_update'
    else:
        action_name = 'package_create'
    context = {'ignore_auth': True, 'user': site_user['name']}
    tk.get_action(action_name)(context, data_dict)
def btex_custom(self, record):
    """Customize one bibtex record and return it.

    Converts the record to unicode, rewrites a hyphenated page range with an
    en dash, and derives the ``author``, ``surnames``, ``author_year`` and
    ``unique_suffix`` fields. The editor (or ``'Anon.'``) stands in when
    there is no author; a missing year is treated as empty.
    """
    r = convert_to_unicode(record)

    # fix -- -> –
    if "pages" in record and "-" in record["pages"]:
        parts = [chunk.strip().strip('-') for chunk in record["pages"].split("-")]
        record["pages"] = parts[0] + u'–' + parts[-1]

    authors = r.get('author') or r.get('editor', 'Anon.')
    names = getnames(authors.split(" and "))
    surnames = self.and_authors([name.split(",")[0].strip() for name in names])

    r['author'] = self.and_authors(names)
    r['surnames'] = surnames

    author_year = surnames + u" " + r.get('year', '')
    r['author_year'] = author_year
    suffix = self.unique_suffix(author_year)
    r['unique_suffix'] = suffix
    r['author_year'] = author_year + suffix

    # Self-assignment kept from the original: it leaves the title untouched
    # but still requires the field to be present.
    r['title'] = r['title']
    return r
def btex_custom(self, record):
    """Customize one bibtex record and return it.

    Converts the record to unicode, rewrites a hyphenated page range with an
    en dash, and derives the ``author``, ``surnames``, ``author_year`` and
    ``unique_suffix`` fields. The editor (or ``'Anon.'``) stands in when
    there is no author.

    :param record: bibtex record
    :returns: the record returned by ``convert_to_unicode``, with the
        derived fields added
    """
    r = convert_to_unicode(record)
    if "pages" in record:
        # fix -- -> –
        if "-" in record["pages"]:
            p = [i.strip().strip('-') for i in record["pages"].split("-")]
            record["pages"] = p[0] + u'–' + p[-1]

    authors = r.get('author')
    if not authors:
        authors = r.get('editor', 'Anon.')

    _authors = getnames(authors.split(" and "))
    _and_surnames = self.and_authors(
        [s.split(",")[0].strip() for s in _authors])

    r['author'] = self.and_authors(_authors)
    r['surnames'] = _and_surnames
    # Entries without a year are valid bibtex; fall back to an empty string
    # instead of raising KeyError.
    r['author_year'] = _and_surnames + u" " + r.get('year', '')
    r['unique_suffix'] = self.unique_suffix(r['author_year'])
    r['author_year'] += r['unique_suffix']
    # NOTE(review): this self-assignment changes nothing but still requires
    # a title to be present -- confirm whether that is intended.
    r['title'] = r['title']
    return r
def customizations(record):
    '''
    Use some customizations for bibtexparser

    The steps run in this order: convert_to_unicode, author, editor,
    page_double_hyphen.

    Args:
        record: A record

    Returns:
        record: Customized record
    '''
    for step in (convert_to_unicode, author, editor, page_double_hyphen):
        record = step(record)
    return record
def custom_callback(record):
    """Customize one record: unicode conversion, journal macro expansion
    and removal of braces/tildes from the author string.
    """
    get_ADS_jrnls()

    # Convert to unicode
    record = cus.convert_to_unicode(record)

    # Convert journal macro to real name
    if 'journal' in record and '\\' in record['journal']:
        macro_name = record["journal"].strip('\\')
        record["journal"] = macro['j_name'][macro['macro'] == macro_name].values[0]

    # Convert author strings: drop braces, turn "~" into a space
    if 'author' in record:
        substitutions = {"{": "", "}": "", "~": " "}
        pattern = re.compile("|".join(re.escape(ch) for ch in substitutions))
        record['author'] = pattern.sub(
            lambda found: substitutions[found.group(0)], record['author'])

    return record
def bib_customizations(record):
    """Convert a record to unicode, shorten its author list and truncate its title."""

    def truncate_title(record):
        # A missing title is treated as an empty one.
        record['title'] = smart_truncate(record.get('title', ''))
        return record

    def et_al(record):
        # Collapse the author list to "<first author> et al."
        names = [
            name.replace(', ', ' ').replace(',', ' ')
            for name in record.get('author', [])
        ]
        if not names:
            record['author'] = ''
        elif len(names) == 1:
            record['author'] = names[0]
        else:
            record['author'] = names[0] + ' et al.'
        return record

    for step in (convert_to_unicode, author, et_al, truncate_title):
        record = step(record)
    return record
def main():
    """Command-line entry point: abbreviate journal names in a bib file.

    Reads the target file, replaces every multi-word ``journal`` value that
    has an entry in the abbreviation table (or the reverse mapping with
    ``--reverse``) and writes the resulting bibtex to ``--output`` or stdout.
    Journals missing from the table are reported through the logger and left
    unchanged.
    """
    parser = ArgumentParser()
    parser.add_argument("target", help="The bib file to abbreviate.")
    parser.add_argument(
        "-o", "--output",
        help="The output file name. If missing, output will be sent to stdout.")
    parser.add_argument(
        "-r", "--reverse",
        help="Reverse the process and unabbreviate journal names.",
        action="store_true")
    parser.add_argument(
        "-a", "--abbreviations",
        help="Path to a file of abbreviations in the form (one per line): Journal of Biological Science = J. Sci. Biol.",
        default=os.path.join(determine_path(), "journal_files",
                             "journal_abbreviations_general.txt", )
    )
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args()

    logger.setLevel(logging.INFO if args.verbose else logging.WARNING)

    # Read the whole input before the output is opened, so writing back to
    # the same path does not truncate the file before it has been parsed.
    with open(args.target, "r") as bib_file:
        refs_bp = BibTexParser(bib_file.read())
    all_refs = refs_bp.get_entry_dict()
    abbrevs = load_abbrevs(args.abbreviations, reverse=args.reverse)

    # Assume that if it has a journal key, then it needs abbreviating. I'm doing this
    # instead of testing for type==article in case I've forgotten about a case where
    # type != article but there's a journal field.
    # Also, journal names with one word ('Nature') don't require
    # abbreviation.
    refs = {
        key: ref for key, ref in all_refs.items()
        if 'journal' in ref and len(ref['journal'].split(' ')) > 1
    }

    for key, ref in refs.items():
        # Work on a copy so only the journal field of the entry is changed.
        journal = convert_to_unicode(copy(ref))['journal'].lower()
        # Handle any difficult characters. TODO: check that this list
        # is complete.
        journal_clean = re.sub('[{}]', '', journal)
        try:
            abbreviation = abbrevs[journal_clean]
        except KeyError:
            logger.error('%s not found in abbreviations!', journal_clean)
            continue
        ref['journal'] = abbreviation
        logger.info('%s replaced with %s for key %s', journal, abbreviation, key)

    output_bib = to_bibtex(refs_bp)
    if args.output:
        with open(args.output, "w") as output:
            output.write(output_bib)
    else:
        sys.stdout.write(output_bib)
def test_convert_to_unicode3(self):
    """Accent commands with no argument are expected to become combining marks."""
    converted = convert_to_unicode({'toto': "\\c \\'"})
    self.assertEqual(converted, {'toto': " \u0327\u0301"})
def custom(record):
    """Customize one bibtex record: unicode, attached files, tags.

    Steps, each of which records a failure under ``record['error']`` instead
    of aborting:

    1. convert the record to unicode;
    2. normalise the ``file`` field and add a ``hashes`` field for the
       attached files when none is present;
    3. offer (interactively) to copy files that live outside
       ``args.library`` into ``<library>/<year>/<surnames>/``;
    4. merge ``keywords`` and ``mendeley-tags`` into ``tags`` when the record
       has no ``tags`` yet.

    :param record: bibtex record
    :returns: the customized record
    """
    from os import makedirs

    try:
        record = c.convert_to_unicode(record)
    except TypeError:
        logging.warning("Unicode Error on: {}".format(record['ID']))
        record['error'] = 'unicode'

    # Bound up front so the copy step below is well-defined even when the
    # file field is missing or malformed.
    file_set = set()
    try:
        # add md5 of associated files
        files = [add_slash_if_necessary(y)
                 for x in record['file'].split(';')
                 for y in x.split(':')
                 if bool(y.strip()) and y.strip().lower() != 'pdf']
        file_set = set(files)
        if 'hashes' not in record:
            hashes = [file_to_hash(x) for x in file_set]
            record['hashes'] = ";".join(hashes)
        # regularize format of files list
        record['file'] = ";".join(file_set)
    except Exception as e:
        detail = e.args[0] if e.args else e
        logging.warning("File Error: {} : {}".format(record['ID'], detail))
        record['error'] = 'file'

    # todo: if file is not in the library common prefix, move it there
    # look for year, then first surname, then copy in, making dir if necessary
    # Iterate over a snapshot: file_set itself is updated inside the loop.
    for x in list(file_set):
        try:
            current_path = realpath(x)
            common = commonpath([current_path, args.library])
            if common != args.library:
                logging.info("Found file outside library: {}".format(current_path))
                logging.info("Common: {}".format(common))
                # get the author and year
                year = record['year']
                authors = c.getnames([i.strip() for i in record["author"].replace('\n', ' ').split(" and ")])
                authors_split = [c.splitname(a) for a in authors]
                author_surnames = [a['last'][0] for a in authors_split]
                new_path = join(args.library, year, ", ".join(author_surnames))
                logging.info("New Path: {}".format(new_path))
                full_new_path = join(new_path, split(current_path)[1])
                logging.info("Copying file")
                logging.info("From: {}".format(current_path))
                logging.info("To: {}".format(full_new_path))
                response = input("Enter to confirm: ")
                if response == "":
                    logging.info("Proceeding")
                    # The year directory may not exist yet either, so create
                    # the whole chain.
                    if not exists(new_path):
                        makedirs(new_path)
                    if exists(full_new_path):
                        raise Exception("File already exists")
                    copyfile(x, full_new_path)
                    file_set.remove(x)
                    file_set.add(full_new_path)
                    record['file'] = ";".join(file_set)
        except Exception as e:
            logging.info("Issue copying file for: {}".format(x))
            logging.info(e)
            record['error'] = 'file_copy'

    # regularize keywords
    try:
        keywords = set()
        if 'tags' not in record:
            if 'keywords' in record:
                keywords.update([x.strip() for x in record['keywords'].split(',')])
                del record['keywords']
            if 'mendeley-tags' in record:
                keywords.update([x.strip() for x in record['mendeley-tags'].split(',')])
                del record['mendeley-tags']
            record['tags'] = ",".join(keywords)
    except Exception:
        logging.warning("Tag Error: {}".format(record['ID']))
        record['error'] = 'tag'

    return record
def customize(record):
    """Apply the ``convert_to_unicode`` and then the ``author`` customization."""
    return customization.author(customization.convert_to_unicode(record))
def get_unicode_bibdata(self):
    """Return ``self.bibdata`` after passing it through ``convert_to_unicode``
    (latex in the fields becomes unicode).
    """
    bibdata = self.bibdata
    return convert_to_unicode(bibdata)
def _mixed_customization(record):
    """Apply homogeneize_latex_encoding, convert_to_unicode and ``bc.author``, in that order."""
    for step in (homogeneize_latex_encoding, convert_to_unicode, bc.author):
        record = step(record)
    return record
def customize(record):
    """Customize one bibtex record for the publication pages.

    Newlines are normalised first; the record then goes through the
    bibtexparser ``type``, ``convert_to_unicode``, ``author``, ``journal``,
    ``keyword``, ``link``, ``doi`` and ``page_double_hyphen`` customizations.
    Authors become ``Author`` objects, ``booktitle``/``series`` become
    ``{'name', 'ID'}`` dicts, and a record marked ``paper == "yes"`` is
    checked against the local PDF vault.

    :param record: bibtex record
    :returns: the customized record
    """

    def fix_newlines(record):
        for key, value in record.items():
            # Equality, not substring membership: only the url field has its
            # newlines removed outright.
            if key == 'url':
                record[key] = value.replace("\n", "")
            if key not in ('author', 'url', 'editor'):
                value = value.replace("\n", " ")
                record[key] = value.replace(r"\par", "\n\n")
        return record

    def split_author(record):
        if 'author' in record:
            authors = []
            for author in record['author']:
                # Split on the first separator only, so a name containing a
                # further ", " does not break the unpacking.
                lastname, firstname = author.split(", ", 1)
                authors.append(Author(firstname, lastname))
            record['author'] = authors
        return record

    def parse_kind(kind, record):
        if kind in record and record[kind]:
            remove_translate_table = str.maketrans('', '', ', .')
            # record_id determines the name of the PDF
            # it's been hard-coded in the view:
            # layouts/partials/publications_icons.html
            # ----> this might want to be refactored
            record_id = record[kind].translate(remove_translate_table)
            record[kind] = {'name': record[kind], 'ID': record_id}
        return record

    def pdf_is_there(record):
        filename = record["ID"] + ".pdf"
        path_to_file = os.path.join(LOCAL_PDF_VAULT, filename)
        print(path_to_file)
        if os.path.isfile(path_to_file):
            print("\t PDF found!")
        else:
            print("\t NO PDF!!!")
            record["paper"] = "no"
        return record

    record = fix_newlines(record)
    record = customization.type(record)
    record = customization.convert_to_unicode(record)

    record = customization.author(record)
    record = customization.journal(record)
    record = customization.keyword(record)
    record = customization.link(record)
    record = customization.doi(record)
    record = customization.page_double_hyphen(record)
    record = split_author(record)

    for kind in ('booktitle', 'series'):
        record = parse_kind(kind, record)

    if "paper" in record and record["paper"] == "yes":
        return pdf_is_there(record)
    return record
def customizations(record):
    """Convert ``record`` to unicode, call ``pdf`` on it and return it.

    The return value of ``pdf`` is not used.
    """
    converted = convert_to_unicode(record)
    pdf(converted)
    return converted