def main():
    p = argparse.ArgumentParser()
    p.add_argument("input_file", help='The original .bib file you want to sanitize.')
    p.add_argument("config", help='The config file in JSON format.')
    p.add_argument("output_file", help='Name of the new sanitized file.')
    args = p.parse_args()

    # parse original bibfile
    with open(args.input_file) as bibFile:
        bibDB = bibtexparser.load(bibFile)

    # parse config file
    with open(args.config) as configFile:
        confDB = json.load(configFile)

    checkDuplicates(args.input_file)
    bibDB = checkMandatoryFieldsAndKeywords(bibDB, confDB['read_config'])
    checkTags(bibDB, confDB['read_config']['tag_regex'])

    # write results
    writer = BibTexWriter()
    writer.contents = ['entries']
    # use ordering that is defined in "sort_order" in the config.json file
    writer.order_entries_by = confDB["write_config"]["sort_order"]
    with open(args.output_file, 'w') as resFile:
        resFile.write(bibtexparser.dumps(bibDB, writer))
def getcitation():
    articlesparser = BibTexParser(common_strings=False)
    articlesparser.ignore_nonstandard_types = False
    with open('/home/limingtao/ircre-bibtex/ircreupdate/articles.bib', encoding='utf8') as articlesfile:
        articles_database = bibtexparser.load(articlesfile, articlesparser)
    articleentries = articles_database.entries

    import random
    samplelist = random.sample(range(len(articleentries)), 20)
    print(samplelist)

    for i in samplelist:
        print("---------------------------")
        print("Entry number: " + str(i))
        title = articleentries[i]['title']
        clusterid = articleentries[i]['clusterid']
        print("Title: " + title)
        print("Cluster ID: " + clusterid)
        if not clusterid == "unknown":
            print(str(i))
            try:
                citations = os.popen(
                    '''/usr/bin/python3 /home/limingtao/ircre-bibtex/ircreupdate/scholarpy/scholar.py -c 1 -C '''
                    + clusterid + ''' |grep -v list |grep Citations''').read().strip().split()[-1]
            except Exception:
                citations = "unknown"
        else:
            citations = "unknown"
        print("new Citations: " + citations)
        if 'cited' in articleentries[i]:
            oldcitednumber = int(articleentries[i]['cited'])
        else:
            oldcitednumber = 0
        print("Old Cited Number: " + str(oldcitednumber))
        if not citations == "unknown":
            citednumber = int(citations)
            if citednumber > oldcitednumber and (citednumber - oldcitednumber) < 8:
                articleentries[i]['cited'] = str(citednumber)

    writer = BibTexWriter()
    writer.indent = ' '
    writer.order_entries_by = ('order',)
    with open('/home/limingtao/ircre-bibtex/ircreupdate/cited-add-articles.bib', 'w', encoding='utf8') as newarticlefile:
        bibtexparser.dump(articles_database, newarticlefile, writer=writer)
    os.popen("cp /home/limingtao/ircre-bibtex/ircreupdate/cited-add-articles.bib tempcited-add-articles.bib")
    os.popen("cp /home/limingtao/ircre-bibtex/ircreupdate/articles.bib /home/limingtao/ircre-bibtex/ircreupdate/oldarticles.bib")
    with open('/home/limingtao/ircre-bibtex/ircreupdate/articles.bib', 'w', encoding='utf8') as newarticlefile:
        bibtexparser.dump(articles_database, newarticlefile, writer=writer)
    return 0
def write_bib(bib_database, filen="dl4m.bib"):
    """Write the entries stored in bib_database to the file filen."""
    writer = BibTexWriter()
    writer.indent = ' '
    writer.order_entries_by = ('noneyear', "author")
    with open(filen, "w", encoding="utf-8") as bibfile:
        bibfile.write(writer.write(bib_database))
def _writer():
    '''
    Return a configured bibtex writer.
    '''
    writer = BibTexWriter()
    writer.indent = ' '
    writer.order_entries_by = ('ID',)
    writer.display_order = ['title', 'author', 'editor']
    return writer
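For context, `_writer()` pairs naturally with the writer's `write` method. A minimal usage sketch; the sample database below is illustrative and not part of the original module:

from bibtexparser.bibdatabase import BibDatabase

db = BibDatabase()
db.entries = [{'ID': 'doe2020', 'ENTRYTYPE': 'article',
               'title': 'An Example Title', 'author': 'Jane Doe'}]
# Entries come out sorted by ID; title, author, editor print before other fields.
print(_writer().write(db))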
def dumps(bibman):
    db = bibtexparser.bparser.BibDatabase()
    db._entries_dict = bibman.cleaned
    db.entries = list(bibman.cleaned.values())
    writer = BibTexWriter()
    # writer.order_entries_by = ('type', 'author', 'year')
    writer.order_entries_by = None
    writer.contents = ['comments', 'entries']
    writer.indent = ' '
    new_text = bibtexparser.dumps(db, writer)
    return new_text
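Note that `dumps` reaches into the private `_entries_dict` attribute of `BibDatabase` rather than a public API. A sketch of how it might be called, assuming `bibman` is any object exposing a `cleaned` dict keyed by entry ID (the class here is hypothetical):

class FakeBibMan:
    # Stand-in for the real manager object; `cleaned` maps ID -> entry dict.
    cleaned = {'doe2020': {'ID': 'doe2020', 'ENTRYTYPE': 'article',
                           'title': 'An Example Title'}}

print(dumps(FakeBibMan()))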
def bibtexclassify():
    parser = BibTexParser(common_strings=False)
    parser.ignore_nonstandard_types = False
    with open('/home/limingtao/ircre-bibtex/ircreupdate/ircre.bib', encoding='utf8') as bibtexfile:
        ircrebib_database = bibtexparser.load(bibtexfile, parser)
    allentries = ircrebib_database.entries.copy()

    # get all articles
    article_entries = []
    for i in range(len(allentries)):
        if allentries[i]['ENTRYTYPE'] == 'article':
            article_entries.append(allentries[i].copy())
    article_database = BibDatabase()
    article_database.entries = article_entries

    writer = BibTexWriter()
    writer.indent = ' '
    writer.order_entries_by = ('order',)
    with open('/home/limingtao/ircre-bibtex/ircreupdate/articles.bib', 'w', encoding='utf8') as article_file:
        bibtexparser.dump(article_database, article_file, writer=writer)

    # get all book chapters and conference papers
    otherentries = []
    for i in range(len(allentries)):
        if allentries[i]['ENTRYTYPE'] in ('inbook', 'inproceedings', 'incollection'):
            otherentries.append(allentries[i].copy())
    other_database = BibDatabase()
    other_database.entries = otherentries

    writer2 = BibTexWriter()
    writer2.indent = ' '
    writer2.order_entries_by = ('order',)
    with open('/home/limingtao/ircre-bibtex/ircreupdate/others.bib', 'w', encoding='utf8') as others_file:
        bibtexparser.dump(other_database, others_file, writer=writer2)
    return 0
def write_bibtex_file(filename, db):
    """ Write BiBTeX file with content from db """
    writer = BibTexWriter()
    writer.order_entries_by = ('year', 'ID')
    with open(filename, 'wb') as output_file:
        bibtex_str = bibtexparser.dumps(db, writer=writer)
        output_file.write(bibtex_str.encode('utf8'))
    print("Wrote %i records into file '%s'" % (len(db.entries), filename))
def write_output(bib_entries, config):
    db = BibDatabase()
    # clean up the bib_entries
    for bib_entry in bib_entries:
        bib_entry.pop("sig1")
    db.entries = bib_entries

    writer = BibTexWriter()
    writer.indent = ' ' * config['space']
    writer.order_entries_by = ('ENTRYTYPE', 'title')
    if config['sort'] == 'ne':
        writer.order_entries_by = ('title', 'ENTRYTYPE')

    if config['inplace']:
        with open(config['input_path'], 'w') as output_file:
            output_file.write(writer.write(db))
    else:
        print('''
##########################################
# Formatted file preview
##########################################
''')
        print(writer.write(db))
def ircrebibmerge():
    articlesparser = BibTexParser(common_strings=False)
    articlesparser.ignore_nonstandard_types = False
    with open('/home/limingtao/ircre-bibtex/ircreupdate/sorted-articles.bib', encoding='utf8') as sortedarticle_file:
        sortedarticle_database = bibtexparser.load(sortedarticle_file, articlesparser)
    sortedarticles = sortedarticle_database.entries.copy()

    top15parser = BibTexParser(common_strings=False)
    top15parser.ignore_nonstandard_types = False
    with open('/home/limingtao/ircre-bibtex/ircreupdate/top15.bib', encoding='utf8') as top15_file:
        top15_database = bibtexparser.load(top15_file, top15parser)
    top15articles = top15_database.entries.copy()

    othersparser = BibTexParser(common_strings=False)
    othersparser.ignore_nonstandard_types = False
    with open('/home/limingtao/ircre-bibtex/ircreupdate/others.bib', encoding='utf8') as others_file:
        others_database = bibtexparser.load(others_file, othersparser)
    others = others_database.entries.copy()

    alldb = BibDatabase()
    entries = []
    for i in range(len(top15articles)):
        entries.append(top15articles[i].copy())
    for i in range(len(sortedarticles)):
        entries.append(sortedarticles[i].copy())
    for i in range(len(others)):
        entries.append(others[i].copy())
    alldb.entries = entries

    writer = BibTexWriter()
    writer.indent = ' '
    writer.order_entries_by = None
    with open('/home/limingtao/ircre-bibtex/ircreupdate/newircre.bib', 'w', encoding='utf8') as newircrebibfile:
        bibtexparser.dump(alldb, newircrebibfile, writer=writer)
    return 0
def test_sort_missing_field(self):
    bib_database = BibDatabase()
    bib_database.entries = [{'ID': 'b', 'ENTRYTYPE': 'article', 'year': '2000'},
                            {'ID': 'c', 'ENTRYTYPE': 'book', 'year': '2010'},
                            {'ID': 'a', 'ENTRYTYPE': 'book'}]
    writer = BibTexWriter()
    writer.order_entries_by = ('year', )
    result = bibtexparser.dumps(bib_database, writer)
    expected = "@book{a\n}\n\n@article{b,\n year = {2000}\n}\n\n@book{c,\n year = {2010}\n}\n\n"
    self.assertEqual(result, expected)
def parse_csv_file(args):
    """Parse a CSV file into bibtex format."""
    entries = []
    filter_rows = get_filter_rows(args)
    with open(args.csv_file) as csv_file:
        reader = csv.DictReader(csv_file)
        for i, row in enumerate(reader, 1):
            if i < args.starting_row:
                continue
            row = fix_columns_headers(row)
            key = entry_key(row)
            if filter_rows.get(key):
                continue
            if row['Item Type'] not in TYPES:
                print('ItemType not found: "{}"'.format(row['Item Type']))
                sys.exit()
            row_type = TYPES[row['Item Type']]
            entry = add_entry(row, row_type['type'], row_type['remap'])
            entries.append(entry)

    if args.randomize:
        entries = np.random.permutation(entries)  # pylint: disable=no-member

    for i, beg in enumerate(range(0, len(entries), args.max_entries), 1):
        file_name = args.bibtex_file
        if args.max_entries != MAX_ENTRIES:
            root, ext = os.path.splitext(file_name)
            file_name = '{}{}{}'.format(root, i, ext)
        print(i, len(entries[beg:beg + args.max_entries]))
        bibtex_db = BibDatabase()
        bibtex_db.entries = entries[beg:beg + args.max_entries]
        writer = BibTexWriter()
        writer.order_entries_by = None
        with open(file_name, 'w') as bibtex_file:
            bibtex_file.write(writer.write(bibtex_db))
def _cleanupBibTex(self, count):
    """Clean up bibtex and ensure uniform look."""
    import bibtexparser
    from bibtexparser.bparser import BibTexParser
    from bibtexparser.customization import homogeneize_latex_encoding

    parser = BibTexParser()
    parser.customization = homogeneize_latex_encoding
    bib = bibtexparser.loads(self.refs, parser=parser)

    # save results
    from bibtexparser.bwriter import BibTexWriter
    writer = BibTexWriter()
    writer.contents = ['entries']
    writer.indent = ' '
    writer.order_entries_by = ('ID',)

    self.number = len(bib.entries)
    self.refs = bibtexparser.dumps(bib, writer)
def getclusterid(title, author):
    parser = BibTexParser(common_strings=False)
    parser.ignore_nonstandard_types = False
    with open('/home/limingtao/ircre-bibtex/ircreupdate/articles.bib', encoding='utf8') as article_file:
        article_database = bibtexparser.load(article_file, parser)
    entries = article_database.entries

    print("---------------------------")
    print("---------------------------")
    print("---------------------------")
    print("Total articles number: " + str(len(entries)))
    print("---------------------------")
    print("---------------------------")
    print("---------------------------")

    writer = BibTexWriter()
    writer.indent = ' '
    writer.order_entries_by = ('order',)

    for i in range(len(entries)):
        if entries[i]['clusterid'] == 'unknown':
            print("---------------------------")
            print("Entry number: " + str(i))
            title = entries[i]['title']
            print("Title: " + title)
            try:
                clusterid = os.popen(
                    '''/home/limingtao/ircre-bibtex/ircreupdate/scholarpy/scholar.py -c 1 -t --phrase="'''
                    + title + '''" |grep ID| grep Cluster''').read().strip().split()[-1]
            except Exception:
                clusterid = "unknown"
            print("new Cluster ID: " + clusterid)
            entries[i]['clusterid'] = clusterid

    with open('/home/limingtao/ircre-bibtex/ircreupdate/clusterid-added-ircre.bib', 'w', encoding='utf8') as newbibfile:
        bibtexparser.dump(article_database, newbibfile, writer=writer)
    os.popen("cp /home/limingtao/ircre-bibtex/ircreupdate/clusterid-added-ircre.bib /home/limingtao/ircre-bibtex/ircreupdate/tempclusterid-added-ircre.bib")
    with open('/home/limingtao/ircre-bibtex/ircreupdate/clusterid-added-ircre.bib', 'w', encoding='utf8') as newbibfile:
        bibtexparser.dump(article_database, newbibfile, writer=writer)
    return 0
def main():
    import os
    import bibtexparser
    from bibtexparser.bwriter import BibTexWriter

    with open('ircre.bib', encoding='utf8') as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)
    entries = bib_database.entries

    print("---------------------------")
    print("---------------------------")
    print("---------------------------")
    print("Total articles number: " + str(len(entries)))
    print("---------------------------")
    print("---------------------------")
    print("---------------------------")

    writer = BibTexWriter()
    writer.indent = ' '
    writer.order_entries_by = ('order', )

    for i in range(len(entries)):
        if entries[i]['clusterid'] == 'unknown':
            print("---------------------------")
            print("Entry number: " + str(i))
            title = entries[i]['title']
            print("Title: " + title)
            try:
                clusterid = os.popen(
                    '''./scholarpy/scholar.py -c 1 -t --phrase="''' + title
                    + '''" |grep ID| grep Cluster''').read().strip().split()[-1]
            except Exception:
                clusterid = "unknown"
            print("new Cluster ID: " + clusterid)
            entries[i]['clusterid'] = clusterid

    with open('clusterid-added-ircre.bib', 'w', encoding='utf8') as newbibfile:
        bibtexparser.dump(bib_database, newbibfile, writer=writer)
    os.popen("cp clusterid-added-ircre.bib tempclusterid-added-ircre.bib")
    with open('clusterid-added-ircre.bib', 'w', encoding='utf8') as newbibfile:
        bibtexparser.dump(bib_database, newbibfile, writer=writer)
    return 0
def proc_bib(input_io: TextIOWrapper, output_io: TextIOWrapper, jdb: JournalDB,
             silent: bool = False, output_format: str = "bib", abbrev_type="iso4"):
    if not hasattr(Journal, abbrev_type):
        raise ValueError(f"Invalid abbreviation type `{abbrev_type}`")

    bib_db = bibtexparser.load(input_io)
    for entry in bib_db.entries:
        journaltitle = entry.get("journaltitle")
        if journaltitle is None:
            continue
        journaltitle = braces_regex.sub("", journaltitle)
        name_pattern = re.compile(fr"^{re.escape(journaltitle)}(:?.*)$",
                                  RegexFlag.IGNORECASE)
        # TODO: query using lambdas?
        # TODO: normalize names (just in index?).
        res = jdb.journals.query_one(Journal.names_key, name_pattern)
        if res:
            _, journal = res
            abbrev = getattr(journal, abbrev_type)
            if output_format == "bib":
                entry["journaltitle"] = f"{{{abbrev or journaltitle}}}"
            elif output_format == "sourcemap":
                gen_sourcemap_map(journal, journaltitle, abbrev, output_io)

        abbrev_msg = f"abbreviating to '{abbrev}'" if res else "no abbreviation found"
        if not silent:
            info(f"found journal name '{journaltitle}'; {abbrev_msg}.")

    if output_format == "bib":
        bib_writer = BibTexWriter()
        bib_writer.add_trailing_comma = True
        bib_writer.display_order = None
        bib_writer.indent = "\t"
        bib_writer.order_entries_by = None
        bibtex_code = bib_writer.write(bib_db)
        output_io.write(bibtex_code)
    elif output_format == "sourcemap":
        pass
def parse_bibtex(file, build_dir):
    """
    Parse merged bibtex file again with customization to clean citations.

    @type file: .bib file
    @param file: file to be parsed
    @type build_dir: file path
    @param build_dir: where to save
    """
    parser = BibTexParser()
    parser.customization = customizations
    years = []
    with open(file, 'r') as f:
        bibtex = bibtexparser.load(f, parser=parser)
    for i in range(len(bibtex.entries)):
        for key, value in bibtex.entries[i].items():
            if key == 'year':
                years.append(int(value))
    years.sort()
    years.reverse()

    # collect the distinct years, newest first
    years_no_repeat = []
    for i in range(len(years)):
        if years_no_repeat.count(years[i]) == 0:
            years_no_repeat.append(years[i])

    # write one bib file per year
    for i in range(len(years_no_repeat)):
        bibtext = copy.deepcopy(bibtex)
        array = []
        for j in range(len(bibtex.entries)):
            for key, value in bibtex.entries[j].items():
                if key == 'year':
                    if int(value) == years_no_repeat[i]:
                        array.append(bibtex.entries[j])
        bibtext.entries = array
        parse_file = os.path.join(build_dir, str(years_no_repeat[i]) + 'parsed.bib')
        writer = BibTexWriter()
        writer.order_entries_by = ('ENTRYTYPE', )
        with open(parse_file, 'w') as f:
            f.write(writer.write(bibtext))
def write_bib(db, order=False):
    """
    Write bibtex string.

    Args:
        db (BibDatabase): database object to dump.
        order (bool): whether to reorder entries upon writing.

    Returns:
        The dumped string.
    """
    # Custom writer
    writer = BibTexWriter()
    writer.indent = '\t'
    writer.order_entries_by = None

    # Replace month by numeric value
    for entry in db.entries:
        if 'month' in entry:
            for x in [MONTHS, MONTHS_FULL]:
                if entry['month'] in x:
                    entry['month'] = '{:02d}'.format(x.index(entry['month']) + 1)

    if order:
        # Manual sort
        order_entries_by = ('year', 'author', 'ID')
        sort_entries(db, order_entries_by)

    if not config.use_utf8_characters:
        db.entries = [
            nomenclature.encode_ascii_latex(entry) for entry in db.entries
        ]

    if config.protect_uppercase:
        for entry in db.entries:
            entry["title"] = latex.protect_uppercase(entry["title"])

    # Write bib string
    return writer.write(db)
def write_bibtex(bibtex_entries):
    bib_database = bibtexparser.bibdatabase.BibDatabase()
    for e in bibtex_entries:
        # pop the useless contents
        e.pop('created_time', None)
        e.pop('file', None)
        e.pop('abstract', None)
        for k in e:
            if isinstance(e[k], list):
                e[k] = ' and '.join(e[k])
            e[k] = unicode_to_latex(e[k])
    bib_database.entries = bibtex_entries
    writer = BibTexWriter()
    writer.contents = ['comments', 'entries']
    writer.indent = ' '
    writer.order_entries_by = ('ENTRYTYPE', 'author', 'year')
    bibtex_str = bibtexparser.dumps(bib_database, writer)
    return bibtex_str
def articlessort():
    parser = BibTexParser(common_strings=False)
    parser.ignore_nonstandard_types = False
    with open('/home/limingtao/ircre-bibtex/ircreupdate/articles.bib', encoding='utf8') as articlesfile:
        articles_database = bibtexparser.load(articlesfile, parser)
    articles = articles_database.entries.copy()

    # build numeric sort keys; missing or malformed fields sort as zero
    for i in range(len(articles)):
        try:
            articles[i]['sortkey1'] = float(articles[i]['impactfactor'])
        except (KeyError, ValueError):
            articles[i]['sortkey1'] = float(0)
        try:
            articles[i]['sortkey2'] = int(articles[i]['cited'])
        except (KeyError, ValueError):
            articles[i]['sortkey2'] = int(0)

    sorted_by_journalif_cited = sorted(
        articles,
        key=lambda x: (x['sortkey1'], x['journal'], x['sortkey2'], x['year']),
        reverse=True)

    for i in range(len(sorted_by_journalif_cited)):
        sorted_by_journalif_cited[i]['order'] = str(i).zfill(6)
    for i in range(len(sorted_by_journalif_cited)):
        sorted_by_journalif_cited[i].pop('sortkey1')
        sorted_by_journalif_cited[i].pop('sortkey2')

    sortedarticlesdatabase = BibDatabase()
    sortedarticlesdatabase.entries = sorted_by_journalif_cited

    writer = BibTexWriter()
    writer.indent = ' '
    writer.order_entries_by = ('order',)
    with open('/home/limingtao/ircre-bibtex/ircreupdate/sorted-articles.bib', 'w', encoding='utf8') as sortedarticlesfile:
        bibtexparser.dump(sortedarticlesdatabase, sortedarticlesfile, writer=writer)
    return 0
def getop15articles():
    parser = BibTexParser(common_strings=False)
    parser.ignore_nonstandard_types = False
    with open('/home/limingtao/ircre-bibtex/ircreupdate/articles.bib', encoding='utf8') as article_file:
        article_database = bibtexparser.load(article_file, parser)
    article_entries = article_database.entries.copy()

    for i in range(len(article_entries)):
        try:
            article_entries[i]['sortkey1'] = int(article_entries[i]['cited'])
        except (KeyError, ValueError):
            article_entries[i]['sortkey1'] = int(0)

    articles_sorted_by_cited = sorted(article_entries,
                                      key=lambda x: x['sortkey1'],
                                      reverse=True)

    # take the 15 most cited articles and mark them with a dedicated entry type
    top15articles = []
    for i in range(15):
        top15articles.append(articles_sorted_by_cited[i].copy())
    for i in range(len(top15articles)):
        top15articles[i]['ENTRYTYPE'] = 'toparticle'
        top15articles[i]['ID'] = top15articles[i]['ID'] + 'a'
    for i in range(len(top15articles)):
        top15articles[i].pop('sortkey1')

    top15_database = BibDatabase()
    top15_database.entries = top15articles

    writer = BibTexWriter()
    writer.indent = ' '
    writer.order_entries_by = None
    with open('/home/limingtao/ircre-bibtex/ircreupdate/top15.bib', 'w', encoding='utf8') as top15_file:
        bibtexparser.dump(top15_database, top15_file, writer=writer)
    return 0
def post_processing(output_bib_entries, removed_value_names, abbr_dict, sort):
    bibparser = bibtexparser.bparser.BibTexParser(ignore_nonstandard_types=False)
    bib_entry_str = ""
    for entry in output_bib_entries:
        for line in entry:
            if is_contain_var(line):
                continue
            bib_entry_str += line
        bib_entry_str += "\n"

    parsed_entries = bibtexparser.loads(bib_entry_str, bibparser)
    if len(parsed_entries.entries) < len(output_bib_entries) - 5:
        print("Warning: len(parsed_entries.entries) < len(output_bib_entries) - 5 -->",
              len(parsed_entries.entries), len(output_bib_entries))
        output_str = ""
        for entry in output_bib_entries:
            for line in entry:
                # if any([re.match(r".*%s.*=.*" % n, line) for n in removed_value_names if len(n) > 1]):
                #     continue
                output_str += line
            output_str += "\n"
        return output_str

    for output_entry in parsed_entries.entries:
        for remove_name in removed_value_names:
            if remove_name in output_entry:
                del output_entry[remove_name]
        for (short, pattern) in abbr_dict:
            for place in ["booktitle", "journal"]:
                if place in output_entry and re.match(pattern, output_entry[place]):
                    output_entry[place] = short

    writer = BibTexWriter()
    if not sort:
        writer.order_entries_by = None
    return bibtexparser.dumps(parsed_entries, writer=writer)
def write_bib(db, order=False):
    """
    Write bibtex string.

    Args:
        db (BibDatabase): database object to dump.
        order (bool): whether to reorder entries upon writing.

    Returns:
        The dumped string.
    """
    # Custom writer
    writer = BibTexWriter()
    writer.indent = '\t'
    writer.order_entries_by = None

    # Replace month by numeric value
    for entry in db.entries:
        if 'month' in entry and entry['month'] in MONTHS:
            entry['month'] = '{:02d}'.format(MONTHS.index(entry['month']) + 1)

    if order:
        # Manual sort
        order_entries_by = ('year', 'author', 'ID')
        sort_entries(db, order_entries_by)

    if not config.use_utf8_characters:
        db.entries = [nomenclature.encode_ascii_latex(entry) for entry in db.entries]

    if config.protect_uppercase:
        for entry in db.entries:
            entry["title"] = latex.protect_uppercase(entry["title"])

    # Write bib string
    return writer.write(db)
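Both `write_bib` variants above depend on module-level month tables that these excerpts do not show. A plausible sketch, inferred from the lookup logic; the names MONTHS and MONTHS_FULL come from the code, but their exact contents are an assumption:

# Assumed month tables: three-letter abbreviations and full names, index 0 = January.
MONTHS = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
          'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
MONTHS_FULL = ['january', 'february', 'march', 'april', 'may', 'june',
               'july', 'august', 'september', 'october', 'november', 'december']

With these tables, an entry whose month field is 'mar' is rewritten to '03' before dumping.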
def test_sort_type_id(self):
    writer = BibTexWriter()
    writer.order_entries_by = ('ENTRYTYPE', 'ID')
    result = bibtexparser.dumps(self.bib_database, writer)
    expected = "@article{b\n}\n\n@book{a\n}\n\n@book{c\n}\n\n"
    self.assertEqual(result, expected)
def test_sort_none(self):
    writer = BibTexWriter()
    writer.order_entries_by = None
    result = bibtexparser.dumps(self.bib_database, writer)
    expected = "@article{b\n}\n\n@book{c\n}\n\n@book{a\n}\n\n"
    self.assertEqual(result, expected)
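The two sort tests above assume a `self.bib_database` fixture built in `setUp`. A sketch consistent with the expected output strings, reconstructed from the assertions rather than copied from the original test module:

def setUp(self):
    self.bib_database = BibDatabase()
    # Insertion order b, c, a matches the `order_entries_by = None` expectation;
    # sorting by (ENTRYTYPE, ID) then yields article b, book a, book c.
    self.bib_database.entries = [{'ID': 'b', 'ENTRYTYPE': 'article'},
                                 {'ID': 'c', 'ENTRYTYPE': 'book'},
                                 {'ID': 'a', 'ENTRYTYPE': 'book'}]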
def write_bib(bib_database, filen="dl4m.bib"):
    writer = BibTexWriter()
    writer.indent = ' '
    writer.order_entries_by = ('noneyear', "author")
    with open(filen, "w", encoding="utf-8") as bibfile:
        bibfile.write(writer.write(bib_database))
    else:
        print(f'I detected a duplicate based on the key {id}. '
              f'I will augment it with a letter and try again. '
              f'Please double-check, if this is correct.. '
              f'my duplicate detection algorithm is pretty bad.\n\n')
        id = id_orig + letters[i]
        i += 1

if not duplicate:
    bib = re.sub(r'(@[a-z]*{)(.*),', r'\1' + id + ',', bib)
    bib_db = bibtexparser.loads(bib)
    db.entries.extend(bib_db.get_entry_list())
else:
    bib_db = None

if id_list:
    writer = BibTexWriter()
    writer.indent = '\t'
    writer.order_entries_by = ('year', 'ID')
    writer.add_trailing_comma = True
    with open('../_bibliography/pint.bib', 'w') as bibfile:
        bibfile.write(writer.write(db))

    for line in fileinput.input('../_bibliography/pint.bib', inplace=True):
        if '@comment{' in line:
            line = line.replace('@comment{', '')
        if re.match(r'%}+', line) is not None:
            line = re.sub(r'%}+', '%', line)
        line = line.rstrip('\r\n')
        print(line)
    },
    'loggers': {
        '': {
            'handlers': ['default'],
            'level': 'ERROR',
            'formatter': 'standard',
            'propagate': True
        }
    }
})

writer = BibTexWriter()
writer.contents = ['comments', 'entries']
writer.indent = ' '
writer.order_entries_by = ('ENTRYTYPE', 'author', 'year')


def create_id(t, year, title):
    return str(t) + "_" + str(year) + "_" + str(space_to_underscore(title))


def pdf(pdf_files, shared_pdf, bibtex_folder, bibtex_files, gscholar):
    for pdf in pdf_files:
        txt = re.sub(r"\W", " ", gs.convert_pdf_to_txt(pdf)).lower()
        # Research determined that cutting at 35 words gives the
        # highest accuracy
        words = txt.strip().split()[:35]
        words = " ".join(words)
        print(words)
        if gscholar:
            bib = load(gs.pdflookup(pdf, all, gs.FORMAT_BIBTEX)[0])
            keys = bib.entries[0].keys()
bib_database = bibtexparser.load(bibtex_file, parser=parser)
if bib_database:
    now = datetime.datetime.now()
    success = "{0} Loaded {1} found {2} entries".format(now, input_b, len(bib_database.entries))
    print(success)
else:
    now = datetime.datetime.now()
    errs = "{0} Failed to read {1}".format(now, input_b)
    print(errs)
    sys.exit(errs)

bibtex_str = None
if bib_database:
    writer = BibTexWriter()
    writer.order_entries_by = ('author', 'year', 'type')
    bibtex_str = bibtexparser.dumps(bib_database, writer)
    #print(str(bibtex_str))
    with open(output_b, "w") as text_file:
        print(bibtex_str, file=text_file)

if bibtex_str:
    now = datetime.datetime.now()
    success = "{0} Wrote to {1} with len {2}".format(now, output_b, len(bibtex_str))
    print(success)
else:
    now = datetime.datetime.now()
    errs = "{0} Failed to write {1}".format(now, output_b)
    print(errs)
    sys.exit(errs)
all_bibs = [
    'conf-papers.bib',
    'articles.bib',
    'journal-issues.bib',
    'reports.bib'
]

complete_entries = []
for filename in all_bibs:
    parser = bibtexparser.bparser.BibTexParser(common_strings=True)
    with open(filename, encoding='utf-8') as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file, parser=parser)
    entries = bib_database.entries
    for entry in entries:
        if filename == 'conf-papers.bib':
            entry['category'] = 'Conference'
        elif filename == 'articles.bib' or filename == 'journal-issues.bib':
            entry['category'] = 'Journal'
        elif filename == 'reports.bib':
            entry['category'] = 'Report'
        else:
            print("Unknown filename!")
            break
    complete_entries.extend(entries)

db = BibDatabase()
db.entries = complete_entries
writer = BibTexWriter()
writer.order_entries_by = ('author',)
with open('publications.bib', 'w', encoding='utf-8') as bibfile:
    bibtexparser.dump(db, bibfile, writer)
if bib_database:
    now = datetime.datetime.now()
    success = "{0} Loaded {1} found {2} entries".format(
        now, input_b, len(bib_database.entries))
    print(success)
else:
    now = datetime.datetime.now()
    errs = "{0} Failed to read {1}".format(now, input_b)
    print(errs)
    sys.exit(errs)

bibtex_str = None
if bib_database:
    writer = BibTexWriter()
    writer.order_entries_by = ('author', 'year', 'type')
    bibtex_str = bibtexparser.dumps(bib_database, writer)
    #print(str(bibtex_str))
    with open(output_b, "w") as text_file:
        print(bibtex_str, file=text_file)

if bibtex_str:
    now = datetime.datetime.now()
    success = "{0} Wrote to {1} with len {2}".format(now, output_b, len(bibtex_str))
    print(success)
else:
    now = datetime.datetime.now()
    errs = "{0} Failed to write {1}".format(now, output_b)
    print(errs)
    sys.exit(errs)
if os.path.exists(folder + '-clean'):
    print('cleaning ' + folder + '-clean/')
    for file in os.listdir(folder + '-clean'):
        try:
            if os.path.isfile(folder + '-clean/' + file):
                os.unlink(folder + '-clean/' + file)
        except Exception as e:
            print(e)
else:
    os.makedirs(folder + '-clean')

# Writer customization
writer = BibTexWriter()
writer.contents = ['entries']
writer.indent = ' '
writer.order_entries_by = ('ENTRYTYPE', 'author', 'year')

# Parser customization; a new parser is needed for each file
# parser = BibTexParser()
# parser.common_strings = True

# Bib dictionary for months
Months = """@STRING{ jan = "jan"}
@STRING{ feb = "feb"}
@STRING{ mar = "mar"}
@STRING{ apr = "apr"}
@STRING{ may = "may"}
@STRING{ jun = "jun"}
@STRING{ jul = "jul"}
@STRING{ aug = "aug"}
@STRING{ sep = "sep"}
def fix_conference_title_names(clean_text, key_list=None):
    """
    mass bibtex fixes

    CommandLine:
        ./fix_bib.py
    """
    # Find citations from the tex documents
    if key_list is None:
        key_list = find_used_citations(testdata_fpaths())
        key_list = list(set(key_list))
        ignore = ['JP', '?']
        for item in ignore:
            try:
                key_list.remove(item)
            except ValueError:
                pass

    unknown_confkeys = []
    conference_keys = [
        'journal',
        'booktitle',
    ]
    ignore_confkey = []

    bib_database = bibtexparser.loads(clean_text)
    bibtex_dict = bib_database.get_entry_dict()

    isect = set(ignore_confkey).intersection(
        set(constants_tex_fixes.CONFERENCE_TITLE_MAPS.keys()))
    assert len(isect) == 0, repr(isect)

    type_key = 'ENTRYTYPE'

    debug_author = ut.get_argval('--debug-author', type_=str, default=None)
    # ./fix_bib.py --debug_author=Kappes

    for key in bibtex_dict.keys():
        entry = bibtex_dict[key]

        if debug_author is not None:
            debug = debug_author in entry.get('author', '')
        else:
            debug = False

        if debug:
            print(' --- ENTRY ---')
            print(ut.repr3(entry))

        # Clip abstract
        if 'abstract' in entry:
            entry['abstract'] = ' '.join(entry['abstract'].split(' ')[0:7])

        # Remove keys
        remove_keys = [
            'note', 'urldate', 'series', 'publisher', 'isbn', 'editor',
            'shorttitle', 'copyright', 'language', 'month',
            # These will be put back in
            #'number',
            #'pages',
            #'volume',
        ]
        entry = ut.delete_dict_keys(entry, remove_keys)

        # Fix conference names
        confkeys = list(set(entry.keys()).intersection(set(conference_keys)))
        if len(confkeys) == 1:
            confkey = confkeys[0]
            old_confval = entry[confkey]
            # Remove curly braces
            old_confval = old_confval.replace('{', '').replace('}', '')
            if old_confval in ignore_confkey:
                print(ut.dict_str(entry))
                continue

            new_confval_candiates = []
            if old_confval.startswith('arXiv'):
                continue

            for conf in constants_tex_fixes.CONFERENCES:
                if conf.matches(old_confval):
                    conf_title = conf.accro()
                    if debug:
                        print('old_confval = %r' % (old_confval,))
                        print('conf_title = %r' % (conf_title,))
                    new_confval = conf_title
                    new_confval_candiates.append(new_confval)

            if len(new_confval_candiates) == 0:
                new_confval = None
            elif len(new_confval_candiates) == 1:
                new_confval = new_confval_candiates[0]
            else:
                assert False, 'double match'

            if new_confval is None:
                if key in key_list:
                    unknown_confkeys.append(old_confval)
            else:
                # Overwrite old confval
                entry[confkey] = new_confval

            # Record info about types of conferences
            true_confval = entry[confkey].replace('{', '').replace('}', '')

            # FIX ENTRIES THAT SHOULD BE CONFERENCES
            if true_confval in constants_tex_fixes.CONFERENCE_LIST:
                if entry[type_key] == 'inproceedings':
                    pass
                elif entry[type_key] == 'article':
                    entry['booktitle'] = entry['journal']
                    del entry['journal']
                elif entry[type_key] == 'incollection':
                    pass
                else:
                    raise AssertionError('UNKNOWN TYPE: %r' % (entry[type_key],))

                if 'booktitle' not in entry:
                    print('DOES NOT HAVE CORRECT CONFERENCE KEY')
                    print(ut.dict_str(entry))
                assert 'journal' not in entry, 'should not have journal'
                entry[type_key] = 'inproceedings'

            # FIX ENTRIES THAT SHOULD BE JOURNALS
            if true_confval in constants_tex_fixes.JOURNAL_LIST:
                if entry[type_key] == 'article':
                    pass
                elif entry[type_key] == 'inproceedings':
                    pass
                elif entry[type_key] == 'incollection':
                    pass
                else:
                    raise AssertionError('UNKNOWN TYPE: %r' % (entry['type'],))

                if 'journal' not in entry:
                    print('DOES NOT HAVE CORRECT CONFERENCE KEY')
                    print(ut.dict_str(entry))
                assert 'booktitle' not in entry, 'should not have booktitle'

        elif len(confkeys) > 1:
            raise AssertionError('more than one confkey=%r' % (confkeys,))

        # Fix authors
        if 'author' in entry:
            authors = six.text_type(entry['author'])
            for truename, alias_list in constants_tex_fixes.AUTHOR_NAME_MAPS.items():
                pattern = six.text_type(ut.regex_or(
                    [ut.util_regex.whole_word(alias) for alias in alias_list]))
                authors = re.sub(pattern, six.text_type(truename), authors,
                                 flags=re.UNICODE)
            entry['author'] = authors

    # article = journal, inproceedings = conference paper
    print(ut.list_str(sorted(unknown_confkeys)))
    print('len(unknown_confkeys) = %r' % (len(unknown_confkeys),))

    writer = BibTexWriter()
    writer.contents = ['comments', 'entries']
    writer.indent = ' '
    writer.order_entries_by = ('type', 'author', 'year')
    new_bibtex_str = bibtexparser.dumps(bib_database, writer)
    return new_bibtex_str
def main(bib_fpath=None):
    r"""
    Entry point to the fixbib script.

    CommandLine:
        fixbib
        python -m fixtex bib
        python -m fixtex bib --dryrun
        python -m fixtex bib --dryrun --debug
    """
    if bib_fpath is None:
        bib_fpath = 'My Library.bib'

    # Read in text and ensure ascii format
    dirty_text = ut.readfrom(bib_fpath)

    from fixtex.fix_tex import find_used_citations, testdata_fpaths

    if exists('custom_extra.bib'):
        extra_parser = bparser.BibTexParser(ignore_nonstandard_types=False)
        parser = bparser.BibTexParser()
        ut.delete_keys(parser.alt_dict, ['url', 'urls'])
        print('Parsing extra bibtex file')
        extra_text = ut.readfrom('custom_extra.bib')
        extra_database = extra_parser.parse(extra_text, partial=False)
        print('Finished parsing extra')
        extra_dict = extra_database.get_entry_dict()
    else:
        extra_dict = None

    print('BIBTEXPARSER LOAD')
    parser = bparser.BibTexParser(ignore_nonstandard_types=False,
                                  common_strings=True)
    ut.delete_keys(parser.alt_dict, ['url', 'urls'])
    print('Parsing bibtex file')
    bib_database = parser.parse(dirty_text, partial=False)
    print('Finished parsing')

    bibtex_dict = bib_database.get_entry_dict()
    old_keys = list(bibtex_dict.keys())
    new_keys = []
    for key in ub.ProgIter(old_keys, label='fixing keys'):
        new_key = key
        new_key = new_key.replace(':', '')
        new_key = new_key.replace('-', '_')
        new_key = re.sub('__*', '_', new_key)
        new_keys.append(new_key)
    assert len(ub.find_duplicates(new_keys)) == 0, 'new keys created conflict'

    for key, new_key in zip(old_keys, new_keys):
        if key != new_key:
            entry = bibtex_dict[key]
            entry['ID'] = new_key
            bibtex_dict[new_key] = entry
            del bibtex_dict[key]

    # The bibtext is now clean. Print it to stdout
    #print(clean_text)
    verbose = None
    if verbose is None:
        verbose = 1

    # Find citations from the tex documents
    key_list = None
    if key_list is None:
        cacher = ub.Cacher('texcite1', enabled=0)
        data = cacher.tryload()
        if data is None:
            fpaths = testdata_fpaths()
            key_list, inverse = find_used_citations(fpaths, return_inverse=True)
            if verbose:
                print('Found %d citations used in the document' % (len(key_list),))
            data = key_list, inverse
            cacher.save(data)
        key_list, inverse = data

    unknown_pubkeys = []
    debug_author = ub.argval('--debug-author', default=None)
    # ./fix_bib.py --debug_author=Kappes

    if verbose:
        print('Fixing %d/%d bibtex entries' % (len(key_list), len(bibtex_dict)))

    debug = False
    if debug_author is not None:
        debug = False

    known_keys = list(bibtex_dict.keys())
    missing_keys = set(key_list) - set(known_keys)
    if extra_dict is not None:
        missing_keys.difference_update(set(extra_dict.keys()))

    if missing_keys:
        print('The library is missing keys found in tex files %s' % (
            ub.repr2(missing_keys),))

        # Search for possible typos:
        candidate_typos = {}
        sedlines = []
        for key in missing_keys:
            candidates = ut.closet_words(key, known_keys, num=3, subset=True)
            if len(candidates) > 1:
                top = candidates[0]
                if ut.edit_distance(key, top) == 1:
                    # "sed -i -e 's/{}/{}/g' *.tex".format(key, top)
                    import os
                    replpaths = ' '.join(
                        [relpath(p, os.getcwd()) for p in inverse[key]])
                    sedlines.append("sed -i -e 's/{}/{}/g' {}".format(
                        key, top, replpaths))
            candidate_typos[key] = candidates
            print('Cannot find key = %r' % (key,))
            print('Did you mean? %r' % (candidates,))

        print('Quick fixes')
        print('\n'.join(sedlines))

        # Group by file
        just = max([0] + list(map(len, missing_keys)))
        missing_fpaths = [inverse[key] for key in missing_keys]
        for fpath in sorted(set(ub.flatten(missing_fpaths))):
            subkeys = [k for k in missing_keys if fpath in inverse[k]]
            print('')
            ut.cprint('--- Missing Keys ---', 'blue')
            ut.cprint('fpath = %r' % (fpath,), 'blue')
            ut.cprint('{} | {}'.format('Missing'.ljust(just), 'Did you mean?'), 'blue')
            for key in subkeys:
                print('{} | {}'.format(ut.highlight_text(key.ljust(just), 'red'),
                                       ' '.join(candidate_typos[key])))

    if extra_dict is not None:
        # Extra database takes precedence over regular
        key_list = list(ut.unique(key_list + list(extra_dict.keys())))
        for k, v in extra_dict.items():
            bibtex_dict[k] = v

    full = ub.argflag('--full')

    for key in key_list:
        try:
            entry = bibtex_dict[key]
        except KeyError:
            continue
        self = BibTexCleaner(key, entry, full=full)

        if debug_author is not None:
            debug = debug_author in entry.get('author', '')

        if debug:
            ut.cprint(' --- ENTRY ---', 'yellow')
            print(ub.repr2(entry, nl=1))

        entry = self.fix()

        if debug:
            print(ub.repr2(entry, nl=1))
            ut.cprint(' --- END ENTRY ---', 'yellow')
        bibtex_dict[key] = entry

    unwanted_keys = set(bibtex_dict.keys()) - set(key_list)
    if verbose:
        print('Removing unwanted %d entries' % (len(unwanted_keys)))
    ut.delete_dict_keys(bibtex_dict, unwanted_keys)

    if 0:
        d1 = bibtex_dict.copy()
        full = True
        for key, entry in d1.items():
            self = BibTexCleaner(key, entry, full=full)
            pub = self.publication()
            if pub is None:
                print(self.entry['ENTRYTYPE'])
            old = self.fix_pubkey()
            x1 = self._pubval()
            x2 = self.standard_pubval(full=full)
            if x1 != x2:
                print('x2 = %r' % (x2,))
                print('x1 = %r' % (x1,))
                print(ub.repr2(self.entry))
            if old:
                print('old = %r' % (old,))
            d1[key] = self.entry

    if full:
        d1 = bibtex_dict.copy()

        import numpy as np
        import pandas as pd
        df = pd.DataFrame.from_dict(d1, orient='index')

        paged_items = df[~pd.isnull(df['pub_accro'])]
        has_pages = ~pd.isnull(paged_items['pages'])
        print('have pages {} / {}'.format(has_pages.sum(), len(has_pages)))
        print(ub.repr2(paged_items[~has_pages]['title'].values.tolist()))

        entrytypes = dict(list(df.groupby('pub_type')))
        if False:
            g = entrytypes['online']
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
            entrytypes['book']
            entrytypes['thesis']
            g = entrytypes['article']
            g = entrytypes['incollection']
            g = entrytypes['conference']

        def lookup_pub(e):
            if e == 'article':
                return 'journal', 'journal'
            elif e == 'incollection':
                return 'booksection', 'booktitle'
            elif e == 'conference':
                return 'conference', 'booktitle'
            return None, None

        for e, g in entrytypes.items():
            print('e = %r' % (e,))
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
            if 'pub_full' in g.columns:
                place_title = g['pub_full'].tolist()
                print(ub.repr2(ub.dict_hist(place_title)))
            else:
                print('Unknown publications')

        if 'report' in entrytypes:
            g = entrytypes['report']
            missing = g[pd.isnull(g['title'])]
            if len(missing):
                print('Missing Title')
                print(ub.repr2(missing[['title', 'author']].values.tolist()))

        if 'journal' in entrytypes:
            g = entrytypes['journal']
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
            missing = g[pd.isnull(g['journal'])]
            if len(missing):
                print('Missing Journal')
                print(ub.repr2(missing[['title', 'author']].values.tolist()))

        if 'conference' in entrytypes:
            g = entrytypes['conference']
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
            missing = g[pd.isnull(g['booktitle'])]
            if len(missing):
                print('Missing Booktitle')
                print(ub.repr2(missing[['title', 'author']].values.tolist()))

        if 'incollection' in entrytypes:
            g = entrytypes['incollection']
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
            missing = g[pd.isnull(g['booktitle'])]
            if len(missing):
                print('Missing Booktitle')
                print(ub.repr2(missing[['title', 'author']].values.tolist()))

        if 'thesis' in entrytypes:
            g = entrytypes['thesis']
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
            missing = g[pd.isnull(g['institution'])]
            if len(missing):
                print('Missing Institution')
                print(ub.repr2(missing[['title', 'author']].values.tolist()))

    # Overwrite BibDatabase structure
    bib_database._entries_dict = bibtex_dict
    bib_database.entries = list(bibtex_dict.values())

    print('Unknown conference keys:')
    print(ub.repr2(sorted(unknown_pubkeys)))
    print('len(unknown_pubkeys) = %r' % (len(unknown_pubkeys),))

    writer = BibTexWriter()
    writer.contents = ['comments', 'entries']
    writer.indent = ' '
    writer.order_entries_by = ('type', 'author', 'year')
    new_bibtex_str = bibtexparser.dumps(bib_database, writer)

    # Need to check #jegou_aggregating_2012
    # Fix the journal abbreviations
    # References:
    #     https://www.ieee.org/documents/trans_journal_names.pdf

    # Write out clean bibfile in ascii format
    clean_bib_fpath = ub.augpath(bib_fpath.replace(' ', '_'), suffix='_clean')
    if not ub.argflag('--dryrun'):
        ut.writeto(clean_bib_fpath, new_bibtex_str)
def main():
    import os
    import bibtexparser
    from bibtexparser.bwriter import BibTexWriter

    with open('ircre.bib', encoding='utf8') as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)
    entries = bib_database.entries

    print("---------------------------")
    print("---------------------------")
    print("---------------------------")
    print("Total articles number: " + str(len(entries)))
    print("---------------------------")
    print("---------------------------")
    print("---------------------------")

    writer = BibTexWriter()
    writer.indent = ' '
    writer.order_entries_by = ('order', )

    articleentries = []
    for i in range(len(entries)):
        if entries[i]['ENTRYTYPE'] == 'article':
            articleentries.append(entries[i].copy())

    for n in range(len(entries) - 100):
        i = n + 100
        print("---------------------------")
        print("Entry number: " + str(i))
        title = entries[i]['title']
        clusterid = entries[i]['clusterid']
        print("Title: " + title)
        print("Cluster ID: " + clusterid)
        if not clusterid == "unknown":
            print("hello" + str(i))
            try:
                citations = os.popen('''./scholarpy/scholar.py -c 1 -C ''' + clusterid
                                     + ''' |grep -v list |grep Citations''').read().strip().split()[-1]
            except Exception:
                citations = "unknown"
        else:
            citations = "unknown"
        print("new Citations: " + citations)
        if 'cited' in entries[i]:
            oldcitednumber = int(entries[i]['cited'])
        else:
            oldcitednumber = 0
        print("Old Cited Number: " + str(oldcitednumber))
        if not citations == "unknown":
            citednumber = int(citations)
            if citednumber > oldcitednumber and (citednumber - oldcitednumber) < 8:
                entries[i]['cited'] = str(citednumber)

    with open('cited-add-ircre.bib', 'w', encoding='utf8') as newbibfile:
        bibtexparser.dump(bib_database, newbibfile, writer=writer)
    os.popen("cp cited-add-ircre.bib tempcited-add-ircre.bib")
    with open('cited-add-ircre.bib', 'w', encoding='utf8') as newbibfile:
        bibtexparser.dump(bib_database, newbibfile, writer=writer)
    return 0