def main():
    args = parser.parse_args()
    if args.output:
        error_out = expander(join(split(args.output)[0], ".bib_errors"))

    logging.info("---------- STARTING Bibtex Clean")
    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    bib_files = retrieval.get_data_files(args.target, ".bib")
    db = BU.parse_bib_files(bib_files, func=custom_clean)
    logging.info("Read %s entries", len(db.entries))

    # Get errors and write them out:
    error_tuples = ERRORS
    if bool(error_tuples) and args.output:
        formatted = "\n".join(["{} : {}".format(x, y) for x, y in error_tuples]) + "\n"
        with open(error_out, 'a') as f:
            f.write(formatted)

    # Write out the actual bibtex
    if args.output:
        logging.info("Writing out Cleaned Bibliography")
        writer = JGBibTexWriter()
        out_str = writer.write(db)
        with open(args.output, 'w') as f:
            f.write(out_str)
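# `custom_clean` and ERRORS are defined elsewhere in this project; the sketch
# below is an assumption inferred from the usage above: the hook receives one
# parsed entry, returns it (possibly modified), and records (id, message)
# tuples in the module-level ERRORS list for the error report.
ERRORS = []

def custom_clean_sketch(entry: dict) -> dict:
    # Hypothetical example checks, not the project's actual rules
    if 'year' not in entry:
        ERRORS.append((entry.get('ID', '?'), "missing year"))
    entry['title'] = entry.get('title', '').strip()
    return entry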
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))
    output_path = join(args.output, "{}_summary".format(args.tag))

    bibtex_files = retrieval.get_data_files(args.target, ".bib")
    db = b.bibdatabase.BibDatabase()
    BU.parse_bib_files(bibtex_files, func=bib_proc.tag_summary, database=db)

    # Select entries carrying the tag, oldest first
    entries_with_tag = [x for x in db.entries if args.tag in x['tags']]
    entries_by_year = sorted(entries_with_tag, key=lambda x: x['year'])
    pdfs_to_process = [x['file'] for x in entries_by_year]
    expanded_paths = [abspath(expanduser(x)) for x in pdfs_to_process]

    logging.info("Summarising {} pdfs".format(len(expanded_paths)))
    PU.summarise_pdfs(expanded_paths, output=output_path, bound=args.bound)
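# Example invocation (the script name is illustrative; the flags come from
# the argparse usage implied above):
#
#   python tag_summary.py --target ~/bibliography --tag deep_learning \
#          --output ~/reports --bound 10
#
# Note: entries without a 'file' field would raise a KeyError in the
# comprehension above; the tag_summary hook is assumed to guarantee it.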
def main():
    # see https://docs.python.org/3/howto/argparse.html
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog="\n".join(["Bibtex Tag Indexer"]))
    parser.add_argument('--target', action="append")
    parser.add_argument('--output', default="~/github/writing/resources/cron_reports/tag_bibtex.index")
    args = parser.parse_args()
    # --target is optional; fall back to the default bibliography when absent
    if not bool(args.target):
        args.target = ["/Volumes/documents/github/writing/resources/bibliography"]

    # Load bibs
    bibs = RET.collect_files(args.target)[0]
    db = bib_parse.parse_bib_files(bibs, func=bib_proc.tags)

    # Map to tags
    index = IndexFile()
    for entry in db.entries:
        for tag in entry['tags']:
            # TODO get all the `file[digit]` keys
            if 'file' in entry:
                index.add_files(tag, entry['file'])

    # Write out index
    out_string = str(index)
    with open(abspath(expanduser(args.output)), 'w') as f:
        f.write(out_string)
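# IndexFile is a project-local type; a minimal sketch of the interface the
# loop above relies on (the real class and its serialisation may differ):
from collections import defaultdict

class IndexFileSketch:
    def __init__(self):
        self.mapping = defaultdict(list)

    def add_files(self, tag: str, *files: str):
        # Record each file path under the given tag
        self.mapping[tag].extend(files)

    def __str__(self) -> str:
        # One "tag : file" line per pair, written verbatim to the index file
        return "\n".join("{} : {}".format(tag, fname)
                         for tag, files in sorted(self.mapping.items())
                         for fname in files)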
def main():
    args = parser.parse_args()
    args.library = [abspath(expanduser(x)) for x in args.library]
    args.output = abspath(expanduser(args.output))
    logging.info("---------- STARTING Bibtex Timelines")
    if not exists(args.output):
        logging.info("Making output: {}".format(args.output))
        mkdir(args.output)
    assert exists(args.output)

    all_bibs = retrieval.get_data_files(args.library, ".bib")
    logging.info("Found {} bib files".format(len(all_bibs)))
    db = b.bibdatabase.BibDatabase()
    BU.parse_bib_files(all_bibs, func=bib_proc.year_parse, database=db)
    logging.info("Loaded bibtex entries: {}".format(len(db.entries)))

    # Load totals_bib.tags
    # Filter for min_entries
    # Create a TimelineFile for each tag
    # Add citations to each tag TimelineFile
    # Write out timeline files
    tag_collection = defaultdict(list)
    for entry in db.entries:
        for tag in entry['tags']:
            tag_collection[tag].append(entry)
    logging.info("Collected Tags: {}".format(len(tag_collection)))

    # Then sort by year and write out
    for tag, entries in tag_collection.items():
        out_target = join(args.output, "{}.tag_timeline".format(tag))
        sorted_entries = sorted(entries, key=lambda x: x['year'])
        if len(sorted_entries) > args.min_entries:
            with open(out_target, 'w') as f:
                f.write("\n".join(["{} {}".format(x['year'].strftime("%Y"), x['ID'])
                                   for x in sorted_entries]))
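# The resulting "<tag>.tag_timeline" files are plain text, one citation per
# line, sorted oldest first, e.g.:
#
#   1986 rumelhart1986learning
#   1997 hochreiter1997long
#   2017 vaswani2017attention
#
# (the example keys are illustrative, not from the actual bibliography)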
def main():
    logging.info("---------- STARTING Tag Totals")
    cli_args = parser.parse_args()
    cli_args.output = abspath(expanduser(cli_args.output))
    logging.info("Targeting: {}".format(cli_args.target))
    # Assume the output argument names a directory; create it if missing
    if not exists(cli_args.output):
        mkdir(cli_args.output)
    if isdir(cli_args.output):
        cli_args.output = join(cli_args.output, "tags")
    logging.info("Output to: {}".format(cli_args.output))
    logging.info("Cleaned Tags locations: {}".format(cli_args.cleaned))

    bibs, htmls, orgs, bkmks = retrieval.collect_files(cli_args.target)
    bib_db = BU.parse_bib_files(bibs, func=bib_proc.tags)

    tag_graph = TagGraph()
    bib_tags = tag_graph.extract_bibtex(bib_db)
    org_tags = tag_graph.extract_org(orgs)
    bkmk_tags = tag_graph.extract_bookmark(bkmks)

    with open(cli_args.output + "_bib.tags", 'w') as f:
        f.write(str(bib_tags))
    with open(cli_args.output + "_org.tags", 'w') as f:
        f.write(str(org_tags))
    with open(cli_args.output + "_bkmk.tags", 'w') as f:
        f.write(str(bkmk_tags))
    with open(cli_args.output + "_total.tags", 'w') as f:
        f.write(str(tag_graph))
    logging.info("Completed Total Count --------------------")

    if not bool(cli_args.cleaned):
        sys.exit()

    # Load existing tag files
    cleaned = SubstitutionFile.builder(cli_args.cleaned)
    # Get new tags
    tags: TagFile = tag_graph.tags
    new_tags: TagFile = cleaned.difference(tags)

    # Group them separately, alphabetically,
    # to be included in the separate tag files
    with open(cli_args.output + "_new.tags", 'w') as f:
        f.write(str(new_tags))
    logging.info("Completed Uncleaned Count --------------------")
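# SubstitutionFile / TagFile are project-local types; a minimal sketch of the
# `difference` semantics assumed above (the real classes may differ):
class TagFileSketch:
    """Hypothetical stand-in: a bag of tag names with occurrence counts."""

    def __init__(self, counts: dict):
        self.counts = dict(counts)

    def difference(self, other: "TagFileSketch") -> "TagFileSketch":
        # Keep the tags `other` has seen that this (cleaned) file lacks,
        # i.e. the tags that still need cleaning entries
        return TagFileSketch({t: c for t, c in other.counts.items()
                              if t not in self.counts})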
def main():
    args = parser.parse_args()
    args.library = [abspath(expanduser(x)) for x in args.library]
    args.output = abspath(expanduser(args.output))

    all_bibs = retrieval.get_data_files(args.library, ".bib")
    logging.info("Found {} bib files".format(len(all_bibs)))
    db = b.bibdatabase.BibDatabase()
    BU.parse_bib_files(all_bibs, func=bib_proc.year_parse, database=db)
    logging.info("Loaded bibtex entries: {}".format(len(db.entries)))

    # Count entries over time, either for one tag or for the whole library
    if args.tag:
        year_counts = get_tag_across_years(db, args.tag)
    else:
        year_counts: List[Tuple[datetime, int]] = get_entries_across_years(db)
        year_counts = [x for x in year_counts if x[1] > 5]

    # Chart the counts
    to_draw = [("Years", year_counts)]
    # n_rows = int(len(to_draw) / 2)
    # n_cols = int(len(to_draw) / 2)
    n_rows = 1
    n_cols = 1
    for count, (name, data) in enumerate(to_draw):
        logging.info("Drawing {}".format(name))
        xs = [point[0] for point in data]
        ys = [point[1] for point in data]
        plt.subplot(n_rows, n_cols, count + 1)
        plt.scatter(xs, ys, alpha=0.3)
        plt.title(name)

    plt.gcf().autofmt_xdate()
    logging.info("Finished, saving")
    plt.savefig(args.output)
    plt.show()
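# get_tag_across_years / get_entries_across_years are defined elsewhere; a
# sketch of the assumed behaviour, inferred from the annotation above:
# count entries per year, keyed by a datetime for matplotlib's date axis.
from collections import Counter
from datetime import datetime
from typing import List, Tuple

def get_entries_across_years_sketch(db) -> List[Tuple[datetime, int]]:
    # Assumes entry['year'] is a year string; if the parse hook already
    # converts it to a datetime, the int() conversion is unnecessary
    counts = Counter(int(entry['year']) for entry in db.entries)
    return sorted((datetime(year, 1, 1), n) for year, n in counts.items())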
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))
    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    # Load each of the specified files
    target_files = retrieval.get_data_files(args.target, ".bib")
    dbs = [BU.parse_bib_files(x, bib_proc.nop) for x in target_files]

    # Load the main database
    main_db = b.bibdatabase.BibDatabase()
    if exists(args.output):
        BU.parse_bib_files(args.output, bib_proc.nop, database=main_db)

    main_set = set(main_db.get_entry_dict().keys())
    total_entries = main_db.entries[:]
    missing_keys_main = set()

    # Get entries missing from the main database
    for db in dbs:
        db_dict = db.get_entry_dict()
        missing_keys = set(db_dict.keys()).difference(main_set)
        missing_keys_main.update(missing_keys)
        total_entries += [db_dict[x] for x in missing_keys]

    logging.info("{} missing entries".format(len(missing_keys_main)))
    main_db.entries = total_entries

    # Write out the combined database. total_entries already contains the
    # existing entries, so overwrite rather than append to avoid duplicates.
    logging.info("Writing merged bibtex")
    writer = BibTexWriter()
    writer.align_values = True
    with open(args.output, 'w') as f:
        f.write(writer.write(main_db))
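# bib_proc.nop is assumed to be an identity parse hook: load entries
# unchanged so their keys can be compared across databases. A sketch:
def nop(entry: dict) -> dict:
    return entry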
def main():
    logging.info("---------- Tag Graphing")
    args = parser.parse_args()
    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    bibs, htmls, orgs, bkmks = retrieval.collect_files(args.target)
    bib_db = BU.parse_bib_files(bibs, func=bib_proc.tags)

    main_graph = TR.TagGraph()
    main_graph.extract_bibtex(bib_db)
    main_graph.extract_org(orgs)
    main_graph.extract_bookmark(bkmks)

    main_graph.write(args.output)
    logging.info("Complete --------------------")
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))
    assert exists(args.target)
    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    # Load targets
    bib_files = retrieval.get_data_files(args.target, ".bib")
    db = BU.parse_bib_files(bib_files, func=bib_proc.author_extract)
    logging.info("Bibtex loaded")

    logging.info(f"Processing Entries: {len(db.entries)}")
    result = process_db(db)

    # Write the output
    with open(args.output, 'w') as f:
        f.write("\n".join(result))
    logging.info("Complete")
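# process_db is project-local; a hypothetical sketch consistent with the
# author_extract hook and the one-author-per-line output written above:
def process_db_sketch(db) -> list:
    authors = set()
    for entry in db.entries:
        # bibtex 'author' fields separate names with ' and '
        names = entry.get('author', '').split(' and ')
        authors.update(n.strip() for n in names if n.strip())
    return sorted(authors)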
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))
    assert exists(args.target)
    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    # Load targets
    bib_files = retrieval.get_data_files(args.target, ".bib")
    db = BU.parse_bib_files(bib_files, func=bib_proc.clean_full)
    logging.info("Bibtex loaded")

    logging.info(f"Processing Entries: {len(db.entries)}")
    result = process_db(db)
    logging.info("Processing complete")

    with open(join(args.output, "bibtex.years"), 'w') as f:
        f.write(str(result.all_years))
    with open(join(args.output, "bibtex.authors"), 'w') as f:
        f.write(str(result.author_counts))
    with open(join(args.output, "bibtex.no_file"), 'w') as f:
        f.write("\n".join(result.no_file))
    with open(join(args.output, "bibtex.missing_file"), 'w') as f:
        f.write("\n".join(result.missing_files))
    with open(join(args.output, "bibtex.duplicates"), 'w') as f:
        f.write("\n".join(result.duplicates))
    with open(join(args.output, "bibtex.untagged"), 'w') as f:
        f.write("\n".join(result.non_tagged))
    logging.info("Complete")
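# The shape of `result` is implied by the attribute accesses above; a sketch
# of a matching container (field types are assumptions):
from dataclasses import dataclass, field

@dataclass
class ProcessResult:
    all_years: dict = field(default_factory=dict)      # year -> entry count
    author_counts: dict = field(default_factory=dict)  # author -> entry count
    no_file: list = field(default_factory=list)        # keys lacking a file field
    missing_files: list = field(default_factory=list)  # file paths that don't exist
    duplicates: list = field(default_factory=list)     # repeated citation keys
    non_tagged: list = field(default_factory=list)     # keys lacking tags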
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))
    assert isdir(args.library)
    assert isdir(args.output)
    assert isdir(args.target)

    # Get targets
    all_bibs = retrieval.get_data_files(args.library, ".bib")
    main_db = BU.parse_bib_files(all_bibs)
    logging.info("Loaded Database: {} entries".format(len(main_db.entries)))

    all_file_mentions = []
    all_existing_files = retrieval.get_data_files(args.target, [".epub", ".pdf"], normalize=True)

    # Convert entries to unicode and collect every file-field mention
    for i, entry in enumerate(main_db.entries):
        if i % 10 == 0:
            logging.info("Processed {} entries".format(i))
        unicode_entry = b.customization.convert_to_unicode(entry)
        entry_keys = [x for x in unicode_entry.keys() if FILE_RE.search(x)]
        for k in entry_keys:
            all_file_mentions.append(normalize('NFD', unicode_entry[k]))

    logging.info("Found {} files mentioned in bibliography".format(len(all_file_mentions)))
    logging.info("Found {} files existing".format(len(all_existing_files)))

    # Normalise all paths in bibtex entries
    logging.info("Normalizing paths")
    norm_mentions = set()
    for x in all_file_mentions:
        path = PATH_NORM.sub("", x)
        if path in norm_mentions:
            logging.info("Duplicate file mention: {}".format(path))
        else:
            norm_mentions.add(path)

    # Remove duplicate mentions of existing files
    norm_existing = set()
    for x in all_existing_files:
        path = PATH_NORM.sub("", x)
        if path in norm_existing:
            logging.info("Duplicate file existence: {}".format(path))
        else:
            norm_existing.add(path)
    logging.info("Normalized paths")

    mentioned_non_existent = norm_mentions - norm_existing
    existing_not_mentioned = norm_existing - norm_mentions
    logging.info("Mentioned but not existing: {}".format(len(mentioned_non_existent)))
    logging.info("Existing but not mentioned: {}".format(len(existing_not_mentioned)))

    # Create output files
    with open(join(args.output, "bibtex.not_existing"), 'w') as f:
        f.write("\n".join(mentioned_non_existent))
    with open(join(args.output, "bibtex.not_mentioned"), 'w') as f:
        f.write("\n".join(existing_not_mentioned))
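# FILE_RE and PATH_NORM are module-level regexes defined elsewhere; the
# definitions below are assumptions consistent with how they are used above:
import re

FILE_RE = re.compile(r"^file\d*$")     # match 'file', 'file1', ... entry keys
PATH_NORM = re.compile(r"^.*?/pdfs/")  # strip an assumed common library prefix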