Example #1
def main():
    args = parser.parse_args()
    if args.output:
        error_out = expander(join(split(args.output)[0], ".bib_errors"))

    logging.info("---------- STARTING Bibtex Clean")
    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    bib_files = retrieval.get_data_files(args.target, ".bib")
    db = BU.parse_bib_files(bib_files, func=custom_clean)

    logging.info("Read %s entries", len(db.entries))
    #Get errors and write them out:
    error_tuples = ERRORS

    if error_tuples and args.output:
        formatted = "\n".join(
            ["{} : {}".format(x, y) for x, y in error_tuples]) + "\n"
        with open(error_out, 'a') as f:
            f.write(formatted)

    # Write out the actual bibtex
    if args.output:
        logging.info("Writing out Cleaned Bibliography")
        writer = JGBibTexWriter()
        out_str = writer.write(db)
        with open(args.output, 'w') as f:
            f.write(out_str)
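# Example #1 assumes several module-level names defined elsewhere in its
# source file; a minimal sketch of plausible definitions (the argparse
# flags and the ERRORS accumulator here are assumptions, not the originals):
import argparse
import logging
from os.path import abspath, expanduser, join, split

parser = argparse.ArgumentParser(description="Clean bibtex files")
parser.add_argument('--target', action="append", required=True)
parser.add_argument('--output', default=None)

# Expand '~' and make the path absolute in one call
expander = lambda p: abspath(expanduser(p))

# Assumed to be appended to by custom_clean as it visits entries
ERRORS = []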
Example #2
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))

    output_path = join(args.output, "{}_summary".format(args.tag))

    bibtex_files = retrieval.get_data_files(args.target, ".bib")
    db = b.bibdatabase.BibDatabase()
    BU.parse_bib_files(bibtex_files, func=bib_proc.tag_summary, database=db)

    entries_with_tag = [x for x in db.entries if args.tag in x['tags']]
    entries_by_year = sorted(entries_with_tag, key=lambda x: x['year'])
    pdfs_to_process = [x['file'] for x in entries_by_year]
    expanded_paths = [abspath(expanduser(x)) for x in pdfs_to_process]
    logging.info("Summarising {} pdfs".format(len(expanded_paths)))
    PU.summarise_pdfs(expanded_paths, output=output_path, bound=args.bound)
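# Entries missing a 'tags', 'year', or 'file' field would raise KeyError in
# the comprehensions above; a defensive version of the same filter-and-sort
# step, under the assumed entry shape:
def tagged_entries_by_year(entries, tag):
    # Keep only entries that carry the tag and have the fields we need
    tagged = [x for x in entries
              if tag in x.get('tags', []) and 'year' in x and 'file' in x]
    return sorted(tagged, key=lambda x: x['year'])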
Example #3
def main():
    # See https://docs.python.org/3/howto/argparse.html
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog="Bibtex Tag Indexer")
    parser.add_argument('--target', action="append", required=True)
    parser.add_argument('--output', default="~/github/writing/resources/cron_reports/tag_bibtex.index")

    args = parser.parse_args()
    # --target is required above, so this fallback is normally unreachable
    if not args.target:
        args.target = ["/Volumes/documents/github/writing/resources/bibliography"]

    bibs = RET.collect_files(args.target)[0]

    # Load bibs
    db = bib_parse.parse_bib_files(bibs, func=bib_proc.tags)

    # map to tags
    index = IndexFile()
    for entry in db.entries:
        # TODO gather all the `file[digit]` keys, not just 'file'
        if 'file' not in entry:
            continue
        for tag in entry['tags']:
            index.add_files(tag, entry['file'])


    # Write out index
    out_string = str(index)
    with open(abspath(expanduser(args.output)), 'w') as f:
        f.write(out_string)
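# IndexFile is not shown; a minimal sketch of the interface Example #3
# relies on (the real class in the source project may well do more):
from collections import defaultdict

class IndexFile:
    def __init__(self):
        self.mapping = defaultdict(set)

    def add_files(self, tag, *files):
        # Record each file under the given tag, ignoring duplicates
        self.mapping[tag].update(files)

    def __str__(self):
        return "\n".join("{} : {}".format(tag, " ".join(sorted(files)))
                         for tag, files in sorted(self.mapping.items()))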
Example #4
def main():
    args = parser.parse_args()
    args.library = [abspath(expanduser(x)) for x in args.library]
    args.output = abspath(expanduser(args.output))
    logging.info("---------- STARTING Bibtex Timelines")
    if not exists(args.output):
        logging.info("Making output: {}".format(args.output))
        mkdir(args.output)

    assert exists(args.output)

    all_bibs = retrieval.get_data_files(args.library, ".bib")

    logging.info("Found {} bib files".format(len(all_bibs)))

    db = b.bibdatabase.BibDatabase()
    BU.parse_bib_files(all_bibs, func=bib_proc.year_parse, database=db)

    logging.info("Loaded bibtex entries: {}".format(len(db.entries)))

    # Load totals_bib.tags
    # Filter for min_entries

    # Create a TimelineFile for each tag
    # Add citations to each tag TimelineFile
    # Write out timeline files

    tag_collection = defaultdict(list)
    for entry in db.entries:
        tags = entry['tags']
        for tag in tags:
            tag_collection[tag].append(entry)

    logging.info("Collected Tags: {}".format(len(tag_collection)))

    # Then sort by year and write out
    for tag, entries in tag_collection.items():
        out_target = join(args.output, "{}.tag_timeline".format(tag))
        sorted_entries = sorted(entries, key=lambda x: x['year'])

        if len(sorted_entries) > args.min_entries:
            with open(out_target, 'w') as f:
                f.write("\n".join([
                    "{} {}".format(x['year'].strftime("%Y"), x['ID'])
                    for x in sorted_entries
                ]))
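# The strftime call above implies bib_proc.year_parse converts each entry's
# 'year' field to a datetime; a plausible sketch of that conversion:
from datetime import datetime

def year_parse(entry):
    # Hypothetical: default a missing year to year 1 so sorting still works
    entry['year'] = datetime(int(entry.get('year', 1)), 1, 1)
    return entry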
Example #5
def main():
    logging.info("---------- STARTING Tag Totals")
    cli_args = parser.parse_args()
    cli_args.output = abspath(expanduser(cli_args.output))

    logging.info("Targeting: {}".format(cli_args.target))
    # Ensure the output location exists before deriving the file prefix
    if not exists(cli_args.output):
        mkdir(cli_args.output)
    if isdir(cli_args.output):
        cli_args.output = join(cli_args.output, "tags")
    logging.info("Output to: {}".format(cli_args.output))
    logging.info("Cleaned Tags locations: {}".format(cli_args.cleaned))

    bibs, htmls, orgs, bkmks = retrieval.collect_files(cli_args.target)
    bib_db = BU.parse_bib_files(bibs, func=bib_proc.tags)
    tag_graph = TagGraph()

    bib_tags = tag_graph.extract_bibtex(bib_db)
    org_tags = tag_graph.extract_org(orgs)
    bkmk_tags = tag_graph.extract_bookmark(bkmks)

    with open(cli_args.output + "_bib.tags", 'w') as f:
        f.write(str(bib_tags))

    with open(cli_args.output + "_org.tags", 'w') as f:
        f.write(str(org_tags))

    with open(cli_args.output + "_bkmk.tags", 'w') as f:
        f.write(str(bkmk_tags))

    with open(cli_args.output + "_total.tags", 'w') as f:
        f.write(str(tag_graph))

    logging.info("Completed Total Count --------------------")

    if not cli_args.cleaned:
        sys.exit()

    # load existing tag files
    cleaned = SubstitutionFile.builder(cli_args.cleaned)

    # get new tags
    tags: TagFile = tag_graph.tags
    new_tags: TagFile = cleaned.difference(tags)

    # Group them separately, alphabetically,
    # to be included in the separate tag files
    with open(cli_args.output + "_new.tags", 'w') as f:
        f.write(str(new_tags))

    logging.info("Completed Uncleaned Count --------------------")
Example #6
def main():
    args = parser.parse_args()
    args.library = [abspath(expanduser(x)) for x in args.library]
    args.output = abspath(expanduser(args.output))

    all_bibs = retrieval.get_data_files(args.library, ".bib")
    logging.info("Found {} bib files".format(len(all_bibs)))
    db = b.bibdatabase.BibDatabase()
    BU.parse_bib_files(all_bibs, func=bib_proc.year_parse, database=db)
    logging.info("Loaded bibtex entries: {}".format(len(db.entries)))

    # Graph tags over time
    year_counts: List[Tuple[datetime, int]] = []
    if args.tag:
        year_counts = get_tag_across_years(db, args.tag)
    else:
        year_counts = get_entries_across_years(db)

    # Drop years with five or fewer entries
    year_counts = [x for x in year_counts if x[1] > 5]
    # Chart the entry counts
    to_draw = [("Years", year_counts)]
    # n_rows = int(len(to_draw) / 2)
    # n_cols = int(len(to_draw) / 2)
    n_rows = 1
    n_cols = 1
    for count, paired in enumerate(to_draw):
        name, data = paired
        logging.info("Drawing {}".format(name))
        xs = [point[0] for point in data]
        ys = [point[1] for point in data]
        plt.subplot(n_rows, n_cols, count + 1)
        plt.scatter(xs, ys, alpha=0.3)
        plt.title(name)
        plt.gcf().autofmt_xdate()

    logging.info("Finished, saving")
    plt.savefig(args.output)
    plt.show()
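# get_entries_across_years is defined elsewhere; a plausible sketch,
# assuming datetime 'year' fields as in the other examples:
from collections import Counter
from datetime import datetime
from typing import List, Tuple

def get_entries_across_years(db) -> List[Tuple[datetime, int]]:
    # Count entries per year and return them in chronological order
    counts = Counter(entry['year'] for entry in db.entries)
    return sorted(counts.items())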
Example #7
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))

    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    # Load each of the specified files
    target_files = retrieval.get_data_files(args.target, ".bib")
    dbs = [BU.parse_bib_files(x, bib_proc.nop) for x in target_files]

    main_db = b.bibdatabase.BibDatabase()
    # Load the main database
    if exists(args.output):
        BU.parse_bib_files(args.output, bib_proc.nop, database=main_db)

    main_set = set(main_db.get_entry_dict().keys())
    total_entries = main_db.entries[:]
    missing_keys_main = set()

    # Get entries missing from the main database
    for db in dbs:
        db_dict = db.get_entry_dict()
        db_set = set(db_dict.keys())
        missing_keys = db_set.difference(main_set)
        missing_keys_main.update(missing_keys)
        total_entries += [db_dict[x] for x in missing_keys]

    logging.info("{} missing entries".format(len(total_entries)))
    main_db.entries = total_entries

    # Write out the combined database
    logging.info("Bibtex loaded")
    writer = BibTexWriter()
    writer.align_values = True
    # Overwrite: main_db already holds everything parsed from args.output
    with open(args.output, 'w') as f:
        f.write(writer.write(main_db))
Example #8
def main():
    logging.info("---------- Tag Graphing")
    args = parser.parse_args()

    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    bibs, htmls, orgs, bkmks = retrieval.collect_files(args.target)
    bib_db = BU.parse_bib_files(bibs, func=bib_proc.tags)
    main_graph = TR.TagGraph()

    main_graph.extract_bibtex(bib_db)
    main_graph.extract_org(orgs)
    main_graph.extract_bookmark(bkmks)

    main_graph.write(args.output)

    logging.info("Complete --------------------")
Example #9
def main():
    args = parser.parse_args()

    args.output = abspath(expanduser(args.output))
    assert exists(args.target)

    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    # Load targets
    bib_files = retrieval.get_data_files(args.target, ".bib")
    db = BU.parse_bib_files(bib_files, func=bib_proc.author_extract)
    logging.info("Bibtex loaded")

    logging.info(f"Processing Entries: {len(db.entries)}")
    result = process_db(db)

    # write the output
    with open(args.output, 'w') as f:
        f.write("\n".join(result))

    logging.info("Complete")
Example #10
def main():
    args = parser.parse_args()

    args.output = abspath(expanduser(args.output))
    assert exists(args.target)

    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    # Load targets
    bib_files = retrieval.get_data_files(args.target, ".bib")
    db = BU.parse_bib_files(bib_files, func=bib_proc.clean_full)
    logging.info("Bibtex loaded")

    logging.info(f"Processing Entries: {len(db.entries)}")
    result = process_db(db)
    logging.info("Processing complete")

    with open(join(args.output, "bibtex.years"), 'w') as f:
        f.write(str(result.all_years))

    with open(join(args.output, "bibtex.authors"), 'w') as f:
        f.write(str(result.author_counts))

    with open(join(args.output, "bibtex.no_file"), 'w') as f:
        f.write("\n".join(result.no_file))

    with open(join(args.output, "bibtex.missing_file"), 'w') as f:
        f.write("\n".join(result.missing_files))

    with open(join(args.output, "bibtex.duplicates"), 'w') as f:
        f.write("\n".join(result.duplicates))

    with open(join(args.output, "bibtex.untagged"), 'w') as f:
        f.write("\n".join(result.non_tagged))

    logging.info("Complete")
Example #11
def main():
    args = parser.parse_args()

    args.output = abspath(expanduser(args.output))
    assert isdir(args.library)
    assert isdir(args.output)
    assert isdir(args.target)

    # Get targets
    all_bibs = retrieval.get_data_files(args.library, ".bib")
    main_db = BU.parse_bib_files(all_bibs)

    logging.info("Loaded Database: {} entries".format(len(main_db.entries)))
    all_file_mentions  = []
    all_existing_files = retrieval.get_data_files(args.target, [".epub", ".pdf"], normalize=True)

    # Convert entries to unicode and collect every file field they mention
    for i, entry in enumerate(main_db.entries):
        if i % 10 == 0:
            logging.info("Processed {}/{} entries".format(i, len(main_db.entries)))
        unicode_entry = b.customization.convert_to_unicode(entry)

        entry_keys = [x for x in unicode_entry.keys() if FILE_RE.search(x)]
        for k in entry_keys:
            all_file_mentions.append(normalize('NFD', unicode_entry[k]))


    logging.info("Found {} files mentioned in bibliography".format(len(all_file_mentions)))
    logging.info("Found {} files existing".format(len(all_existing_files)))

    logging.info("Normalizing paths")
    norm_mentions = set()
    # Normalise all paths in bibtex entries, flagging duplicate mentions
    for x in all_file_mentions:
        path = PATH_NORM.sub("", x)
        if path in norm_mentions:
            logging.info("Duplicate file mention: {}".format(path))
        else:
            norm_mentions.add(path)

    norm_existing = set()
    # Normalise existing file paths, flagging duplicates
    for x in all_existing_files:
        path = PATH_NORM.sub("", x)
        if path in norm_existing:
            logging.info("Duplicate file existence: {}".format(path))
        else:
            norm_existing.add(path)

    logging.info("Normalized paths")

    mentioned_non_existent = norm_mentions - norm_existing
    existing_not_mentioned = norm_existing - norm_mentions

    logging.info("Mentioned but not existing: {}".format(len(mentioned_non_existent)))
    logging.info("Existing but not mentioned: {}".format(len(existing_not_mentioned)))

    # Create output files
    with open(join(args.output, "bibtex.not_existing"),'w') as f:
        f.write("\n".join(mentioned_non_existent))

    with open(join(args.output, "bibtex.not_mentioned"), 'w') as f:
        f.write("\n".join(existing_not_mentioned))