Example #1
def main():
    # Setup
    args = parser.parse_args()
    args.source = [abspath(expanduser(x)) for x in args.source]
    args.library = [abspath(expanduser(x)) for x in args.library]
    if args.exclude is None:
        args.exclude = []

    args.exclude = [abspath(expanduser(x)) for x in args.exclude]

    if not args.record:
        args.record = join(args.library[0], "update_record")

    logging.info(f"Update Record: {args.record}")
    assert (exists(args.record))
    if any([not exists(x) for x in args.source + args.library]):
        raise Exception('Source and Library locations need to exist')

    #load the newly parsed org names
    # { file_name : full_path }
    newly_parsed = sorted(retrieval.get_data_files(args.source, ext=".org"))

    logging.info("Newly parsed to transfer: {}".format(len(newly_parsed)))

    #get the existing org names, as a dict with its location
    library_orgs = retrieval.get_data_files(args.library, ext=".org")
    existing_orgs = {}
    for lib_org in library_orgs:
        if lib_org in args.exclude:
            continue

        existing_orgs[split(lib_org)[1]] = split(lib_org)[0]

    logging.info("Existing orgs: {}".format(len(existing_orgs)))

    totally_new = []
    #now update existing with the new

    for x in newly_parsed:
        if split(x)[1] not in existing_orgs:
            logging.info("Found a completely new user: {}".format(x))
            totally_new.append(x)
            continue

        integrate(x, existing_orgs)

    logging.info("Completely new to transfer: {}".format(len(totally_new)))

    # Now copy completely new files
    for x in sorted(totally_new):
        copy_new(x, args.library[0])

    update_record(args.record, args.source)
    system('say -v Moira -r 50 "Finished Integrating"')
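Example #1 reads a module-level parser that the snippet does not show. Below is a minimal sketch of the argument parser it appears to assume, inferred only from the attributes used above (--source, --library, --exclude, --record); the short flags, epilog text, and defaults are assumptions, not the original definition.

# Hypothetical parser for Example #1; flag letters and defaults are assumptions.
import argparse

parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog="\n".join(["Integrate newly parsed org files into an existing library"]))
parser.add_argument('-s', '--source', action="append", required=True)
parser.add_argument('-l', '--library', action="append", required=True)
parser.add_argument('-x', '--exclude', action="append")
parser.add_argument('-r', '--record', default=None)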
Example #2
    def build(json_dir):
        """ Create a graph of tweet replies and quotes """
        logging.info("Assembling threads graph from: {}".format(json_dir))
        json_files = get_data_files(json_dir, ext=".json")
        di_graph = nx.DiGraph()
        for jfile in json_files:
            # load in each json,
            with open(jfile, 'r') as f:
                data = json.load(f, strict=False)

            # construct connection graph
            for entry in data:
                # get tweet id, reply_id, quote_id
                tweet_id = entry['id_str']
                di_graph.add_node(tweet_id, source_file=jfile)

                if 'in_reply_to_status_id_str' in entry and entry[
                        'in_reply_to_status_id_str']:
                    # link tweets
                    di_graph.add_edge(tweet_id,
                                      str(entry['in_reply_to_status_id_str']),
                                      type="reply")

                if 'quoted_status_id_str' in entry and entry[
                        'quoted_status_id_str']:
                    di_graph.add_edge(tweet_id,
                                      str(entry['quoted_status_id_str']),
                                      type="quote")

        return TwitterGraph(di_graph)
Example #3
def main():
    logging.info("---------- STARTING Tag Indexer")
    args = parser.parse_args()
    args.target = [abspath(expanduser(x)) for x in args.target]
    args.output = abspath(expanduser(args.output))

    targets = get_data_files(args.target, ext=".org")

    index = IndexFile()

    for filename in targets:
        # read in
        lines = []
        with open(filename, 'r') as f:
            lines = f.readlines()

        # headlines with tags
        matched = [TAG_LINE.match(x) for x in lines]
        actual = [x for x in matched if bool(x)]
        tags = [y for x in actual for y in x[1].split(":") if bool(y)]
        # add to index
        for tag in tags:
            index.add_files(tag, [filename])

    # Write out index
    out_string = str(index)
    with open(args.output, 'w') as f:
        f.write(out_string)

    logging.info("Tag Indexing Finished")
Example #4
def main():
    args = parser.parse_args()
    if args.output:
        error_out = expander(join(split(args.output)[0], ".bib_errors"))

    logging.info("---------- STARTING Bibtex Clean")
    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    bib_files = retrieval.get_data_files(args.target, ".bib")
    db = BU.parse_bib_files(bib_files, func=custom_clean)

    logging.info("Read %s entries", len(db.entries))
    #Get errors and write them out:
    error_tuples = ERRORS

    if bool(error_tuples) and args.output:
        formatted = "\n".join(
            ["{} : {}".format(x, y) for x, y in error_tuples]) + "\n"
        with open(error_out, 'a') as f:
            f.write(formatted)

    # Write out the actual bibtex
    if args.output:
        logging.info("Writing out Cleaned Bibliography")
        writer = JGBibTexWriter()
        out_str = writer.write(db)
        with open(args.output, 'w') as f:
            f.write(out_str)
Example #5
def main():
    logging.info("---------- STARTING Twitter Indexer")
    args = parser.parse_args()
    args.target = [abspath(expanduser(x)) for x in args.target]
    args.output = abspath(expanduser(args.output))

    targets = get_data_files(args.target, ext=".org")

    index = IndexFile()

    for filename in targets:
        # read in
        lines = []
        with open(filename, 'r') as f:
            lines = f.readlines()

        # PERMALINK
        matched = [PERMALINK.match(x) for x in lines]
        users   = ["@"+x[1] for x in matched if bool(x)]
        # add to index
        for user in users:
            index.add_files(user, [filename])

    # Write out index
    out_string = str(index)
    with open(args.output, 'w') as f:
        f.write(out_string)

    logging.info("Twitter Indexing Finished")
Example #6
def main():
    logging.info("Grepping for Tags")
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))

    # Collect files to process
    lib = retrieval.get_data_files(args.library, ext=".org")
    # Get tag set
    tags = TagFile.builder(args.target)

    batch_count = int(len(lib) / args.file_batch)
    processed_tags = TagFile.builder(args.output, ext=".index").to_set()

    # fail out if a lock file exists
    if exists(args.output + ".lock"):
        logging.warning("Lock File Exists")
        sys.exit()

    open(args.output + ".lock", 'w').close()
    assert (exists(args.output + ".lock"))

    remaining_keys = list(set(tags.count.keys()).difference(processed_tags))

    logging.info(
        f"Total/Processed/Remaining: {len(tags)}/{len(processed_tags)}/{len(remaining_keys)}"
    )
    logging.debug(f"Processed: {processed_tags}")

    for i, tag in enumerate(remaining_keys[:args.tag_batch]):
        index_additions = IndexFile()
        ## batch filter files that mention the tag
        logging.info(f"-- Tag: {tag} {i}/{len(tags)}")
        batch_num = 0
        for start in range(0, len(lib), args.file_batch):
            logging.info(f"File Batch: {batch_num}/{batch_count}")
            result = run(
                ['grep', '-l', tag, *lib[start:start + args.file_batch]],
                capture_output=True)
            if result.returncode == 0 and bool(result.stdout):
                to_add: List = [
                    x.strip() for x in result.stdout.decode().split("\n")
                    if bool(x.strip())
                ]
                shortened = [
                    x[len(args.target[0]):] if args.target[0] in x else x
                    for x in to_add
                ]
                index_additions.add_files(tag, shortened)

            batch_num += 1

        # add new tag->file mappings to the index
        if bool(index_additions):
            logging.info(f"Writing to file: {len(index_additions)}")
            with open(args.output, 'a') as f:
                f.write("\n")
                f.write(str(index_additions))

    remove(args.output + ".lock")
    logging.info("Finished")
Example #7
    def builder(cls, target, ext=None) -> 'BaseFileFormat':
        """
        Build a tag file from a target directory or file
        """
        main = cls()
        ext = ext or main.ext
        for t in get_data_files(target, ext):
            main += cls.read(t)

        return main
Example #8
    def builder(target, sep=None) -> IndexFile:
        """
        Build an index file from a target directory or file
        """
        main = IndexFile()
        for t in get_data_files(target, main.ext):
            try:
                main += IndexFile.read(t, sep=sep)
            except Exception as err:
                logging.warning(f"IndexFile.builder failure for {t}: {err}")

        return main
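Examples #7 and #8 only define the builder constructors; other snippets in this listing show how they are called (Examples #6 and #16). A short usage sketch with placeholder paths:

# Usage sketch mirroring the call sites in Examples #6 and #16; the paths are placeholders.
tags = TagFile.builder("path/to/tag/files", ext=".index")    # classmethod builder (Example #7)
index = IndexFile.builder("path/to/index/files")             # module-level builder (Example #8)
known_tags = tags.to_set()                                   # as done in Example #6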
Example #9
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))

    # TODO, get information from bibtex on each entry, including specific pages
    if args.grouped:
        groups = listdir(args.target)
        for group in groups:
            pdfs_to_process = retrieval.get_data_files(
                join(args.target, group), [".pdf", ".epub"])
            logging.info("Summarising {}'s {} pdfs".format(
                group, len(pdfs_to_process)))
            PU.summarise_pdfs(pdfs_to_process,
                              output="{}_{}".format(args.output, group),
                              bound=int(args.bound))
    else:
        # Find all pdfs in subdir
        pdfs_to_process = retrieval.get_data_files(args.target, ".pdf")
        logging.info("Summarising {} pdfs".format(len(pdfs_to_process)))
        PU.summarise_pdfs(pdfs_to_process,
                          output=args.output,
                          bound=int(args.bound))
Example #10
def construct_org_files(combined_threads_dir, org_dir, all_users,
                        todo_tag_bindings: TweetTodoFile):
    logging.info("Constructing org files from: {} \n\tto: {}".format(
        combined_threads_dir, org_dir))
    # get all user summary jsons
    user_summaries = get_data_files(combined_threads_dir, ext=".json")

    for summary in user_summaries:
        org_obj = TwitterOrg(summary, org_dir, todo_tag_bindings, all_users)
        if not bool(org_obj):
            logging.warning(f"User Summary Empty: {summary}")
            continue

        org_obj.build_threads()
        org_obj.write()
        org_obj.download_media()
Example #11
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))

    output_path = join(args.output, "{}_summary".format(args.tag))

    bibtex_files = retrieval.get_data_files(args.target, ".bib")
    db = b.bibdatabase.BibDatabase()
    BU.parse_bib_files(bibtex_files, func=bib_proc.tag_summary, database=db)

    entries_with_tag = [x for x in db.entries if args.tag in x['tags']]
    entries_by_year = sorted(entries_with_tag, key=lambda x: x['year'])
    pdfs_to_process = [x['file'] for x in entries_by_year]
    expanded_paths = [abspath(expanduser(x)) for x in pdfs_to_process]
    logging.info("Summarising {} pdfs".format(len(expanded_paths)))
    PU.summarise_pdfs(expanded_paths, output=output_path, bound=args.bound)
Example #12
def main():
    args = parser.parse_args()
    args.library = [abspath(expanduser(x)) for x in args.library]
    args.output = abspath(expanduser(args.output))
    logging.info("---------- STARTING Bibtex Timelines")
    if not exists(args.output):
        logging.info("Making output: {}".format(args.output))
        mkdir(args.output)

    assert (exists(args.output))

    all_bibs = retrieval.get_data_files(args.library, ".bib")

    logging.info("Found {} bib files".format(len(all_bibs)))

    db = b.bibdatabase.BibDatabase()
    BU.parse_bib_files(all_bibs, func=bib_proc.year_parse, database=db)

    logging.info("Loaded bibtex entries: {}".format(len(db.entries)))

    # Load totals_bib.tags
    # Filter for min_entries

    # Create a TimelineFile for each tag
    # Add citations to each tag TimelineFile
    # Write out timeline files

    tag_collection = defaultdict(list)
    for entry in db.entries:
        tags = entry['tags']
        for tag in tags:
            tag_collection[tag].append(entry)

    logging.info("Collected Tags: {}".format(len(tag_collection)))

    # Then sort by year and write out
    for tag, entries in tag_collection.items():
        out_target = join(args.output, "{}.tag_timeline".format(tag))
        sorted_entries = sorted(entries, key=lambda x: x['year'])

        if len(sorted_entries) > args.min_entries:
            with open(out_target, 'w') as f:
                f.write("\n".join([
                    "{} {}".format(x['year'].strftime("%Y"), x['ID'])
                    for x in sorted_entries
                ]))
Example #13
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))

    assert(isdir(args.output) or not exists(args.output))
    if not exists(args.output):
        mkdir(args.output)

    # load source
    lib_files = get_data_files(args.source, ext=".bookmarks")
    library = BookmarkCollection()
    for bkmk_f in lib_files:
        with open(bkmk_f, 'r') as f:
            library.add_file(f)

    domains = defaultdict(list)

    # Group urls into domains
    for bkmk in library:
        parsed = urlparse(bkmk.url)

        netloc = CLEAN.sub("", parsed.netloc)

        if "github" in netloc:
            domains["github"].append(bkmk)
        elif "itch.io" in netloc:
            domains["itchio"].append(bkmk)
        else:
            domains[netloc].append(bkmk)

    logging.info(f"Grouped into {len(domains)} domains")
    #save
    groups = "\n".join(domains.keys())
    with open(join(args.output, "netlocs.list"), 'w') as f:
        f.write(groups)

    for domain, bkmks in domains.items():
        flattened = domain.replace(".", "_")
        bkmks_s = "\n".join([str(x) for x in sorted(bkmks)])
        with open(join(args.output, f"{flattened}.bookmarks"), 'w') as f:
            f.write(bkmks_s)
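The CLEAN pattern applied to the netloc above is not included in the snippet; it presumably strips a leading "www." so that www.github.com and github.com group together. A hedged guess at its definition:

import re

# Assumption: CLEAN removes a leading "www." prefix before domain grouping.
CLEAN = re.compile(r"^www\.")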
Example #14
def main():
    args = parser.parse_args()
    args.target = [abspath(expanduser(x)) for x in args.target]

    logging.info("Targeting: {}".format(args.target))

    # Get Bibtex files
    all_bib_paths = retrieval.get_data_files(args.target, ".bib")
    all_dbs = []
    for t in all_bib_paths:
        # Use a new bib_parser for each so library isn't shared
        bib_parser = BibTexParser(common_strings=False)
        bib_parser.ignore_nonstandard_types = False
        bib_parser.homogenise_fields = True

        with open(t, 'r') as f:
            db = b.load(f, bib_parser)
            all_dbs.append(db)

    logging.info("DB Sizes: {}".format(", ".join(
        [str(len(x.entries)) for x in all_dbs])))

    # Sort the bibtex's by their size
    sorted_dbs = sorted([(len(x.entries), x) for x in all_dbs], reverse=True)

    # Use the largest as Primary
    head = sorted_dbs[0][1]
    rst = sorted_dbs[1:]
    head_set = {x['ID'] for x in head.entries}
    missing_keys = set([])

    # For remaining, get entries that are missing
    for _, db in rst:
        db_set = {x['ID'] for x in db.entries}
        if head_set.issuperset(db_set):
            continue

        missing_keys.update(db_set.difference(head_set))

    logging.info("{} Keys missing from master: {}".format(
        len(missing_keys), "\n".join(missing_keys)))
Example #15
def main():
    args = parser.parse_args()

    args.output = abspath(expanduser(args.output))
    assert (exists(args.target))

    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    # Load targets
    bib_files = retrieval.get_data_files(args.target, ".bib")
    db = BU.parse_bib_files(bib_files, func=bib_proc.author_extract)
    logging.info("Bibtex loaded")

    logging.info(f"Processing Entries: {len(db.entries)}")
    result = process_db(db)

    # write the output
    with open(args.output, 'w') as f:
        f.write("\n".join(result))

    logging.info("Complete")
Example #16
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="\n".join([""]))
    parser.add_argument('--aBool', action="store_true")
    parser.add_argument('--target', action="append", required=True)
    parser.add_argument('--tags', required=True)
    parser.add_argument('--out', required=True)

    args = parser.parse_args()
    args.out = abspath(expanduser(args.out))
    args.tags = abspath(expanduser(args.tags))

    found = set(get_data_files(args.target, ".org"))
    tag_index = IndexFile.builder(args.tags)

    for tag, mentioned in tag_index.items():
        found.difference_update(mentioned)

    # Write report
    with open(args.out, 'w') as f:
        f.write("\n".join(sorted(found)))
Example #17
def main():
    args = parser.parse_args()
    args.library = [abspath(expanduser(x)) for x in args.library]
    args.output = abspath(expanduser(args.output))

    all_bibs = retrieval.get_data_files(args.library, ".bib")
    logging.info("Found {} bib files".format(len(all_bibs)))
    db = b.bibdatabase.BibDatabase()
    BU.parse_bib_files(all_bibs, func=bib_proc.year_parse, database=db)
    logging.info("Loaded bibtex entries: {}".format(len(db.entries)))

    # Graph tags over time
    year_counts = []
    if args.tag:
        year_counts = get_tag_across_years(db, args.tag)
    else:
        year_counts: List[Tuple[datetime, int]] = get_entries_across_years(db)

    year_counts = [x for x in year_counts if x[1] > 5]
    # chart the tweets
    to_draw = [("Years", year_counts)]
    # n_rows = int(len(to_draw) / 2)
    # n_cols = int(len(to_draw) / 2)
    n_rows = 1
    n_cols = 1
    for count, paired in enumerate(to_draw):
        name, data = paired
        logging.info("Drawing {}".format(name))
        x = [x[0] for x in data]
        y = [x[1] for x in data]
        plt.subplot(n_rows, n_cols, count + 1)
        plt.scatter(x, y, alpha=0.3)
        plt.title(name)
        plt.gcf().autofmt_xdate()

    logging.info("Finished, saving")
    plt.savefig(args.output)
    plt.show()
Example #18
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))

    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    #load each of the specified files
    target_files = retrieval.get_data_files(args.target, ".bib")
    dbs = [BU.parse_bib_files(x, bib_proc.nop) for x in target_files]

    main_db = b.bibdatabase.BibDatabase()
    # Load the main database
    if exists(args.output):
        BU.parse_bib_files(args.output, bib_proc.nop, database=main_db)

    main_set = set(main_db.get_entry_dict().keys())
    total_entries = main_db.entries[:]
    missing_keys_main = set()

    # Get entries missing from the main database
    for db in dbs:
        db_dict = db.get_entry_dict()
        db_set = set(db_dict.keys())
        missing_keys = db_set.difference(main_set)
        missing_keys_main.update(missing_keys)
        total_entries += [db_dict[x] for x in missing_keys]

    logging.info("{} missing entries".format(len(total_entries)))
    main_db.entries = total_entries

    # Write out the combined database
    logging.info("Bibtex loaded")
    writer = BibTexWriter()
    writer.align_values = True
    with open(args.output, 'a') as f:
        f.write(writer.write(main_db))
Example #19
def main():
    args = parser.parse_args()

    args.output = abspath(expanduser(args.output))
    assert (exists(args.target))

    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    # Load targets
    bib_files = retrieval.get_data_files(args.target, ".bib")
    db = BU.parse_bib_files(bib_files, func=bib_proc.clean_full)
    logging.info("Bibtex loaded")

    logging.info(f"Processing Entries: {len(db.entries)}")
    result = process_db(db)
    logging.info("Processing complete")

    with open(join(args.output, "bibtex.years"), 'w') as f:
        f.write(str(result.all_years))

    with open(join(args.output, "bibtex.authors"), 'w') as f:
        f.write(str(result.author_counts))

    with open(join(args.output, "bibtex.no_file"), 'w') as f:
        f.write("\n".join(result.no_file))

    with open(join(args.output, "bibtex.missing_file"), 'w') as f:
        f.write("\n".join(result.missing_files))

    with open(join(args.output, "bibtex.duplicates"), 'w') as f:
        f.write("\n".join(result.duplicates))

    with open(join(args.output, "bibtex.untagged"), 'w') as f:
        f.write("\n".join(result.non_tagged))

    logging.info("Complete")
Example #20
console = root_logger.StreamHandler()
console.setLevel(root_logger.INFO)
root_logger.getLogger('').addHandler(console)
logging = root_logger.getLogger(__name__)

FILE_TYPES = [".gif", ".jpg", ".jpeg", ".png", ".mp4", ".bmp"]

#see https://docs.python.org/3/howto/argparse.html
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog="\n".join(["Find and Hash images, revealing duplicates"]))
parser.add_argument('-t', '--target', action='append', required=True)
parser.add_argument('-o', '--output', required=True)

if __name__ == "__main__":
    logging.info("Starting Photo Description")
    args = parser.parse_args()
    logging.info("Finding images")
    images = retrieval.get_data_files(args.target, FILE_TYPES)
    logging.info("Hashing {} images".format(len(images)))
    hash_dict, conflicts = hash_check.hash_all(images)
    logging.info("Hashed all images, {} conflicts".format(len(conflicts)))

    #write conflicts to an org file:
    with open(expanduser(args.output), 'w') as f:
        f.write("* Conflicts\n")
        for x in conflicts:
            f.write("** {}\n".format(x))
            f.write("\n".join(["   [[{}]]".format(y) for y in hash_dict[x]]))
Example #21
parser.add_argument('-l', '--library', required=True)
parser.add_argument('-o', '--output', required=True)
parser.add_argument('-d', '--domain', action="store_true")


if __name__ == "__main__":
    args         = parser.parse_args()
    args.library = abspath(expanduser(args.library))
    args.output  = abspath(expanduser(args.output))

    assert(exists(args.library))

    # Load the library
    logging.info("Loading Library")
    library   = BookmarkCollection()
    lib_files = retrieval.get_data_files(args.library, ".html")
    for lib_f in lib_files:
        with open(lib_f, 'r') as f:
            library.add_file(f)

    domains = TagFile()

    # Process library for tags and domains
    logging.info("Processing Library")
    for bkmk in library:
        # Count websites
        parsed = bkmk.url_comps
        domains.inc(parsed.netloc)

    logging.info("Domain Counts")
    with open('{}.domain_counts'.format(args.output), 'w') as f:
        # assumed: TagFile serialises via str(), as IndexFile does in the other examples
        f.write(str(domains))
Example #22
#see https://docs.python.org/3/howto/argparse.html
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog="\n".join(["Find and Hash images, revealing duplicates"]))
parser.add_argument('-l', '--library', action="append", required=True)
parser.add_argument('-t', '--target', action="append", required=True)
parser.add_argument('-c', '--copy', action="store_true")
parser.add_argument('-o', '--output', required=True)

if __name__ == "__main__":
    logging.info("Starting Photo Description")

    args = parser.parse_args()

    logging.info("Finding library images")
    library_images = retrieval.get_data_files(args.library,
                                              retrieval.img_and_video)
    logging.info("Finding target images")
    target_images = retrieval.get_data_files(args.target,
                                             retrieval.img_and_video)
    logging.info("Finding missing images")
    missing = hash_check.find_missing(library_images, target_images)
    logging.info("Found {} missing images".format(len(missing)))

    #write conflicts to an org file:
    if not args.copy:
        count = 0
        grouping = max(1, int(len(missing) / 100))
        with open(expanduser(args.output), 'w') as f:
            f.write("* Missing\n")
            for i, x in enumerate(missing):
                if (i % grouping) == 0:
                    # assumed completion: start a new org heading for each batch of missing files
                    count += 1
                    f.write("** Group {}\n".format(count))
                f.write("   [[{}]]\n".format(x))
Example #23
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog="\n".join([""]))
parser.add_argument('-s', '--source', action="append", required=True)
parser.add_argument('-q', '--query', default=None)
parser.add_argument('-o', '--output', required=True)


query_re = re.compile(r'\*+\s+\(\d+\) (.+)$')

if __name__ == "__main__":
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))
    if args.query is not None:
        args.query = abspath(expanduser(args.query))

    #load any sources
    source_files = retrieval.get_data_files(args.source, ".bookmarks")
    total = BookmarkCollection()
    for bkmk_f in source_files:
        with open(bkmk_f, 'r') as f:
            total.add_file(f)

    #filter any queries
    if args.query:
        with open(args.query, 'r') as f:
            lines = f.readlines()
        matches = [query_re.findall(x) for x in lines]
        queries = set([x[0] for x in matches if x])
        # TODO total.filter_queries(queries)

    #export
    with open(args.output, "w") as f:
Example #24
def main():
    args = parser.parse_args()

    args.output = abspath(expanduser(args.output))
    assert(isdir(args.library))
    assert(isdir(args.output))
    assert(isdir(args.target))

    # Get targets
    all_bibs = retrieval.get_data_files(args.library, ".bib")
    main_db = BU.parse_bib_files(all_bibs)

    logging.info("Loaded Database: {} entries".format(len(main_db.entries)))
    count              = 0
    all_file_mentions  = []
    all_existing_files = retrieval.get_data_files(args.target, [".epub", ".pdf"], normalize=True)

    # Convert entries to unicode
    for i, entry in enumerate(main_db.entries):
        if i % 10 == 0:
            logging.info("{}/10 Complete".format(count))
            count += 1
        unicode_entry = b.customization.convert_to_unicode(entry)

        entry_keys = [x for x in unicode_entry.keys() if FILE_RE.search(x)]
        for k in entry_keys:
            all_file_mentions.append(normalize('NFD', unicode_entry[k]))


    logging.info("Found {} files mentioned in bibliography".format(len(all_file_mentions)))
    logging.info("Found {} files existing".format(len(all_existing_files)))

    logging.info("Normalizing paths")
    norm_mentions = set([])
    # Normalise all paths in bibtex entries
    for x in all_file_mentions:
        path = PATH_NORM.sub("", x)
        if path in norm_mentions:
            logging.info("Duplicate file mention: {}".format(path))
        else:
            norm_mentions.add(path)

    norm_existing = set([])
    # Remove duplicates mentions
    for x in all_existing_files:
        path = PATH_NORM.sub("", x)
        if path in norm_existing:
            logging.info("Duplicate file existence: {}".format(path))
        else:
            norm_existing.add(path)

    logging.info("Normalized paths")

    mentioned_non_existent = norm_mentions - norm_existing
    existing_not_mentioned = norm_existing - norm_mentions

    logging.info("Mentioned but not existing: {}".format(len(mentioned_non_existent)))
    logging.info("Existing but not mentioned: {}".format(len(existing_not_mentioned)))

    # Create output files
    with open(join(args.output, "bibtex.not_existing"),'w') as f:
        f.write("\n".join(mentioned_non_existent))

    with open(join(args.output, "bibtex.not_mentioned"), 'w') as f:
        f.write("\n".join(existing_not_mentioned))
Example #25
def convert_to_day_counts(tweets):
    days = {x : 0 for x in range(31)}
    for tweet in tweets:
        day = tweet[0].day - 1
        days[day] += 1

    return sorted(days.items(), key=lambda x: x[0])



if __name__ == "__main__":
    args = parser.parse_args()
    args.library = [abspath(expanduser(x)) for x in args.library]
    args.target = abspath(expanduser(args.target))

    all_orgs = retrieval.get_data_files(args.library, ".org")

    logging.info("Found {} org files".format(len(all_orgs)))

    # Process tweets
    all_tweets : List[Tuple[datetime, str]] = get_tweet_dates_and_ids(all_orgs)
    logging.info("Found {} tweets".format(len(all_tweets)))
    # remove duplicates and convert date strings
    tweet_dict = {x[0] : convert_tweet_date(x[1]) for x in all_tweets}

    logging.info("Sorting {} tweets".format(len(tweet_dict)))
    ordered = sorted([(x[1], x[0]) for x in tweet_dict.items()], key=lambda x: x[1])

    id_convertor = {x : i for i,x in enumerate(tweet_dict.keys())}

    # Convert to 24 hour time only
Example #26
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog="\n".join([
        "For a bookmark file", "Create an org file of paired links",
        "which compare the original link",
        "with the link minus an html parameter"
    ]))
parser.add_argument('-l', '--library', required=True)
parser.add_argument('-o', '--output', required=True)

if __name__ == "__main__":
    args = parser.parse_args()
    args.library = abspath(expanduser(args.library))
    args.output = abspath(expanduser(args.output))

    assert (exists(args.library))

    # Load the library
    logging.info("Loading Library")
    lib_files = retrieval.get_data_files(args.library, ".bookmarks")
    library = BookmarkCollection()
    for bkmk_f in lib_files:
        with open(bkmk_f, 'r') as f:
            library.add_file(f)

    logging.info("Processing Library")
    # TODO Generate org file
    # org_str = the_trie.org_format_queries()
    # with open("{}.org".format(args.output), 'w') as f:
    # f.write(org_str)
Example #27
import argparse
# Setup root_logger:
import logging as root_logger
from os import listdir
from os.path import (abspath, exists, expanduser, isdir, isfile, join, split,
                     splitext)
from subprocess import call

from bkmkorg.utils.pdf import pdf as PU
from bkmkorg.utils.dfs import files as retrieval

LOGLEVEL = root_logger.DEBUG
LOG_FILE_NAME = "log.{}".format(splitext(split(__file__)[1])[0])
root_logger.basicConfig(filename=LOG_FILE_NAME, level=LOGLEVEL, filemode='w')

console = root_logger.StreamHandler()
console.setLevel(root_logger.INFO)
root_logger.getLogger('').addHandler(console)
logging = root_logger.getLogger(__name__)
##############################
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog="\n".join([""]))
parser.add_argument('-l', '--library', required=True)
##############################

if __name__ == "__main__":
    args = parser.parse_args()
    files = retrieval.get_data_files(args.library, ".pdf")
    PU.convert_pdfs_to_text(files)
Example #28
LOG_FILE_NAME = "log.{}".format(splitext(split(__file__)[1])[0])
root_logger.basicConfig(filename=LOG_FILE_NAME, level=LOGLEVEL, filemode='w')

console = root_logger.StreamHandler()
console.setLevel(root_logger.INFO)
root_logger.getLogger('').addHandler(console)
logging = root_logger.getLogger(__name__)
##############################
#see https://docs.python.org/3/howto/argparse.html
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog="\n".join([""]))
parser.add_argument('--target', action="append", required=True)
parser.add_argument('--output', required=True)

if __name__ == "__main__":
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))

    # Find all pdfs in subdir
    pdfs_to_process = retrieval.get_data_files(args.target, ".pdf")
    logging.info("Merging {} pdfs".format(len(pdfs_to_process)))
    PU.merge_pdfs(pdfs_to_process, output=args.output)

    # writer.trailer.Info = IndirectPdfDict(
    #     Title='your title goes here',
    #     Author='your name goes here',
    #     Subject='what is it all about?',
    #     Creator='some script goes here',
    # )
Example #29
parser.add_argument('-l',
                    '--library',
                    default="~/github/writing/other_files/main_bookmarks.html")
parser.add_argument('-s', '--source', action='append', required=True)
parser.add_argument('-o',
                    '--output',
                    default="~/Desktop/missing_bookmarks.html")

if __name__ == "__main__":
    args = parser.parse_args()
    args.library = abspath(expanduser(args.library))
    args.source = [abspath(expanduser(x)) for x in args.source]
    args.output = abspath(expanduser(args.output))
    logging.info("Finding Links missing from: {}".format(args.library))

    # Get sources
    sources = retrieval.get_data_files(args.source, [".bookmarks"])
    logging.info("Using Source: {}".format(sources))

    #Load Library
    library_files = retrieval.get_data_files(args.library, ".bookmarks")
    library: BookmarkCollection = BookmarkCollection()
    for bkmk_f in library_files:
        with open(bkmk_f, 'r') as f:
            library.add_file(f)

    to_check = BookmarkCollection()
    #Load each specified source file
    for x in sources:
        with open(x, 'r') as f:
            to_check.add_file(f)
Example #30
def construct_user_summaries(component_dir, combined_threads_dir, total_users):
    """ collate threads together by originating user """
    logging.info("Constructing summaries\n\tfrom: {} \n\tto: {}".format(
        component_dir, combined_threads_dir))
    user_lookup = total_users
    # Create final orgs, grouped by head user
    components = get_data_files(component_dir, ext=".json")
    for comp in components:
        logging.info("Constructing Summary for: {}".format(comp))
        # read comp
        with open(comp, 'r') as f:
            data = json.load(f, strict=False)

        if not bool(data):
            logging.warning("No Data found in {comp}")
            continue

        # Get leaves
        tweets = {x['id_str']: x for x in data}
        user_counts = defaultdict(lambda: 0)
        for x in data:
            user_counts[x['user']['id_str']] += 1

        head_user = max(user_counts.items(), key=lambda x: x[1])[0]
        screen_name = str(head_user)
        if head_user in user_lookup:
            screen_name = user_lookup[head_user]['screen_name']

        logging.debug("Constructing graph")
        graph = nx.DiGraph()
        quotes = set()
        roots = set()
        for tweet in data:
            if tweet['in_reply_to_status_id_str'] is not None:
                graph.add_edge(tweet['in_reply_to_status_id_str'],
                               tweet['id_str'])
            else:
                graph.add_node(tweet['id_str'])
                roots.add(tweet['id_str'])

            if 'quoted_status_id_str' in tweet and tweet[
                    'quoted_status_id_str'] is not None:
                quotes.add(tweet['quoted_status_id_str'])

        # dfs to get longest chain
        chains = []

        if bool(roots):
            chains = dfs_chains(graph, roots)

        if not bool(chains):
            chains = [list(roots.union(quotes))]

        # Assign main thread as the longest chain
        main_thread = max(chains, key=lambda x: len(x))
        main_set = set(main_thread)
        main_index = chains.index(main_thread)

        # assign secondary conversations
        rest = chains[:main_index] + chains[main_index + 1:]

        rest = [x for x in rest if bool(x)]
        # Remove duplications
        cleaned_rest = []
        for thread in rest:
            cleaned = [x for x in thread if x not in main_set]
            cleaned_rest.append(cleaned)
            main_set.update(cleaned)

        # create accessor to summary file
        summary = TwitterUserSummary(screen_name, combined_threads_dir)

        if summary.user is None:
            if head_user in user_lookup:
                summary.set_user(user_lookup[head_user])
            else:
                summary.set_user(screen_name)

        summary.add_thread(main_thread, cleaned_rest, quotes, tweets)

        # write out user file
        summary.write()
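Every example in this listing relies on get_data_files (often imported as retrieval.get_data_files from bkmkorg.utils.dfs.files) to collect files by extension, but the helper itself never appears. The following is a minimal sketch of what such a helper might look like, based only on how the examples above call it; the signature and the normalize behaviour are assumptions, not the original implementation.

# Sketch of a get_data_files-style helper, inferred from the call sites above.
from os import walk
from os.path import abspath, expanduser, isfile, join, splitext
from unicodedata import normalize as unicode_normalize

def get_data_files(initial, ext=None, normalize=False):
    # accept a single path or a list of paths
    roots = initial if isinstance(initial, list) else [initial]
    # accept a single extension or a list of extensions
    exts = ext if isinstance(ext, (list, tuple)) else ([ext] if ext else [])
    found = []
    for root in roots:
        root = abspath(expanduser(root))
        if isfile(root):
            found.append(root)
            continue
        for current, _dirs, files in walk(root):
            for name in files:
                if exts and splitext(name)[1] not in exts:
                    continue
                found.append(join(current, name))
    if normalize:
        # Example #24 compares NFD-normalised paths, so normalisation is assumed to happen here
        found = [unicode_normalize('NFD', x) for x in found]
    return found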