def main():
    # Setup
    args = parser.parse_args()
    args.source  = [abspath(expanduser(x)) for x in args.source]
    args.library = [abspath(expanduser(x)) for x in args.library]
    if args.exclude is None:
        args.exclude = []
    args.exclude = [abspath(expanduser(x)) for x in args.exclude]

    if not args.record:
        args.record = join(args.library[0], "update_record")

    logging.info(f"Update Record: {args.record}")
    assert exists(args.record)

    if any(not exists(x) for x in args.source + args.library):
        raise Exception('Source and Library paths need to exist')

    # Load the newly parsed org file paths
    newly_parsed = sorted(retrieval.get_data_files(args.source, ext=".org"))
    logging.info("Newly parsed to transfer: {}".format(len(newly_parsed)))

    # Get the existing org names, as { file_name : containing_directory }
    library_orgs = retrieval.get_data_files(args.library, ext=".org")
    existing_orgs = {}
    for lib_org in library_orgs:
        if lib_org in args.exclude:
            continue
        existing_orgs[split(lib_org)[1]] = split(lib_org)[0]
    logging.info("Existing orgs: {}".format(len(existing_orgs)))

    totally_new = []
    # Update the existing files with the new data
    for x in newly_parsed:
        if split(x)[1] not in existing_orgs:
            logging.info("Found a completely new user: {}".format(x))
            totally_new.append(x)
            continue
        integrate(x, existing_orgs)

    logging.info("Completely new to transfer: {}".format(len(totally_new)))
    # Copy the completely new files into the library
    for x in sorted(totally_new):
        copy_new(x, args.library[0])

    update_record(args.record, args.source)
    system('say -v Moira -r 50 "Finished Integrating"')
def build(json_dir):
    """ Create a graph of tweet replies and quotes """
    logging.info("Assembling threads graph from: {}".format(json_dir))
    json_files = get_data_files(json_dir, ext=".json")
    di_graph = nx.DiGraph()
    for jfile in json_files:
        # Load in each json
        with open(jfile, 'r') as f:
            data = json.load(f, strict=False)

        # Construct the connection graph
        for entry in data:
            # Get the tweet id, reply id, and quote id
            tweet_id = entry['id_str']
            di_graph.add_node(tweet_id, source_file=jfile)

            if 'in_reply_to_status_id_str' in entry and entry['in_reply_to_status_id_str']:
                # Link the tweet to the tweet it replies to
                di_graph.add_edge(tweet_id,
                                  str(entry['in_reply_to_status_id_str']),
                                  type="reply")

            if 'quoted_status_id_str' in entry and entry['quoted_status_id_str']:
                di_graph.add_edge(tweet_id,
                                  str(entry['quoted_status_id_str']),
                                  type="quote")

    return TwitterGraph(di_graph)
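# A minimal usage sketch of build() (hypothetical directory; assumes
# TwitterGraph simply wraps the DiGraph assembled above):
#   graph = build("~/twitter/thread_jsons")
# Edges point from a tweet to the tweet it replies to or quotes, so
# following successors walks a conversation back toward its source tweet.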
def main(): logging.info("---------- STARTING Tag Indexer") args = parser.parse_args() args.target = [abspath(expanduser(x)) for x in args.target] args.output = abspath(expanduser(args.output)) targets = get_data_files(args.target, ext=".org") index = IndexFile() for filename in targets: # read in lines = [] with open(filename, 'r') as f: lines = f.readlines() # headlines with tags matched = [TAG_LINE.match(x) for x in lines] actual = [x for x in matched if bool(x)] tags = [y for x in actual for y in x[1].split(":") if bool(x)] # add to index for tag in tags: index.add_files(tag, [filename]) # Write out index out_string = str(index) with open(args.output, 'w') as f: f.write(out_string) logging.info("Tag Indexing Finished")
def main():
    args = parser.parse_args()
    if args.output:
        error_out = expander(join(split(args.output)[0], ".bib_errors"))

    logging.info("---------- STARTING Bibtex Clean")
    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    bib_files = retrieval.get_data_files(args.target, ".bib")
    db = BU.parse_bib_files(bib_files, func=custom_clean)
    logging.info("Read %s entries", len(db.entries))

    # Get errors and write them out
    error_tuples = ERRORS
    if bool(error_tuples) and args.output:
        formatted = "\n".join(["{} : {}".format(x, y) for x, y in error_tuples]) + "\n"
        with open(error_out, 'a') as f:
            f.write(formatted)

    # Write out the cleaned bibtex
    if args.output:
        logging.info("Writing out Cleaned Bibliography")
        writer = JGBibTexWriter()
        out_str = writer.write(db)
        with open(args.output, 'w') as f:
            f.write(out_str)
def main(): logging.info("---------- STARTING Twitter Indexer") args = parser.parse_args() args.target = [abspath(expanduser(x)) for x in args.target] args.output = abspath(expanduser(args.output)) targets = get_data_files(args.target, ext=".org") index = IndexFile() for filename in targets: # read in lines = [] with open(filename, 'r') as f: lines = f.readlines() # PERMALINK matched = [PERMALINK.match(x) for x in lines] users = ["@"+x[1] for x in matched if bool(x)] # add to index for user in users: index.add_files(user, [filename]) # Write out index out_string = str(index) with open(args.output, 'w') as f: f.write(out_string) logging.info("Twitter Indexing Finished")
def main(): logging.info("Grepping for Tags") args = parser.parse_args() args.output = abspath(expanduser(args.output)) # Collect files to process lib = retrieval.get_data_files(args.library, ext=".org") # Get tag set tags = TagFile.builder(args.target) batch_count = int(len(lib) / args.file_batch) processed_tags = TagFile.builder(args.output, ext=".index").to_set() # fail out if a lock file exists if exists(args.output + ".lock"): logging.warning("Lock File Exists") sys.exit() open(args.output + ".lock", 'w').close() assert (exists(args.output + ".lock")) remaining_keys = list(set(tags.count.keys()).difference(processed_tags)) logging.info( f"Total/Processed/Remaining: {len(tags)}/{len(processed_tags)}/{len(remaining_keys)}" ) logging.debug(f"Processed: {processed_tags}") for i, tag in enumerate(remaining_keys[:args.tag_batch]): index_additions = IndexFile() ## batch filter files that mention the tag logging.info(f"-- Tag: {tag} {i}/{len(tags)}") batch_num = 0 for start in range(0, len(lib), args.file_batch): logging.info(f"File Batch: {batch_num}/{batch_count}") result = run( ['grep', '-l', tag, *lib[start:start + args.file_batch]], capture_output=True) if result.returncode == 0 and bool(result.stdout): to_add: List = [ x.strip() for x in result.stdout.decode().split("\n") ] shortened = [ x[len(args.target[0]):] if args.target[0] in x else x for x in to_add ] index_additions.add_files(tag, shortened) batch_num += 1 # add new tag->file mappings to the index if bool(index_additions): logging.info(f"Writing to file: {len(index_additions)}") with open(args.output, 'a') as f: f.write("\n") f.write(str(index_additions)) remove(args.output + ".lock") logging.info("Finished")
def builder(cls, target, ext=None) -> 'BaseFileFormat':
    """ Build a tag file from a target directory or file """
    main = cls()
    ext = ext or main.ext
    for t in get_data_files(target, ext):
        main += cls.read(t)

    return main
def builder(target, sep=None) -> IndexFile:
    """ Build an index file from a target directory or file """
    main = IndexFile()
    for t in get_data_files(target, main.ext):
        try:
            main += IndexFile.read(t, sep=sep)
        except Exception as err:
            logging.warning(f"IndexFile.builder failure for {t}: {err}")

    return main
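# A minimal usage sketch of the two builders (hypothetical paths):
#   tags  = TagFile.builder("~/library/tags")    # default extension from the class
#   index = IndexFile.builder("~/library/index")
# Both accumulate with `+=`; IndexFile.builder logs and skips a malformed
# file without discarding what has already been read.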
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))

    # TODO get information from bibtex on each entry, including specific pages
    if args.grouped:
        groups = listdir(args.target)
        for group in groups:
            pdfs_to_process = retrieval.get_data_files(join(args.target, group),
                                                       [".pdf", ".epub"])
            logging.info("Summarising {}'s {} pdfs".format(group, len(pdfs_to_process)))
            PU.summarise_pdfs(pdfs_to_process,
                              output="{}_{}".format(args.output, group),
                              bound=int(args.bound))
    else:
        # Find all pdfs in the subdirectory
        pdfs_to_process = retrieval.get_data_files(args.target, ".pdf")
        logging.info("Summarising {} pdfs".format(len(pdfs_to_process)))
        PU.summarise_pdfs(pdfs_to_process, output=args.output, bound=args.bound)
def construct_org_files(combined_threads_dir, org_dir, all_users,
                        todo_tag_bindings: TweetTodoFile):
    logging.info("Constructing org files from: {} \n\tto: {}".format(
        combined_threads_dir, org_dir))
    # Get all user summary jsons
    user_summaries = get_data_files(combined_threads_dir, ext=".json")
    for summary in user_summaries:
        org_obj = TwitterOrg(summary, org_dir, todo_tag_bindings, all_users)
        if not bool(org_obj):
            logging.warning(f"User Summary Empty: {summary}")
            continue

        org_obj.build_threads()
        org_obj.write()
        org_obj.download_media()
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))
    output_path = join(args.output, "{}_summary".format(args.tag))

    bibtex_files = retrieval.get_data_files(args.target, ".bib")
    db = b.bibdatabase.BibDatabase()
    BU.parse_bib_files(bibtex_files, func=bib_proc.tag_summary, database=db)

    # Collect the tagged entries, sorted by year
    entries_with_tag = [x for x in db.entries if args.tag in x['tags']]
    entries_by_year = sorted(entries_with_tag, key=lambda x: x['year'])
    pdfs_to_process = [x['file'] for x in entries_by_year]
    expanded_paths = [abspath(expanduser(x)) for x in pdfs_to_process]

    logging.info("Summarising {} pdfs".format(len(expanded_paths)))
    PU.summarise_pdfs(expanded_paths, output=output_path, bound=args.bound)
def main():
    args = parser.parse_args()
    args.library = [abspath(expanduser(x)) for x in args.library]
    args.output = abspath(expanduser(args.output))

    logging.info("---------- STARTING Bibtex Timelines")
    if not exists(args.output):
        logging.info("Making output: {}".format(args.output))
        mkdir(args.output)
    assert exists(args.output)

    all_bibs = retrieval.get_data_files(args.library, ".bib")
    logging.info("Found {} bib files".format(len(all_bibs)))

    db = b.bibdatabase.BibDatabase()
    BU.parse_bib_files(all_bibs, func=bib_proc.year_parse, database=db)
    logging.info("Loaded bibtex entries: {}".format(len(db.entries)))

    # Load the tags, group the entries under them, then filter by min_entries
    # and write a timeline file per tag
    tag_collection = defaultdict(list)
    for entry in db.entries:
        tags = entry['tags']
        for tag in tags:
            tag_collection[tag].append(entry)

    logging.info("Collected Tags: {}".format(len(tag_collection)))

    # Sort each tag's entries by year and write out
    for tag, entries in tag_collection.items():
        out_target = join(args.output, "{}.tag_timeline".format(tag))
        sorted_entries = sorted(entries, key=lambda x: x['year'])
        if len(sorted_entries) > args.min_entries:
            with open(out_target, 'w') as f:
                f.write("\n".join(["{} {}".format(x['year'].strftime("%Y"), x['ID'])
                                   for x in sorted_entries]))
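# Each resulting <tag>.tag_timeline file holds one "<year> <citekey>" pair per
# line, ascending by year, e.g. (hypothetical entry):
#   1995 smith_1995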
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))
    assert isdir(args.output) or not exists(args.output)
    if not exists(args.output):
        mkdir(args.output)

    # Load the source bookmark files
    lib_files = get_data_files(args.source, ext=".bookmarks")
    library = BookmarkCollection()
    for bkmk_f in lib_files:
        with open(bkmk_f, 'r') as f:
            library.add_file(f)

    # Group the urls by domain
    domains = defaultdict(list)
    for bkmk in library:
        parsed = urlparse(bkmk.url)
        netloc = CLEAN.sub("", parsed.netloc)
        if "github" in netloc:
            domains["github"].append(bkmk)
        elif "itch.io" in netloc:
            domains["itchio"].append(bkmk)
        else:
            domains[netloc].append(bkmk)

    logging.info(f"Grouped into {len(domains)} domains")

    # Save the domain list, then one bookmark file per domain
    groups = "\n".join(domains.keys())
    with open(join(args.output, "netlocs.list"), 'w') as f:
        f.write(groups)

    for domain, bkmks in domains.items():
        flattened = domain.replace(".", "_")
        bkmks_s = "\n".join([str(x) for x in sorted(bkmks)])
        with open(join(args.output, f"{flattened}.bookmarks"), 'w') as f:
            f.write(bkmks_s)
def main():
    args = parser.parse_args()
    args.target = [abspath(expanduser(x)) for x in args.target]
    logging.info("Targeting: {}".format(args.target))

    # Get the bibtex files
    all_bib_paths = retrieval.get_data_files(args.target, ".bib")
    all_dbs = []
    for t in all_bib_paths:
        # Use a new bib_parser for each file so the library isn't shared
        bib_parser = BibTexParser(common_strings=False)
        bib_parser.ignore_nonstandard_types = False
        bib_parser.homogenise_fields = True

        with open(t, 'r') as f:
            db = b.load(f, bib_parser)
            all_dbs.append(db)

    logging.info("DB Sizes: {}".format(", ".join([str(len(x.entries)) for x in all_dbs])))

    # Sort the databases by their size and use the largest as the primary
    sorted_dbs = sorted([(len(x.entries), x) for x in all_dbs], reverse=True)
    head = sorted_dbs[0][1]
    rest = sorted_dbs[1:]
    head_set = {x['ID'] for x in head.entries}
    missing_keys = set()

    # For the remaining databases, get the entries missing from the primary
    for _, db in rest:
        db_set = {x['ID'] for x in db.entries}
        if head_set.issuperset(db_set):
            continue
        missing_keys.update(db_set.difference(head_set))

    logging.info("{} Keys missing from master: {}".format(len(missing_keys),
                                                          "\n".join(missing_keys)))
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))
    assert exists(args.target)
    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    # Load the targets
    bib_files = retrieval.get_data_files(args.target, ".bib")
    db = BU.parse_bib_files(bib_files, func=bib_proc.author_extract)
    logging.info("Bibtex loaded")

    logging.info(f"Processing Entries: {len(db.entries)}")
    result = process_db(db)

    # Write the output
    with open(args.output, 'w') as f:
        f.write("\n".join(result))

    logging.info("Complete")
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="\n".join([""]))
    parser.add_argument('--aBool', action="store_true")
    parser.add_argument('--target', action="append", required=True)
    parser.add_argument('--tags', required=True)
    parser.add_argument('--out', required=True)

    args = parser.parse_args()
    args.out = abspath(expanduser(args.out))
    args.tags = abspath(expanduser(args.tags))

    found = set(get_data_files(args.target, ".org"))
    tag_index = IndexFile.builder(args.tags)

    # Remove every file the tag index mentions, leaving the untagged ones
    for tag, mentioned in tag_index.items():
        found.difference_update(mentioned)

    # Write the report
    with open(args.out, 'w') as f:
        f.write("\n".join(sorted(found)))
def main():
    args = parser.parse_args()
    args.library = [abspath(expanduser(x)) for x in args.library]
    args.output = abspath(expanduser(args.output))

    all_bibs = retrieval.get_data_files(args.library, ".bib")
    logging.info("Found {} bib files".format(len(all_bibs)))

    db = b.bibdatabase.BibDatabase()
    BU.parse_bib_files(all_bibs, func=bib_proc.year_parse, database=db)
    logging.info("Loaded bibtex entries: {}".format(len(db.entries)))

    # Count entries (or a single tag's entries) over time
    year_counts = []
    if args.tag:
        year_counts = get_tag_across_years(db, args.tag)
    else:
        year_counts: List[Tuple[datetime, int]] = get_entries_across_years(db)
        year_counts = [x for x in year_counts if x[1] > 5]

    # Chart the counts
    to_draw = [("Years", year_counts)]
    # n_rows = int(len(to_draw) / 2)
    # n_cols = int(len(to_draw) / 2)
    n_rows = 1
    n_cols = 1
    for count, paired in enumerate(to_draw):
        name, data = paired
        logging.info("Drawing {}".format(name))
        x = [point[0] for point in data]
        y = [point[1] for point in data]
        plt.subplot(n_rows, n_cols, count + 1)
        plt.scatter(x, y, alpha=0.3)
        plt.title(name)
        plt.gcf().autofmt_xdate()

    logging.info("Finished, saving")
    plt.savefig(args.output)
    plt.show()
def main(): args = parser.parse_args() args.output = abspath(expanduser(args.output)) logging.info("Targeting: {}".format(args.target)) logging.info("Output to: {}".format(args.output)) #load each of the specified files target_files = retrieval.get_data_files(args.target, ".bib") dbs = [BU.parse_bib_files(x, bib_proc.nop) for x in target_files] main_db = b.bibdatabase.BibDatabase() # Load the main database if exists(args.output): BU.parse_bib_files(args.output, bib_proc.nop, database=main_db) main_set = set(main_db.get_entry_dict().keys()) total_entries = main_db.entries[:] missing_keys_main = set() # Get entries missing from the main database for db in dbs: db_dict = db.get_entry_dict() db_set = set(db_dict.keys()) missing_keys = db_set.difference(main_set) missing_keys_main.update(missing_keys) total_entries += [db_dict[x] for x in missing_keys] logging.info("{} missing entries".format(len(total_entries))) main_db.entries = total_entries # Write out the combined database logging.info("Bibtex loaded") writer = BibTexWriter() writer.align_values = True with open(join(args.output), 'a') as f: f.write(writer.write(main_db))
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))
    assert exists(args.target)
    logging.info("Targeting: {}".format(args.target))
    logging.info("Output to: {}".format(args.output))

    # Load the targets
    bib_files = retrieval.get_data_files(args.target, ".bib")
    db = BU.parse_bib_files(bib_files, func=bib_proc.clean_full)
    logging.info("Bibtex loaded")

    logging.info(f"Processing Entries: {len(db.entries)}")
    result = process_db(db)
    logging.info("Processing complete")

    # Write each report out
    with open(join(args.output, "bibtex.years"), 'w') as f:
        f.write(str(result.all_years))

    with open(join(args.output, "bibtex.authors"), 'w') as f:
        f.write(str(result.author_counts))

    with open(join(args.output, "bibtex.no_file"), 'w') as f:
        f.write("\n".join(result.no_file))

    with open(join(args.output, "bibtex.missing_file"), 'w') as f:
        f.write("\n".join(result.missing_files))

    with open(join(args.output, "bibtex.duplicates"), 'w') as f:
        f.write("\n".join(result.duplicates))

    with open(join(args.output, "bibtex.untagged"), 'w') as f:
        f.write("\n".join(result.non_tagged))

    logging.info("Complete")
console = root_logger.StreamHandler()
console.setLevel(root_logger.INFO)
root_logger.getLogger('').addHandler(console)
logging = root_logger.getLogger(__name__)

FILE_TYPES = [".gif", ".jpg", ".jpeg", ".png", ".mp4", ".bmp"]

# See https://docs.python.org/3/howto/argparse.html
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog="\n".join(["Find and Hash images, revealing duplicates"]))
parser.add_argument('-t', '--target', action='append', required=True)
parser.add_argument('-o', '--output', required=True)

if __name__ == "__main__":
    logging.info("Starting Photo Description")
    args = parser.parse_args()

    logging.info("Finding images")
    images = retrieval.get_data_files(args.target, FILE_TYPES)
    logging.info("Hashing {} images".format(len(images)))
    hash_dict, conflicts = hash_check.hash_all(images)
    logging.info("Hashed all images, {} conflicts".format(len(conflicts)))

    # Write the conflicts to an org file
    with open(expanduser(args.output), 'w') as f:
        f.write("* Conflicts\n")
        for x in conflicts:
            f.write("** {}\n".format(x))
            f.write("\n".join([" [[{}]]".format(y) for y in hash_dict[x]]))
parser.add_argument('-l', '--library', required=True)
parser.add_argument('-o', '--output', required=True)
parser.add_argument('-d', '--domain', action="store_true")

if __name__ == "__main__":
    args = parser.parse_args()
    args.library = abspath(expanduser(args.library))
    args.output = abspath(expanduser(args.output))
    assert exists(args.library)

    # Load the library
    logging.info("Loading Library")
    library = BookmarkCollection()
    lib_files = retrieval.get_data_files(args.library, ".html")
    for lib_f in lib_files:
        with open(lib_f, 'r') as f:
            library.add_file(f)

    domains = TagFile()
    # Process the library, counting each bookmark's domain
    logging.info("Processing Library")
    for bkmk in library:
        parsed = bkmk.url_comps
        domains.inc(parsed.netloc)

    logging.info("Domain Counts")
    with open('{}.domain_counts'.format(args.output), 'w') as f:
        # Assumption: persist the counts via the TagFile's string form,
        # as the indexer scripts do with IndexFile
        f.write(str(domains))
# See https://docs.python.org/3/howto/argparse.html
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog="\n".join(["Find images missing from the library"]))
parser.add_argument('-l', '--library', action="append", required=True)
parser.add_argument('-t', '--target', action="append", required=True)
parser.add_argument('-c', '--copy', action="store_true")
parser.add_argument('-o', '--output', required=True)

if __name__ == "__main__":
    logging.info("Starting Photo Description")
    args = parser.parse_args()

    logging.info("Finding library images")
    library_images = retrieval.get_data_files(args.library, retrieval.img_and_video)
    logging.info("Finding target images")
    target_images = retrieval.get_data_files(args.target, retrieval.img_and_video)

    logging.info("Finding missing images")
    missing = hash_check.find_missing(library_images, target_images)
    logging.info("Found {} missing images".format(len(missing)))

    # Write the missing images to an org file, in roughly 100 groups
    if not args.copy:
        count = 0
        grouping = max(1, int(len(missing) / 100))
        with open(expanduser(args.output), 'w') as f:
            f.write("* Missing\n")
            for i, x in enumerate(missing):
                if (i % grouping) == 0:
                    # Assumption: the truncated body starts a new subheading
                    # per group and links each file, mirroring the conflict
                    # report script above
                    f.write("** Group {}\n".format(count))
                    count += 1
                f.write("   [[{}]]\n".format(x))
    )
parser.add_argument('-s', '--source', action="append", required=True)
parser.add_argument('-q', '--query', default=None)
parser.add_argument('-o', '--output', required=True)

query_re = re.compile(r'\*+\s+\(\d+\) (.+)$')

if __name__ == "__main__":
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))
    if args.query is not None:
        args.query = abspath(expanduser(args.query))

    # Load any sources
    source_files = retrieval.get_data_files(args.source, ".bookmarks")
    total = BookmarkCollection()
    for bkmk_f in source_files:
        with open(bkmk_f, 'r') as f:
            total.add_file(f)

    # Filter by any queries
    if args.query:
        with open(args.query, 'r') as f:
            lines = f.readlines()
        matches = [query_re.findall(x) for x in lines]
        queries = set([x[0] for x in matches if x])
        # TODO total.filter_queries(queries)

    # Export
    with open(args.output, "w") as f:
        # Assumption: the truncated body writes the collection's string form,
        # as the domain-grouping script does with its bookmark files
        f.write(str(total))
def main():
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))
    assert isdir(args.library)
    assert isdir(args.output)
    assert isdir(args.target)

    # Get the target bib files
    all_bibs = retrieval.get_data_files(args.library, ".bib")
    main_db = BU.parse_bib_files(all_bibs)
    logging.info("Loaded Database: {} entries".format(len(main_db.entries)))

    count = 0
    all_file_mentions = []
    all_existing_files = retrieval.get_data_files(args.target, [".epub", ".pdf"],
                                                  normalize=True)

    # Convert the entries to unicode, collecting their file fields
    for i, entry in enumerate(main_db.entries):
        if i % 10 == 0:
            logging.info("{}/10 Complete".format(count))
            count += 1
        unicode_entry = b.customization.convert_to_unicode(entry)
        entry_keys = [x for x in unicode_entry.keys() if FILE_RE.search(x)]
        for k in entry_keys:
            all_file_mentions.append(normalize('NFD', unicode_entry[k]))

    logging.info("Found {} files mentioned in bibliography".format(len(all_file_mentions)))
    logging.info("Found {} files existing".format(len(all_existing_files)))

    logging.info("Normalizing paths")
    norm_mentions = set()
    # Normalise all paths mentioned in the bibtex entries
    for x in all_file_mentions:
        path = PATH_NORM.sub("", x)
        if path in norm_mentions:
            logging.info("Duplicate file mention: {}".format(path))
        else:
            norm_mentions.add(path)

    norm_existing = set()
    # Remove duplicates among the existing files
    for x in all_existing_files:
        path = PATH_NORM.sub("", x)
        if path in norm_existing:
            logging.info("Duplicate file existence: {}".format(path))
        else:
            norm_existing.add(path)

    logging.info("Normalized paths")
    mentioned_non_existent = norm_mentions - norm_existing
    existing_not_mentioned = norm_existing - norm_mentions
    logging.info("Mentioned but not existing: {}".format(len(mentioned_non_existent)))
    logging.info("Existing but not mentioned: {}".format(len(existing_not_mentioned)))

    # Create the output files
    with open(join(args.output, "bibtex.not_existing"), 'w') as f:
        f.write("\n".join(mentioned_non_existent))

    with open(join(args.output, "bibtex.not_mentioned"), 'w') as f:
        f.write("\n".join(existing_not_mentioned))
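# Why the NFD normalization above matters (a note, not from the source):
# macOS's filesystem stores filenames in a decomposed unicode form, so an
# accented character in a bibtex file field can differ byte-for-byte from the
# same name on disk; normalizing both sides makes the set comparison meaningful.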
def convert_to_day_counts(tweets):
    # Count the tweets per day of the month (0-indexed)
    days = {x: 0 for x in range(31)}
    for tweet in tweets:
        day = tweet[0].day - 1
        days[day] += 1

    return sorted(days.items(), key=lambda x: x[0])

if __name__ == "__main__":
    args = parser.parse_args()
    args.library = [abspath(expanduser(x)) for x in args.library]
    args.target = abspath(expanduser(args.target))

    all_orgs = retrieval.get_data_files(args.library, ".org")
    logging.info("Found {} org files".format(len(all_orgs)))

    # Process the tweets
    all_tweets: List[Tuple[datetime, str]] = get_tweet_dates_and_ids(all_orgs)
    logging.info("Found {} tweets".format(len(all_tweets)))

    # Remove duplicates and convert the date strings
    tweet_dict = {x[0]: convert_tweet_date(x[1]) for x in all_tweets}
    logging.info("Sorting {} tweets".format(len(tweet_dict)))
    ordered = sorted([(x[1], x[0]) for x in tweet_dict.items()], key=lambda x: x[1])
    id_convertor = {x: i for i, x in enumerate(tweet_dict.keys())}

    # Convert to 24 hour time only
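# A quick shape-check of convert_to_day_counts above (hypothetical data):
#   convert_to_day_counts([(datetime(2020, 1, 5), "a"),
#                          (datetime(2020, 1, 5), "b")])
#   -> [(0, 0), (1, 0), (2, 0), (3, 0), (4, 2), ..., (30, 0)]
# i.e. index 4 (the 5th of the month) holds both tweets.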
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog="\n".join(["For a bookmark file",
                      "Create an org file of paired links",
                      "which compare the original link",
                      "with the link minus an html parameter"]))
parser.add_argument('-l', '--library', required=True)
parser.add_argument('-o', '--output', required=True)

if __name__ == "__main__":
    args = parser.parse_args()
    args.library = abspath(expanduser(args.library))
    args.output = abspath(expanduser(args.output))
    assert exists(args.library)

    # Load the library
    logging.info("Loading Library")
    lib_files = retrieval.get_data_files(args.library, ".bookmarks")
    library = BookmarkCollection()
    for bkmk_f in lib_files:
        with open(bkmk_f, 'r') as f:
            library.add_file(f)

    logging.info("Processing Library")
    # TODO Generate org file
    # org_str = the_trie.org_format_queries()
    # with open("{}.org".format(args.output), 'w') as f:
    #     f.write(org_str)
import argparse
# Setup root_logger:
import logging as root_logger
from os import listdir
from os.path import (abspath, exists, expanduser, isdir, isfile, join,
                     split, splitext)
from subprocess import call

from bkmkorg.utils.pdf import pdf as PU
from bkmkorg.utils.dfs import files as retrieval

LOGLEVEL = root_logger.DEBUG
LOG_FILE_NAME = "log.{}".format(splitext(split(__file__)[1])[0])
root_logger.basicConfig(filename=LOG_FILE_NAME, level=LOGLEVEL, filemode='w')

console = root_logger.StreamHandler()
console.setLevel(root_logger.INFO)
root_logger.getLogger('').addHandler(console)
logging = root_logger.getLogger(__name__)
##############################
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog="\n".join([""]))
parser.add_argument('-l', '--library', required=True)
##############################

if __name__ == "__main__":
    args = parser.parse_args()
    files = retrieval.get_data_files(args.library, ".pdf")
    PU.convert_pdfs_to_text(files)
LOG_FILE_NAME = "log.{}".format(splitext(split(__file__)[1])[0])
root_logger.basicConfig(filename=LOG_FILE_NAME, level=LOGLEVEL, filemode='w')

console = root_logger.StreamHandler()
console.setLevel(root_logger.INFO)
root_logger.getLogger('').addHandler(console)
logging = root_logger.getLogger(__name__)
##############################
# See https://docs.python.org/3/howto/argparse.html
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog="\n".join([""]))
parser.add_argument('--target', action="append", required=True)
parser.add_argument('--output', required=True)

if __name__ == "__main__":
    args = parser.parse_args()
    args.output = abspath(expanduser(args.output))

    # Find all pdfs in the subdirectory and merge them
    pdfs_to_process = retrieval.get_data_files(args.target, ".pdf")
    logging.info("Merging {} pdfs".format(len(pdfs_to_process)))
    PU.merge_pdfs(pdfs_to_process, output=args.output)

    # writer.trailer.Info = IndirectPdfDict(
    #     Title='your title goes here',
    #     Author='your name goes here',
    #     Subject='what is it all about?',
    #     Creator='some script goes here',
    # )
                    '--library',
                    default="~/github/writing/other_files/main_bookmarks.html")
parser.add_argument('-s', '--source', action='append', required=True)
parser.add_argument('-o', '--output', default="~/Desktop/missing_bookmarks.html")

if __name__ == "__main__":
    args = parser.parse_args()
    args.library = abspath(expanduser(args.library))
    args.source = [abspath(expanduser(x)) for x in args.source]
    args.output = abspath(expanduser(args.output))

    logging.info("Finding Links missing from: {}".format(args.library))
    # Get the sources
    sources = retrieval.get_data_files(args.source, [".bookmarks"])
    logging.info("Using Source: {}".format(sources))

    # Load the library
    library_files = retrieval.get_data_files(args.library, ".bookmarks")
    library: BookmarkCollection = BookmarkCollection()
    for bkmk_f in library_files:
        with open(bkmk_f, 'r') as f:
            library.add_file(f)

    to_check = BookmarkCollection()
    # Load each specified source file
    for x in sources:
        with open(x, 'r') as f:
            to_check.add_file(f)
def construct_user_summaries(component_dir, combined_threads_dir, total_users):
    """ Collate threads together by originating user """
    logging.info("Constructing summaries\n\tfrom: {} \n\tto: {}".format(
        component_dir, combined_threads_dir))
    user_lookup = total_users
    # Create the final orgs, grouped by head user
    components = get_data_files(component_dir, ext=".json")
    for comp in components:
        logging.info("Constructing Summary for: {}".format(comp))
        # Read the component
        with open(comp, 'r') as f:
            data = json.load(f, strict=False)

        if not bool(data):
            logging.warning(f"No Data found in {comp}")
            continue

        # Index all tweets by id
        tweets = {x['id_str']: x for x in data}

        # Find the user that authored most of the thread
        user_counts = defaultdict(lambda: 0)
        for x in data:
            user_counts[x['user']['id_str']] += 1

        head_user = max(user_counts.items(), key=lambda x: x[1])[0]
        screen_name = str(head_user)
        if head_user in user_lookup:
            screen_name = user_lookup[head_user]['screen_name']

        logging.debug("Constructing graph")
        graph = nx.DiGraph()
        quotes = set()
        roots = set()
        for tweet in data:
            if tweet['in_reply_to_status_id_str'] is not None:
                graph.add_edge(tweet['in_reply_to_status_id_str'], tweet['id_str'])
            else:
                graph.add_node(tweet['id_str'])
                roots.add(tweet['id_str'])

            if 'quoted_status_id_str' in tweet and tweet['quoted_status_id_str'] is not None:
                quotes.add(tweet['quoted_status_id_str'])

        # DFS to get the longest chain
        chains = []
        if bool(roots):
            chains = dfs_chains(graph, roots)

        if not bool(chains):
            chains = [list(roots.union(quotes))]

        # Assign the longest chain as the main thread
        main_thread = max(chains, key=lambda x: len(x))
        main_set = set(main_thread)
        main_index = chains.index(main_thread)

        # Assign the rest as secondary conversations
        rest = chains[:main_index] + chains[main_index + 1:]
        rest = [x for x in rest if bool(x)]

        # Remove duplications across threads
        cleaned_rest = []
        for thread in rest:
            cleaned = [x for x in thread if x not in main_set]
            cleaned_rest.append(cleaned)
            main_set.update(cleaned)

        # Create an accessor to the summary file
        summary = TwitterUserSummary(screen_name, combined_threads_dir)
        if summary.user is None:
            if head_user in user_lookup:
                summary.set_user(user_lookup[head_user])
            else:
                summary.set_user(screen_name)

        summary.add_thread(main_thread, cleaned_rest, quotes, tweets)

        # Write out the user file
        summary.write()