def start_language_processing(parser, args):

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    parser.add_argument('--lang', '--languages', dest='languages',
        help="The list of languages to match, separated by commas.",
        required=True
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting filtering of mementos by languages...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    return args, logger, urimdata

def discover_timemaps(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_timemaps_by_input_type

    parser = argparse.ArgumentParser(
        description="Discover the timemaps in a web archive collection.",
        prog="hc identify timemaps")

    args = process_input_args(args, parser)
    output_type = 'timemaps'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting timemap discovery run.")
    logger.info("Using {} for cache storage".format(args.cache_storage))

    uritdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_timemaps_by_input_type)

    save_resource_data(args.output_filename, uritdata, 'timemaps',
                       list(uritdata.keys()))

    logger.info("Done with timemap discovery run. Output is in {}".format(
        args.output_filename))

def exclude_rank(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.utils import get_web_session, save_resource_data

    parser = argparse.ArgumentParser(
        description="Exclude mementos whose score meets the given criteria.",
        prog="hc filter exclude score"
    )

    parser.add_argument('--criteria', default=1, dest='criteria',
        help="The comparison criteria (e.g., >=0.5); mementos whose score satisfies it are excluded."
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting exclusion of mementos whose score meets the criteria...")

    session = get_web_session(cache_storage=args.cache_storage)

    # Note: crawling is not applicable to this filter, hence a crawl depth of 1
    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, 1,
        session, discover_mementos_by_input_type
    )

    # extract_rank_key_from_input is expected to be defined elsewhere in this module
    rankkey = extract_rank_key_from_input(urimdata)

    filtered_urims = []

    for urim in urimdata:
        # concatenate the score and the criteria into a Python expression,
        # e.g. "0.75>=0.5", and keep the mementos that do NOT satisfy it
        if not eval("{}{}".format(urimdata[urim][rankkey], args.criteria)):
            filtered_urims.append(urim)

    logger.info("Saving {} filtered URI-Ms to {}".format(
        len(filtered_urims), args.output_filename))

    save_resource_data(
        args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("Done filtering mementos by score, output is saved to {}".format(
        args.output_filename
    ))

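# A minimal standalone sketch (not part of Hypercane) of a safer alternative
# to the eval-based criteria check in exclude_rank above: parse the criteria
# string and dispatch through an operator table. The helper name, the
# operator table, and the example values are assumptions for illustration.
import operator
import re

_CRITERIA_OPS = {
    '>=': operator.ge, '<=': operator.le, '==': operator.eq,
    '!=': operator.ne, '>': operator.gt, '<': operator.lt,
}

def meets_criteria(score, criteria):
    # split a criteria string such as ">=0.5" into an operator and a threshold
    match = re.match(r'(>=|<=|==|!=|>|<)\s*(-?\d+(?:\.\d+)?)\s*$',
                     str(criteria).strip())
    if match is None:
        raise ValueError("unparseable criteria: {}".format(criteria))
    op, threshold = match.group(1), float(match.group(2))
    return _CRITERIA_OPS[op](float(score), threshold)

# meets_criteria(0.75, ">=0.5") -> True, so exclude_rank would drop that URI-M
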
def pubdate_else_memento_datetime(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.order.dsa1_publication_alg import order_by_dsa1_publication_alg

    parser = argparse.ArgumentParser(
        description="Order by publication date first, fall back to memento-datetime.",
        prog="hc order pubdate_else_memento_datetime")

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    logger.info(
        "Starting ordering of the documents by the DSA1 publication algorithm..."
    )

    session = get_web_session(cache_storage=args.cache_storage)

    if args.input_type == "mementos":
        urimdata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session, discover_mementos_by_input_type)
    else:
        # TODO: derive URI-Ms from input type
        raise NotImplementedError(
            "Input type of {} not yet supported for ordering".format(
                args.input_type))

    logger.info("extracted {} mementos from input".format(len(urimdata.keys())))

    ordered_urims = order_by_dsa1_publication_alg(list(urimdata.keys()),
                                                  args.cache_storage)

    logger.info("placed {} mementos in order".format(len(ordered_urims)))

    # save the URI-Ms in their computed order, not the discovery order
    save_resource_data(args.output_filename, urimdata, 'mementos',
                       ordered_urims)

    logger.info("Finished ordering documents, output is at {}".format(
        args.output_filename))

def report_seedstats(args):

    import argparse
    import json
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_original_resources_by_input_type
    from hypercane.report.seedstats import calculate_domain_diversity, \
        calculate_path_depth_diversity, most_frequent_seed_uri_path_depth, \
        calculate_top_level_path_percentage, calculate_percentage_querystring

    parser = argparse.ArgumentParser(
        description="Provide a report containing statistics on the original-resources derived from the input.",
        prog="hc report seed-statistics")

    args = process_input_args(args, parser)
    output_type = 'original-resources'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection original resource statistics run")

    urirs = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_original_resources_by_input_type)

    output = {}
    output['number of original-resources'] = len(urirs)
    output['domain diversity'] = calculate_domain_diversity(urirs)
    output['path depth diversity'] = calculate_path_depth_diversity(urirs)
    output['most frequent path depth'] = most_frequent_seed_uri_path_depth(urirs)
    output['percentage of top-level URIs'] = calculate_top_level_path_percentage(urirs)
    output['query string percentage'] = calculate_percentage_querystring(urirs)

    with open(args.output_filename, 'w') as report_file:
        json.dump(output, report_file, indent=4)

    logger.info(
        "Done with collection original resource statistics report, output is in {}".format(
            args.output_filename))

def synthesize_warcs(args):

    import os
    import argparse
    import traceback
    import hypercane.errors
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    parser = argparse.ArgumentParser(
        description="Create WARCs from the mementos in a web archive collection.",
        prog="hc synthesize warcs"
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting generation of WARCs from input")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("discovered {} URI-Ms from the input".format(len(urimdata)))

    if not os.path.exists(args.output_directory):
        logger.info("Output directory {} does not exist, creating...".format(
            args.output_directory))
        os.makedirs(args.output_directory)

    from hypercane.synthesize.warcs import synthesize_warc

    # TODO: make this multithreaded
    for urim in urimdata.keys():
        try:
            synthesize_warc(urim, session, args.output_directory)
        except Exception:
            logger.exception("failed to generate WARC for URI-M {}".format(urim))
            hypercane.errors.errorstore.add(urim, traceback.format_exc())

    logger.info("Done generating WARCs, output is at {}".format(
        args.output_directory))

def time_slice(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.cluster.time_slice import execute_time_slice

    parser = argparse.ArgumentParser(
        description="Cluster the input into slices based on memento-datetime.",
        prog="hc cluster time-slice"
    )

    parser.add_argument('-k', dest='k', default=None, type=int,
        help='The number of clusters to create.'
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Beginning time slicing of collection...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("There were {} mementos discovered in the input".format(
        len(urimdata)))

    urimdata_with_slices = execute_time_slice(
        urimdata, args.cache_storage, number_of_slices=args.k)

    # urimdata and urimdata_with_slices should share the same keys;
    # saving with urimdata's keys lets us detect a mismatch as an error
    save_resource_data(args.output_filename, urimdata_with_slices, 'mementos',
                       list(urimdata.keys()))

    logger.info("finished time slicing, output is available at {}".format(
        args.output_filename))

def image_count_scoring(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.score.image_count import score_by_image_count

    parser = argparse.ArgumentParser(
        description="Score the input using the number of images detected in each memento.",
        prog="hc score image-count")

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Beginning the scoring by image count")

    if args.input_type == "mementos":
        urimdata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session, discover_mementos_by_input_type)
    else:
        # TODO: derive URI-Ms from input type
        raise NotImplementedError(
            "Input type of {} not yet supported for scoring".format(
                args.input_type))

    logger.info("using session {}".format(session))
    logger.info("using cache storage: {}".format(args.cache_storage))

    urimdata = score_by_image_count(urimdata, session)

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       list(urimdata.keys()))

    logger.info("Finished scoring by image count, output is at {}".format(
        args.output_filename))

def include_urir(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.containing_urir import filter_by_urir

    parser = argparse.ArgumentParser(
        description="Include only mementos with an original resource matching the given pattern.",
        prog="hc filter include-only containing-url-pattern"
    )

    parser.add_argument('--url-pattern', '--urir-pattern',
        dest='urir_pattern',
        help="The regular expression pattern of the URL to match (as Python regex)",
        required=True
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of mementos whose original resource URL matches pattern {}...".format(args.urir_pattern))

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, 1,
        session, discover_mementos_by_input_type
    )

    urims = list(urimdata.keys())

    filtered_urims = filter_by_urir(urims, args.cache_storage, args.urir_pattern)

    save_resource_data(args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("Completed detection of mementos whose original resource URL matches pattern {}, output is in {}".format(
        args.urir_pattern, args.output_filename
    ))

def include_largest_clusters(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.largest_cluster import return_largest_clusters

    parser = argparse.ArgumentParser(
        description="Include only mementos from the largest clusters. Input must contain cluster information. If two clusters have the same size, the first listed in the input is returned.",
        prog="hc filter include-only largest-cluster"
    )

    parser.add_argument('--cluster-count', default=1, dest='cluster_count',
        help="The number of clusters' worth of mementos to return, sorted in descending order by cluster size."
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of mementos in the largest cluster...")

    session = get_web_session(cache_storage=args.cache_storage)

    # Note: crawling is not applicable to this filter, hence a crawl depth of 1
    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, 1,
        session, discover_mementos_by_input_type
    )

    filtered_urims = return_largest_clusters(urimdata, int(args.cluster_count))

    logger.info("returning the largest clusters with {} mementos".format(
        len(filtered_urims)))

    save_resource_data(args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("Completed detection of mementos in the largest cluster, output is in {}".format(
        args.output_filename
    ))

def discover_collection_metadata(args):

    import argparse
    import json
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_original_resources_by_input_type
    # these helpers are expected to come from Hypercane's metadata reporting module
    from hypercane.report.metadata import generate_collection_metadata, \
        generate_blank_metadata

    parser = argparse.ArgumentParser(
        description="Discover the collection metadata in a web archive collection. Only Archive-It is supported at this time.",
        prog="hc report metadata")

    args = process_input_args(args, parser)
    output_type = 'original-resources'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection metadata discovery run.")

    if args.input_type == 'archiveit':
        metadata = generate_collection_metadata(args.input_arguments, session)
    else:
        logger.warning(
            "Metadata reports are only supported for Archive-It collections, proceeding to create JSON output for URI-Rs."
        )
        urirdata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session,
            discover_original_resources_by_input_type)
        metadata = generate_blank_metadata(list(urirdata.keys()))

    with open(args.output_filename, 'w') as metadata_file:
        json.dump(metadata, metadata_file, indent=4)

    logger.info("Done with collection metadata discovery run. Output is in {}".format(
        args.output_filename))

def include_highest_score_per_cluster(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.highest_rank_per_cluster import \
        return_highest_ranking_memento_per_cluster

    parser = argparse.ArgumentParser(
        description="Include only mementos with the highest score from each cluster.",
        prog="hc filter include-only highest-score-per-cluster"
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of mementos with the highest score in each cluster...")

    session = get_web_session(cache_storage=args.cache_storage)

    # Note: crawling is not applicable to this filter, hence a crawl depth of 1
    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, 1,
        session, discover_mementos_by_input_type
    )

    # extract_rank_key_from_input is expected to be defined elsewhere in this module
    rankkey = extract_rank_key_from_input(urimdata)

    logger.info("using score key {}".format(rankkey))

    filtered_urims = return_highest_ranking_memento_per_cluster(urimdata, rankkey)

    save_resource_data(args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("Completed detection of mementos with the highest score in each cluster, output is in {}".format(
        args.output_filename
    ))

def cluster_by_urir(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.cluster.original_resource import cluster_by_urir

    parser = argparse.ArgumentParser(
        description="Cluster the input based on original resource URI (URI-R).",
        prog="hc cluster urir"
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Beginning original resource URI clustering of collection...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("There were {} mementos discovered in the input".format(
        len(urimdata)))

    urimdata_with_clusters = cluster_by_urir(urimdata, args.cache_storage)

    # urimdata and urimdata_with_clusters should share the same keys;
    # saving with urimdata's keys lets us detect a mismatch as an error
    save_resource_data(args.output_filename, urimdata_with_clusters,
                       'mementos', list(urimdata.keys()))

    logger.info("finished clustering by original resource URI, output is available at {}".format(
        args.output_filename))

def bm25_ranking(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.score.bm25 import rank_by_bm25

    parser = argparse.ArgumentParser(
        description="Score the input using a query and the BM25 algorithm.",
        prog="hc score bm25")

    parser.add_argument('--query', dest='query', required=True,
        help="The query to use with BM25")

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Beginning the scoring by BM25")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type)

    urimdata = rank_by_bm25(urimdata, session, args.query, args.cache_storage)

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       list(urimdata.keys()))

    logger.info("Finished scoring by BM25, output is at {}".format(
        args.output_filename))

def start_containing_pattern(parser, args, include):

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import save_resource_data, get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.containing_pattern import filter_pattern

    parser.add_argument('--pattern', dest='pattern_string',
        help="The regular expression pattern to match (as Python regex)",
        required=True
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    session = get_web_session(cache_storage=args.cache_storage)

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting filter of mementos containing pattern...")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    urims = list(urimdata.keys())

    # the pattern argument is stored under dest='pattern_string'
    filtered_urims = filter_pattern(
        urims, args.cache_storage, args.pattern_string, include
    )

    save_resource_data(
        args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("done filtering mementos by pattern, output is in {}".format(
        args.output_filename))

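# A quick standalone illustration (pattern and text invented) of the Python
# regex semantics behind the --pattern argument above; filter_pattern is
# expected to search memento content for this expression.
import re

pattern = re.compile(r'climate\s+change', re.IGNORECASE)
assert pattern.search("Reports on Climate  Change in 2009") is not None
assert pattern.search("unrelated content") is None
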
def remove_near_duplicates(parser, args):

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.near_duplicates import filter_near_duplicates

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of near-duplicate mementos...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.debug("urimdata: {}".format(urimdata))

    urims = list(urimdata.keys())

    filtered_urims = filter_near_duplicates(urims, args.cache_storage)

    logger.info("writing {} mementos to {}".format(
        len(filtered_urims), args.output_filename))

    save_resource_data(args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("Completed detection of near-duplicates, output is saved to {}".format(
        args.output_filename))

def include_near_datetime(args):

    import argparse
    from datetime import datetime
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.near_datetime import filter_by_memento_datetime

    parser = argparse.ArgumentParser(
        description="Include mementos whose memento-datetimes fall within the range of start-datetime and end-datetime.",
        prog="hc filter include-only near-datetime"
    )

    parser.add_argument('--start-datetime', '--lower-datetime',
        dest='lower_datetime',
        help="The lower bound datetime in YYYY-mm-ddTHH:MM:SS format.",
        required=True
    )

    parser.add_argument('--end-datetime', '--upper-datetime',
        dest='upper_datetime',
        help="The upper bound datetime in YYYY-mm-ddTHH:MM:SS format.",
        required=True
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting filtering of mementos by memento-datetime...")

    lower_datetime = datetime.strptime(args.lower_datetime, "%Y-%m-%dT%H:%M:%S")
    upper_datetime = datetime.strptime(args.upper_datetime, "%Y-%m-%dT%H:%M:%S")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    urims = list(urimdata.keys())

    filtered_urims = filter_by_memento_datetime(
        urims, args.cache_storage, lower_datetime, upper_datetime)

    save_resource_data(args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("done filtering mementos by memento-datetime, output is in {}".format(
        args.output_filename))

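# A standalone illustration (dates invented) of the datetime format accepted
# by --start-datetime and --end-datetime above.
from datetime import datetime

lower = datetime.strptime("2020-01-01T00:00:00", "%Y-%m-%dT%H:%M:%S")
upper = datetime.strptime("2020-12-31T23:59:59", "%Y-%m-%dT%H:%M:%S")
assert lower < upper  # mementos with memento-datetimes in this range are kept
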
def report_metadatastats(args):

    import sys
    import argparse
    import json
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.report.metadatastats import get_pct_seeds_with_metadata, \
        get_pct_seeds_with_title, get_pct_seeds_with_description, \
        get_mean_default_field_score, get_metadata_compression_ratio, \
        get_mean_raw_field_count
    # generate_collection_metadata is expected to come from Hypercane's
    # metadata reporting module
    from hypercane.report.metadata import generate_collection_metadata

    parser = argparse.ArgumentParser(
        description="Provide a report containing statistics on the metadata of a web archive collection. Only Archive-It is supported at this time.",
        prog="hc report metadata-statistics")

    args = process_input_args(args, parser)

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection metadata statistics run.")

    output = {}

    if args.input_type == 'archiveit':
        metadata = generate_collection_metadata(args.input_arguments, session)
        output['id'] = metadata['id']
        output['archived since'] = metadata['archived_since']
        output['# of seeds'] = len(metadata['seed_metadata']['seeds'])
        output['% of seeds with any metadata'] = get_pct_seeds_with_metadata(metadata)
        output['% title field use'] = get_pct_seeds_with_title(metadata)
        output['% description field use'] = get_pct_seeds_with_description(metadata)
        output['mean default fields metadata score'] = get_mean_default_field_score(metadata)
        output['mean non-normalized metadata count'] = get_mean_raw_field_count(metadata)
        output['metadata compression ratio'] = get_metadata_compression_ratio(metadata)
    else:
        logger.critical(
            "Metadata statistics are only supported for Archive-It collections"
        )
        sys.exit(255)

    with open(args.output_filename, 'w') as report_file:
        json.dump(output, report_file, indent=4)

    logger.info("Done with collection metadata statistics run. Output is in {}".format(
        args.output_filename))

def cluster_by_lda(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.cluster.lda import cluster_with_lda

    parser = argparse.ArgumentParser(
        description="Cluster the input based on LDA topic modeling with gensim.",
        prog="hc cluster lda"
    )

    # TODO: add argument for top scoring cluster (default) or all of them

    parser.add_argument('--num_topics', dest='num_topics',
        default=20, required=False, type=int,
        help='The number of topics to cluster.'
    )

    parser.add_argument('--num_passes', dest='num_passes',
        default=2, required=False, type=int,
        help='The number of passes through the corpus during training. This corresponds to the Gensim LDA setting of the same name. See: https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html'
    )

    parser.add_argument('--num_iterations', dest='num_iterations',
        default=50, required=False, type=int,
        help='The number of iterations through each document during training. This corresponds to the Gensim LDA setting of the same name. See: https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html'
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Beginning LDA clustering of collection...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("There were {} mementos discovered in the input".format(
        len(urimdata)))

    urimdata_with_clusters = cluster_with_lda(
        urimdata, args.cache_storage,
        args.num_topics, args.num_iterations, args.num_passes)

    # urimdata and urimdata_with_clusters should share the same keys;
    # saving with urimdata's keys lets us detect a mismatch as an error
    save_resource_data(args.output_filename, urimdata_with_clusters,
                       'mementos', list(urimdata.keys()))

    logger.info("finished clustering with LDA, output is available at {}".format(
        args.output_filename))

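# A minimal standalone gensim sketch (toy corpus invented) showing how
# num_topics, num_passes, and num_iterations above map onto the Gensim LDA
# settings of the same names; cluster_with_lda is assumed to do something
# similar on memento text.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

docs = [["web", "archive", "memento"], ["topic", "model", "memento"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
lda = LdaModel(corpus, id2word=dictionary, num_topics=2, passes=2, iterations=50)
# each document can then be assigned its highest-probability topic as a cluster
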
def dsa1_scoring(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.score.dsa1_ranking import rank_by_dsa1_score

    parser = argparse.ArgumentParser(
        description="Score the input using the DSA1 scoring equation.",
        prog="hc score dsa1-scoring")

    parser.add_argument('--memento-damage-url', dest='memento_damage_url',
        default=None,
        help="The URL of the Memento-Damage service to use for scoring.")

    parser.add_argument('--damage-weight', dest='damage_weight',
        default=-0.40, type=float,
        help="The weight for the Memento-Damage score in the scoring.")

    parser.add_argument('--category-weight', dest='category_weight',
        default=0.15, type=float,
        help="The weight for the URI-R category score in the scoring.")

    parser.add_argument('--path-depth-weight', dest='path_depth_weight',
        default=0.45, type=float,
        help="The weight for the URI-R path depth score in the scoring.")

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Beginning the scoring by DSA1 scoring equation")

    if args.input_type == "mementos":
        urimdata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session, discover_mementos_by_input_type)
    else:
        # TODO: derive URI-Ms from input type
        raise NotImplementedError(
            "Input type of {} not yet supported for scoring".format(
                args.input_type))

    urimdata = rank_by_dsa1_score(
        urimdata, session,
        memento_damage_url=args.memento_damage_url,
        damage_weight=args.damage_weight,
        category_weight=args.category_weight,
        path_depth_weight=args.path_depth_weight)

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       list(urimdata.keys()))

    logger.info(
        "Finished ranking by DSA1 scoring equation, output is at {}".format(
            args.output_filename))

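# A hedged standalone sketch of the weighted combination that the three
# weight arguments above suggest; the authoritative DSA1 equation lives in
# hypercane.score.dsa1_ranking, and the feature values here are invented.
def dsa1_like_score(damage, category, path_depth,
                    damage_weight=-0.40, category_weight=0.15,
                    path_depth_weight=0.45):
    # damage carries a negative weight, so more damage lowers the score
    return (damage_weight * damage
            + category_weight * category
            + path_depth_weight * path_depth)

# dsa1_like_score(0.2, 1.0, 0.5) -> 0.295
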
def discover_mementos(args):

    import argparse
    from datetime import datetime
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    parser = argparse.ArgumentParser(
        description="Discover the mementos in a web archive collection.",
        prog="hc identify mementos")

    parser.add_argument('--accept-datetime', '--desired-datetime',
        default=None, required=False,
        dest='accept_datetime',
        help='(only for original resource input type)\n'
             'discover mementos closest to this datetime in YYYY-mm-ddTHH:MM:SS format',
        type=lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S'))

    parser.add_argument('--timegates',
        default=[
            "https://timetravel.mementoweb.org/timegate/",
            "https://web.archive.org/web/"
        ],
        required=False,
        dest='timegates',
        help='(only for original resource input type)\n'
             'use the given TimeGate endpoints to discover mementos',
        type=lambda s: [i.strip() for i in s.split(',')])

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting memento discovery run.")
    logger.info("Using {} for cache storage".format(args.cache_storage))

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type,
        accept_datetime=args.accept_datetime,
        timegates=args.timegates)

    logger.info(
        "discovered {} mementos, preparing to write the list to {}".format(
            len(urimdata), args.output_filename))

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       list(urimdata.keys()))

    logger.info("Done with memento discovery run. Output is in {}".format(
        args.output_filename))

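# A standalone illustration (values invented) of the two type= lambdas used
# by discover_mementos above: one parses a datetime string, the other splits
# a comma-separated list of TimeGate endpoints.
from datetime import datetime

parse_datetime = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S')
parse_list = lambda s: [i.strip() for i in s.split(',')]

assert parse_datetime("2009-05-22T14:00:00") == datetime(2009, 5, 22, 14, 0)
assert parse_list("https://timetravel.mementoweb.org/timegate/, https://web.archive.org/web/") == \
    ["https://timetravel.mementoweb.org/timegate/", "https://web.archive.org/web/"]
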
def cluster_by_kmeans(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.cluster.kmeans import cluster_by_memento_datetime

    parser = argparse.ArgumentParser(
        description="Cluster the input using the k-means algorithm.",
        prog="hc cluster kmeans"
    )

    parser.add_argument('--feature', dest='feature', default='memento-datetime',
        help='The feature on which to cluster the documents.'
    )

    parser.add_argument('-k', dest='k', default=28, type=int,
        help='The number of clusters to create.'
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Beginning the clustering of the collection by K-means with feature {}...".format(args.feature))

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("There were {} mementos discovered in the input".format(
        len(urimdata)))

    # k cannot exceed the number of mementos to cluster
    k = min(args.k, len(urimdata))

    if args.feature == 'memento-datetime':
        urimdata = cluster_by_memento_datetime(urimdata, args.cache_storage, k)
    else:
        raise NotImplementedError(
            "Clustering feature of {} not yet supported.".format(args.feature))

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       list(urimdata.keys()))

    logger.info("Clustering of collection into {} clusters via K-means on feature {} is complete, "
                "output is available in {}".format(k, args.feature, args.output_filename))

def cluster_by_dbscan(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data, \
        get_raw_simhash, get_tf_simhash
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.cluster.dbscan import cluster_by_simhash_distance, \
        cluster_by_memento_datetime

    parser = argparse.ArgumentParser(
        description="Cluster the input using the DBSCAN algorithm.",
        prog="hc cluster dbscan"
    )

    parser.add_argument('--feature', dest='feature', default='tf-simhash',
        help='The feature on which to cluster the documents.'
    )

    parser.add_argument('--eps', dest='eps', default=0.5,
        help='The maximum distance between two samples for one to be considered as in the neighborhood of the other. See: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html'
    )

    parser.add_argument('--min-samples', dest='min_samples', default=5,
        help="The number of samples in a neighborhood for a point to be considered as a core point. See: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html"
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Beginning the clustering of the collection by DBSCAN...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("There were {} mementos discovered in the input".format(
        len(urimdata)))

    if args.feature == "raw-simhash":
        logger.info("Clustering URI-Ms by Raw Simhash")
        urimdata = cluster_by_simhash_distance(
            urimdata, args.cache_storage,
            simhash_function=get_raw_simhash,
            min_samples=int(args.min_samples),
            eps=float(args.eps))
    elif args.feature == "tf-simhash":
        logger.info("Clustering URI-Ms by Term Frequency Simhash")
        urimdata = cluster_by_simhash_distance(
            urimdata, args.cache_storage,
            simhash_function=get_tf_simhash,
            min_samples=int(args.min_samples),
            eps=float(args.eps))
    elif args.feature == "memento-datetime":
        logger.info("Clustering URI-Ms by Memento-Datetime")
        urimdata = cluster_by_memento_datetime(
            urimdata, args.cache_storage,
            min_samples=int(args.min_samples),
            eps=float(args.eps))
    else:
        raise NotImplementedError(
            "Clustering feature of {} not yet supported.".format(args.feature))

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       list(urimdata.keys()))

    logger.info("Clustering of collection via DBSCAN on feature {} is complete, output is in {}".format(
        args.feature, args.output_filename))

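# A standalone scikit-learn sketch (points invented) of how --eps and
# --min-samples above drive DBSCAN: points with enough neighbors within eps
# form clusters, everything else is labeled -1 (noise).
import numpy as np
from sklearn.cluster import DBSCAN

points = np.array([[0.0], [0.1], [0.2], [5.0], [5.1], [9.9]])
labels = DBSCAN(eps=0.5, min_samples=2).fit(points).labels_
# labels -> [0, 0, 0, 1, 1, -1]; the lone point at 9.9 is noise
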
def combine_files(args):

    import argparse
    import csv
    from hypercane.actions import get_logger, calculate_loglevel, \
        process_input_args

    parser = argparse.ArgumentParser(
        description="Combine the output from several Hypercane commands into one TSV file.",
        prog="hc synthesize combine"
    )

    parser.add_argument('--append-files', dest='append_files',
        help='the Hypercane files to append to the file specified by the -a command',
        nargs='*'
    )

    args = process_input_args(args, parser)

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting combination of files from input")

    if args.input_type == 'archiveit':
        msg = "Input type archiveit not yet implemented, choose mementos, timemaps, or original-resources instead"
        logger.exception(msg)
        raise NotImplementedError(msg)

    allfiles = []
    allfiles.append(args.input_arguments)
    allfiles.extend(args.append_files)

    fieldnames = []

    for filename in allfiles:
        with open(filename) as g:
            csvreader = csv.reader(g, delimiter='\t')
            fieldnames.extend(next(csvreader))

    logger.info("detected fieldnames: {}".format(fieldnames))

    firstfield = None

    # every input file must share the same identifier column
    for input_field in ['URI-M', 'URI-T', 'URI-R']:
        if fieldnames.count(input_field) == len(allfiles):
            firstfield = input_field
            break
    else:
        msg = "All input files must contain the same input type, either mementos, timemaps, or original-resources"
        logger.critical(msg)
        raise RuntimeError(msg)

    output_fieldnames = list(set(fieldnames))
    output_fieldnames.remove(firstfield)
    output_fieldnames.insert(0, firstfield)

    with open(args.output_filename, 'w') as f:

        writer = csv.DictWriter(f, delimiter='\t', fieldnames=output_fieldnames)
        writer.writeheader()

        for filename in allfiles:
            with open(filename) as g:
                csvreader = csv.DictReader(g, delimiter='\t')
                for row in csvreader:
                    outputrow = {}
                    outputrow[firstfield] = row[firstfield]
                    for fieldname in output_fieldnames:
                        try:
                            outputrow[fieldname] = row[fieldname]
                        except KeyError:
                            outputrow[fieldname] = None
                    writer.writerow(outputrow)

    logger.info("Done combining files, output is in {}".format(args.output_filename))

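# A small standalone sketch (headers invented) of the header-merging rule in
# combine_files above: every input TSV must lead with the same identifier
# column (URI-M, URI-T, or URI-R), which is then moved to the front of the
# combined header.
fieldnames = ['URI-M', 'Score', 'URI-M', 'Cluster']  # headers from two files
number_of_files = 2

firstfield = None
for candidate in ['URI-M', 'URI-T', 'URI-R']:
    # the identifier must appear once per input file
    if fieldnames.count(candidate) == number_of_files:
        firstfield = candidate
        break

output_fieldnames = list(set(fieldnames))
output_fieldnames.remove(firstfield)
output_fieldnames.insert(0, firstfield)
# output_fieldnames -> ['URI-M', ...] with 'Score' and 'Cluster' preserved
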
def report_growth_curve_stats(args):

    import argparse
    import json
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_timemaps_by_input_type
    from hypercane.report.growth import get_last_memento_datetime, \
        get_first_memento_datetime, process_timemaps_for_mementos, \
        calculate_mementos_per_seed, calculate_memento_seed_ratio, \
        calculate_number_of_mementos, parse_data_for_mementos_list, \
        convert_mementos_list_into_mdts_pct_urim_pct_and_urir_pct, \
        draw_both_axes_pct_growth
    from sklearn.metrics import auc

    parser = argparse.ArgumentParser(
        description="Provide a report containing statistics on the growth of mementos derived from the input.",
        prog="hc report growth")

    parser.add_argument('--growth-curve-file',
        dest='growthcurve_filename',
        help="If present, draw a growth curve and write it to the filename specified.",
        default=None, required=False)

    args = process_input_args(args, parser)
    output_type = 'original-resources'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection growth statistics run")

    urits = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_timemaps_by_input_type)

    timemap_data, errors_data = process_timemaps_for_mementos(urits, session)
    mementos_list = parse_data_for_mementos_list(timemap_data)
    mdts_pct, urims_pct, urirs_pct = \
        convert_mementos_list_into_mdts_pct_urim_pct_and_urir_pct(mementos_list)

    output = {}
    output['auc_memento_curve'] = auc(mdts_pct, urims_pct)
    output['auc_seed_curve'] = auc(mdts_pct, urirs_pct)
    output['auc_memento_minus_diag'] = output['auc_memento_curve'] - 0.5
    output['auc_seed_minus_diag'] = output['auc_seed_curve'] - 0.5
    output['auc_seed_minus_auc_memento'] = \
        output['auc_seed_curve'] - output['auc_memento_curve']
    output['memento_seed_ratio'] = calculate_memento_seed_ratio(timemap_data)
    output['mementos_per_seed'] = calculate_mementos_per_seed(timemap_data)
    output['first_memento_datetime'] = get_first_memento_datetime(timemap_data)
    output['last_memento_datetime'] = get_last_memento_datetime(timemap_data)
    output['number_of_original_resources'] = len(urits)
    output['number_of_mementos'] = calculate_number_of_mementos(timemap_data)
    output['lifespan_secs'] = (
        get_last_memento_datetime(timemap_data) -
        get_first_memento_datetime(timemap_data)).total_seconds()
    output['lifespan_mins'] = output['lifespan_secs'] / 60
    output['lifespan_hours'] = output['lifespan_secs'] / 60 / 60
    output['lifespan_days'] = output['lifespan_secs'] / 60 / 60 / 24
    output['lifespan_weeks'] = output['lifespan_secs'] / 60 / 60 / 24 / 7
    output['lifespan_years'] = output['lifespan_secs'] / 60 / 60 / 24 / 365

    with open(args.output_filename, 'w') as report_file:
        # dtconverter is expected to be defined elsewhere in this module to
        # serialize the datetime values above for JSON output
        json.dump(output, report_file, indent=4, default=dtconverter)

    logger.info(
        "Done with collection growth statistics, report saved to {}".format(
            args.output_filename))

    if args.growthcurve_filename is not None:

        logger.info("Beginning to render collection growth curve...")

        draw_both_axes_pct_growth(mdts_pct, urims_pct, urirs_pct,
                                  args.growthcurve_filename)

        logger.info("Growth curve saved to {}".format(
            args.growthcurve_filename))

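# A standalone illustration (curve values invented) of the sklearn.metrics.auc
# calls above: auc integrates the growth curve with the trapezoidal rule, so
# 0.5 corresponds to the diagonal that the *_minus_diag statistics subtract.
from sklearn.metrics import auc

mdts_pct = [0.0, 0.5, 1.0]   # normalized memento-datetimes
urims_pct = [0.0, 0.8, 1.0]  # cumulative percentage of URI-Ms
area = auc(mdts_pct, urims_pct)  # 0.65 here; 0.65 - 0.5 = 0.15 above the diagonal
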
def report_ranked_terms(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    parser = argparse.ArgumentParser(
        description="Provide a report containing the terms from the collection and their associated frequencies.",
        prog="hc report terms")

    parser.add_argument('--ngram-length',
        help="The size of the n-grams",
        dest='ngram_length',
        default=1, type=int)

    parser.add_argument('--sumgrams', '--use-sumgrams',
        help="If specified, generate sumgrams rather than n-grams.",
        action='store_true',
        default=False,
        dest='use_sumgrams')

    parser.add_argument('--added-stopwords',
        help="If specified, add stopwords from this file.",
        dest='added_stopword_filename',
        default=None)

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection ranked terms run")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type)

    added_stopwords = []

    if args.added_stopword_filename is not None:
        with open(args.added_stopword_filename) as f:
            for line in f:
                added_stopwords.append(line.strip())

    if args.use_sumgrams is True:

        from hypercane.report.sumgrams import generate_sumgrams

        ranked_terms = generate_sumgrams(
            list(urimdata.keys()), args.cache_storage,
            added_stopwords=added_stopwords)

        with open(args.output_filename, 'w') as f:
            f.write("Term\tFrequency in Corpus\tTerm Rate\n")
            for term, frequency, term_rate in ranked_terms:
                f.write("{}\t{}\t{}\n".format(term, frequency, term_rate))

    else:

        from hypercane.report.terms import generate_ranked_terms

        ranked_terms = generate_ranked_terms(
            list(urimdata.keys()), args.cache_storage,
            ngram_length=args.ngram_length,
            added_stopwords=added_stopwords)

        with open(args.output_filename, 'w') as f:
            f.write("Term\tFrequency in Corpus\tProbability in Corpus\tDocument Frequency\tInverse Document Frequency\tCorpus TF-IDF\n")
            for term, frequency, probability, df, idf, tfidf in ranked_terms:
                f.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    term, frequency, probability, df, idf, tfidf))

    logger.info(
        "Done with collection term frequency report, output is in {}".format(
            args.output_filename))

def report_entities(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.report.entities import generate_entities

    parser = argparse.ArgumentParser(
        description="Provide a report containing the named entities from the collection and their associated frequencies.",
        prog="hc report entities")

    default_entity_types = [
        'PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
        'WORK_OF_ART', 'LAW'
    ]

    parser.add_argument('--entity-types',
        help="The types of entities to report, from https://spacy.io/api/annotation#named-entities",
        dest='entity_types',
        default=default_entity_types,
        # accept a comma-separated list of entity types
        type=lambda s: [entity_type.strip() for entity_type in s.split(',')])

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection entity report run")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type)

    ranked_terms = generate_entities(
        list(urimdata.keys()), args.cache_storage, args.entity_types)

    with open(args.output_filename, 'w') as f:
        f.write("Entity\tFrequency in Corpus\tProbability in Corpus\tDocument Frequency\tInverse Document Frequency\tCorpus TF-IDF\n")
        for term, frequency, probability, df, idf, tfidf in ranked_terms:
            f.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                term, frequency, probability, df, idf, tfidf))

    logger.info(
        "Done with collection entity report, output is in {}".format(
            args.output_filename))

def synthesize_bpfree_files(args):

    import os
    import argparse
    import traceback
    import hypercane.errors
    from hashlib import md5
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session, get_boilerplate_free_content
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    parser = argparse.ArgumentParser(
        description="Save boilerplate-free copies of mementos as files from a web archive collection.",
        prog="hc synthesize bpfree-files"
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting generation of boilerplate-free files from input")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("discovered {} URI-Ms from the input".format(len(urimdata)))

    if not os.path.exists(args.output_directory):
        logger.info("Output directory {} does not exist, creating...".format(
            args.output_directory))
        os.makedirs(args.output_directory)

    # TODO: make this multithreaded
    with open("{}/metadata.tsv".format(args.output_directory), 'w') as metadatafile:

        for urim in urimdata.keys():

            try:
                bpfree = get_boilerplate_free_content(
                    urim, cache_storage=args.cache_storage)

                # name each output file after the MD5 hash of its URI-M
                m = md5()
                m.update(urim.encode('utf8'))
                urlhash = m.hexdigest()
                newfilename = urlhash + '.dat'

                logger.info("writing out data for URI-M {}".format(urim))

                with open("{}/{}".format(
                        args.output_directory, newfilename), 'wb') as newfile:
                    newfile.write(bpfree)

                metadatafile.write("{}\t{}\n".format(urim, newfilename))

            except Exception as exc:
                logger.exception(
                    'URI-M [{}] generated an exception: [{}], skipping...'.format(
                        urim, repr(exc)))
                hypercane.errors.errorstore.add(urim, traceback.format_exc())

    logger.info("Done generating directory of boilerplate-free files, output is at {}".format(
        args.output_directory))

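# One hedged way the "make this multithreaded" TODOs above could be
# approached with the standard library; process_urim is a hypothetical
# wrapper around the per-URI-M body of the loop, and nothing here is part
# of Hypercane itself.
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_concurrently(urims, process_urim, max_workers=4):
    # submit one task per URI-M and collect results (or exceptions) as they finish
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_urim, urim): urim for urim in urims}
        for future in as_completed(futures):
            urim = futures[future]
            try:
                results[urim] = future.result()
            except Exception as exc:
                results[urim] = exc  # caller can log this and record it in errorstore
    return results
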
def report_image_data(args):

    import argparse
    import json
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type, discover_original_resources_by_input_type
    from hypercane.report.imagedata import generate_image_data, \
        rank_images, output_image_data_as_jsonl

    parser = argparse.ArgumentParser(
        description="Provide a report on the images found in the mementos discovered in the input.",
        prog="hc report image-data")

    parser.add_argument('--use-urirs', required=False,
        dest='use_urirs', action='store_true',
        help="Regardless of headers, assume the input are URI-Rs and do not try to archive or convert them to URI-Ms."
    )

    parser.add_argument('--output-format', required=False,
        dest="output_format", default="json",
        help="Choose the output format, valid formats are json and jsonl")

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection image data run")

    if args.use_urirs:
        uridata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session,
            discover_original_resources_by_input_type)
    else:
        uridata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session,
            discover_mementos_by_input_type)

    if args.output_format == 'json':

        metadata = {}
        metadata['image data'] = generate_image_data(uridata, args.cache_storage)
        metadata['ranked data'] = rank_images(metadata['image data'])

        with open(args.output_filename, 'w') as metadata_file:
            json.dump(metadata, metadata_file, indent=4)

    elif args.output_format == 'jsonl':
        output_image_data_as_jsonl(uridata, args.output_filename,
                                   args.cache_storage)

    else:
        raise NotImplementedError(
            "Output format of {} not supported, choose json or jsonl".format(
                args.output_format))

    logger.info("Done with collection image data run, output is at {}".format(
        args.output_filename))

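# A standalone reminder (records invented) of the difference between the two
# output formats accepted above: 'json' writes a single document, while
# 'jsonl' writes one JSON object per line.
import json

records = [{"uri": "a", "images": 2}, {"uri": "b", "images": 0}]
as_json = json.dumps(records, indent=4)                 # one JSON array
as_jsonl = "\n".join(json.dumps(r) for r in records)    # one object per line
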