Example #1
def discover_timemaps(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_timemaps_by_input_type

    parser = argparse.ArgumentParser(
        description="Discover the timemaps in a web archive collection.",
        prog="hc identify timemaps")

    args = process_input_args(args, parser)
    output_type = 'timemaps'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting timemap discovery run.")
    logger.info("Using {} for cache storage".format(args.cache_storage))

    uritdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_timemaps_by_input_type)

    save_resource_data(args.output_filename, uritdata, 'timemaps',
                       list(uritdata.keys()))

    logger.info("Done with timemap discovery run. Output is in {}".format(
        args.output_filename))
Example #2
def start_language_processing(parser, args):

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    parser.add_argument('--lang', '--languages', dest='languages',
        help="The list of languages to match, separated by commas.",
        required=True
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting filtering of mementos by languages...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    return args, logger, urimdata
Example #3
def exclude_rank(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.utils import get_web_session
    from hypercane.utils import save_resource_data

    parser = argparse.ArgumentParser(
        description="Include only mementos containing a score meeting the given criteria.",
        prog="hc filter include-only score"
    )

    parser.add_argument('--criteria', default=1, dest='criteria',
        help="The numeric criteria to use when selecting which values to keep."
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of documents meeting the criteria for score ...")

    session = get_web_session(cache_storage=args.cache_storage)

    # TODO: add a note about no crawling for this filter
    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, 1,
        session, discover_mementos_by_input_type
    )

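    # NOTE: extract_rank_key_from_input is not imported here; it is assumed to
    # be defined elsewhere in this module.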
    rankkey = extract_rank_key_from_input(urimdata)

    filtered_urims = []

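    # args.criteria is expected to be a comparison suffix (e.g. ">= 2"); each
    # score is concatenated with it and evaluated, and URI-Ms whose expression
    # is falsy (i.e. that do not meet the criteria) are kept.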
    for urim in urimdata:
        if not eval("{}{}".format(
            urimdata[urim][rankkey], args.criteria
            )):
            filtered_urims.append(urim)

    logger.info("Saving {} filtered URI-Ms to {}".format(
        len(filtered_urims), args.output_filename))

    save_resource_data(
        args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("Done filtering mementos by scor, output is saved to {}".format(
        args.output_filename
    ))
Example #4
def pubdate_else_memento_datetime(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.order.dsa1_publication_alg import order_by_dsa1_publication_alg

    parser = argparse.ArgumentParser(
        description=
        "Order by publication date first, fall back to memento-datetime.",
        prog="hc order pubdate_else_memento_datetime")

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    logger.info(
        "Starting ordering of the documents by the DSA1 publication algorithm..."
    )

    session = get_web_session(cache_storage=args.cache_storage)

    if args.input_type == "mementos":
        # urims = extract_uris_from_input(args.input_arguments)
        urimdata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session, discover_mementos_by_input_type)
    else:
        # TODO: derive URI-Ms from input type
        raise NotImplementedError(
            "Input type of {} not yet supported for ordering".format(
                args.input_type))

    logger.info("extracted {} mementos from input".format(len(
        urimdata.keys())))

    ordered_urims = order_by_dsa1_publication_alg(list(urimdata.keys()),
                                                  args.cache_storage)

    logger.info("placed {} mementos in order".format(len(ordered_urims)))

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       ordered_urims)

    logger.info("Finished ordering documents, output is at {}".format(
        args.output_filename))
Example #5
def report_seedstats(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_original_resources_by_input_type

    from hypercane.report.seedstats import calculate_domain_diversity, \
        calculate_path_depth_diversity, most_frequent_seed_uri_path_depth, \
        calculate_top_level_path_percentage, calculate_percentage_querystring

    import json

    parser = argparse.ArgumentParser(
        description=
        "Provide a report containing statistics on the original-resources derived from the input.",
        prog="hc report seed-statistics")

    args = process_input_args(args, parser)
    output_type = 'original-resources'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection original resource statistics run")

    urirs = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_original_resources_by_input_type)

    output = {}
    output['number of original-resources'] = len(urirs)
    output['domain diversity'] = calculate_domain_diversity(urirs)
    output['path depth diversity'] = calculate_path_depth_diversity(urirs)
    output['most frequent path depth'] = most_frequent_seed_uri_path_depth(
        urirs)
    output[
        'percentage of top-level URIs'] = calculate_top_level_path_percentage(
            urirs)
    output['query string percentage'] = calculate_percentage_querystring(urirs)

    with open(args.output_filename, 'w') as report_file:
        json.dump(output, report_file, indent=4)

    logger.info(
        "Done with collection original resource statistics report, output is in {}"
        .format(args.output_filename))
Example #6
def synthesize_warcs(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, calculate_loglevel
    import hypercane.errors
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from warcio.warcwriter import WARCWriter
    from warcio.statusandheaders import StatusAndHeaders
    import os
    from datetime import datetime
    import otmt
    from hashlib import md5
    import traceback

    parser = argparse.ArgumentParser(
        description="Create WARCs from the mementos in a web archive collection.",
        prog="hc synthesize files"
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting generation of files from input")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("discovered {} URI-Ms from the input".format(len(urimdata)))

    if not os.path.exists(args.output_directory):
        logger.info("Output directory {} does not exist, creating...".format(args.output_directory))
        os.makedirs(args.output_directory)

    from hypercane.synthesize.warcs import synthesize_warc

    # TODO: make this multithreaded
    for urim in urimdata.keys():
        try:
            synthesize_warc(urim, session, args.output_directory)
        except Exception:
            logger.exception("failed to generate WARC for URI-M {}".format(urim))
            hypercane.errors.errorstore.add(urim, traceback.format_exc())

    logger.info("Done generating directory of files, output is at {}".format(args.output_directory))
Example #7
def remove_offtopic(parser, args):

    import argparse
    from hypercane.actions import get_logger, calculate_loglevel
    from hypercane.utils import get_web_session
    from pymongo import MongoClient
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_timemaps_by_input_type, download_urits_and_extract_urims
    from hypercane.hfilter.remove_offtopic import detect_off_topic
    from hypercane.utils import save_resource_data

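    # NOTE: process_remove_offtopic_args is assumed to be defined elsewhere in
    # this module; it is not imported above.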
    args = process_remove_offtopic_args(args, parser)
    processing_type = 'timemaps'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of off-topic documents...")

    session = get_web_session(cache_storage=args.cache_storage)
    dbconn = MongoClient(args.cache_storage)

    if args.input_type == 'mementos':
        logger.warning(
            "Beware that an input type of 'mementos' may cause unexpected behavior. Specific mementos will be converted to TimeMaps and thus provide more mementos for consideration of off-topic analysis than were submitted."
        )

    uritdata = discover_resource_data_by_input_type(
        args.input_type, processing_type, args.input_arguments, args.crawl_depth,
        session, discover_timemaps_by_input_type
    )

    urits = list(uritdata.keys())
    urims = download_urits_and_extract_urims(urits, session)

    ontopic_mementos = detect_off_topic(
        dbconn, session, urits, urims, args.timemap_measures,
        num_topics=args.num_topics)

    logger.info("discovered {} on-topic mementos".format(len(ontopic_mementos)))

    # when reading in TimeMap URIs and writing out mementos, the urimdata will not match
    urimdata = {}
    for urim in ontopic_mementos:
        urimdata[urim] = {}

    save_resource_data(args.output_filename, urimdata, 'mementos', ontopic_mementos)

    logger.info("done with off-topic run, on-topic mementos are in {}".format(args.output_filename))
Example #8
def time_slice(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.cluster.time_slice import execute_time_slice

    parser = argparse.ArgumentParser(
        description="Cluster the input into slices based on memento-datetime.",
        prog="hc cluster time-slice"
    )

    parser.add_argument('-k', dest='k',
        default=None, type=int,
        help='The number of clusters to create.'
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Beginning time slicing of collection...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("There were {} mementos discovered in the input".format(len(urimdata)))

    urimdata_with_slices = execute_time_slice(
        urimdata, args.cache_storage, number_of_slices=args.k)

    # we use urimdata and urimdata_with_slices because they should match, if they don't we will detect an error
    save_resource_data(args.output_filename, urimdata_with_slices, 'mementos', list(urimdata.keys()))

    logger.info("finished time slicing, output is available at {}".format(args.output_filename))
Example #9
def image_count_scoring(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.score.image_count import score_by_image_count

    parser = argparse.ArgumentParser(
        description=
        "Score the input using the number of images detected in each memento.",
        prog="hc score image-count")

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Beginning the scoring by image count")

    if args.input_type == "mementos":
        urimdata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session, discover_mementos_by_input_type)
    else:
        # TODO: derive URI-Ms from input type
        raise NotImplementedError(
            "Input type of {} not yet supported for scoring".format(
                args.input_type))

    logger.info("using session {}".format(session))
    logger.info("using cache storage: {}".format(args.cache_storage))

    urimdata = score_by_image_count(urimdata, session)

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       list(urimdata.keys()))

    logger.info("Finished scoring by image count, output is at {}".format(
        args.output_filename))
Example #10
def include_urir(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.containing_urir import filter_by_urir
    from hypercane.utils import save_resource_data

    parser = argparse.ArgumentParser(
        description="Include only mementos with an original resource matching the given pattern.",
        prog="hc filter include-only containing-url-pattern"
    )

    parser.add_argument('--url-pattern', '--urir-pattern', dest='urir_pattern',
        help="The regular expression pattern of the URL to match (as Python regex)",
        required=True
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of mementos whose original resource URL matches pattern {}...".format(args.urir_pattern))

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, 1,
        session, discover_mementos_by_input_type
    )

    urims = list(urimdata.keys())

    filtered_urims = filter_by_urir(urims, args.cache_storage, args.urir_pattern)

    save_resource_data(args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("Completed detection of mementos whose original resource URL matches pattern {}, output is in {}".format(
        args.urir_pattern, args.output_filename
    ))
Example #11
def include_largest_clusters(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.largest_cluster import return_largest_clusters
    from hypercane.utils import save_resource_data

    parser = argparse.ArgumentParser(
        description="Include only mementos from the largest clusters. Input must contain cluster information. If two clusters have the same size, the first listed in the input is returned.",
        prog="hc filter include-only largest-cluster"
    )

    parser.add_argument('--cluster-count', default=1, dest='cluster_count',
        help="The number of clusters' worth of mementos to returned, sorted descending by cluster size."
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of mementos in the largest cluster...")

    session = get_web_session(cache_storage=args.cache_storage)

    # TODO: add a note about no crawling for this filter
    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, 1,
        session, discover_mementos_by_input_type
    )

    filtered_urims = return_largest_clusters(urimdata, int(args.cluster_count))

    logger.info("returning largest cluster with {} mementos".format(len(filtered_urims)))

    save_resource_data(args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("Completed detection of mementos in the largest cluster, output is in {}".format(
        args.output_filename
    ))
Example #12
def discover_collection_metadata(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_original_resources_by_input_type

    import json

    parser = argparse.ArgumentParser(
        description=
        "Discover the collection metadata in a web archive collection. Only Archive-It is supported at this time.",
        prog="hc report metadata")

    args = process_input_args(args, parser)
    output_type = 'original-resources'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection metadata discovery run.")

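    # NOTE: generate_collection_metadata and generate_blank_metadata are assumed
    # to be imported at module level; they are not imported in this function.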
    if args.input_type == 'archiveit':
        metadata = generate_collection_metadata(args.input_arguments, session)
    else:
        logger.warning(
            "Metadata reports are only supported for Archive-It collections, proceeding to create JSON output for URI-Rs."
        )

        urirdata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session,
            discover_original_resources_by_input_type)
        metadata = generate_blank_metadata(list(urirdata.keys()))

    with open(args.output_filename, 'w') as metadata_file:
        json.dump(metadata, metadata_file, indent=4)

    logger.info("Done with collection metadata discovery run.")
Example #13
def include_highest_score_per_cluster(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.highest_rank_per_cluster import return_highest_ranking_memento_per_cluster
    from hypercane.utils import save_resource_data

    parser = argparse.ArgumentParser(
        description="Include only mementos with the highest score from each cluster.",
        prog="hc filter include-only highest-score-per-cluster"
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of mementos with the highest score in each cluster...")

    session = get_web_session(cache_storage=args.cache_storage)

    # TODO: add a note about no crawling for this filter
    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, 1,
        session, discover_mementos_by_input_type
    )

    rankkey = extract_rank_key_from_input(urimdata)

    logger.info("using score key {}".format(rankkey))

    filtered_urims = return_highest_ranking_memento_per_cluster(urimdata, rankkey)

    save_resource_data(args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("Completed detection of mementos with the highest score in each cluster, output is in {}".format(
        args.output_filename
    ))
Example #14
def cluster_by_urir(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.cluster.original_resource import cluster_by_urir

    parser = argparse.ArgumentParser(
        description="Cluster the input based on domain name.",
        prog="hc cluster domainname"
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Beginning original resource URI clustering of collection...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("There were {} mementos discovered in the input".format(len(urimdata)))

    urimdata_with_clusters = cluster_by_urir(urimdata, args.cache_storage)

    # we use urimdata and urimdata_with_clusters because they should match, if they don't we will detect an error
    save_resource_data(args.output_filename, urimdata_with_clusters, 'mementos', list(urimdata.keys()))

    logger.info("finished clustering by original resource URI, output is available at {}".format(args.output_filename))
Example #15
def bm25_ranking(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.score.bm25 import rank_by_bm25

    parser = argparse.ArgumentParser(
        description="Score the input using a query and the BM25 algorithm.",
        prog="hc score bm25")

    parser.add_argument('--query',
                        dest='query',
                        required=True,
                        help="The query to use with BM25")

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Beginning the scoring by BM25")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type)

    urimdata = rank_by_bm25(urimdata, session, args.query, args.cache_storage)

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       list(urimdata.keys()))

    logger.info("Finished scoring by BM25, output is at {}".format(
        args.output_filename))
Example #16
def start_containing_pattern(parser, args, include):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import save_resource_data, get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.containing_pattern import filter_pattern

    parser.add_argument('--pattern', dest='pattern_string',
        help="The regular expression pattern to match (as Python regex)",
        required=True
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    session = get_web_session(cache_storage=args.cache_storage)

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting filter of mementos containing pattern...")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    urims = list(urimdata.keys())

    filtered_urims = filter_pattern(
        urims, args.cache_storage, args.pattern_string, include
    )

    save_resource_data(
        args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("done filtering mementos by pattern, output is in {}".format(args.output_filename))
Example #17
def remove_near_duplicates(parser, args):

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.near_duplicates import filter_near_duplicates
    from hypercane.utils import save_resource_data

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of near-duplicate mementos...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.debug("urimdata: {}".format(urimdata))

    urims = list(urimdata.keys())

    filtered_urims = filter_near_duplicates(urims, args.cache_storage)

    logger.info("writing {} to {}".format(filtered_urims, args.output_filename))

    save_resource_data(args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("Completed detection of near-duplicates, output is saved to {}".format(args.output_filename))
Example #18
def sample_with_true_random(args):

    import hypercane.errors
    from hypercane.sample.true_random import select_true_random
    from hypercane.actions import get_logger, calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

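    # NOTE: sample_with_true_random_args is assumed to be defined elsewhere in
    # this module; it parses the input arguments, including --sample-count.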
    args = sample_with_true_random_args(args)

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    if args.errorfilename is not None:
        hypercane.errors.errorstore.type = hypercane.errors.FileErrorStore(
            args.errorfilename)

    session = get_web_session(cache_storage=args.cache_storage)
    output_type = 'mementos'

    logger.info("Starting random sampling of URI-Ms.")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type)

    logger.info("Executing select true random algorithm")
    sampled_urims = select_true_random(list(urimdata.keys()),
                                       int(args.sample_count))

    logger.info("Writing sampled URI-Ms out to {}".format(
        args.output_filename))
    save_resource_data(args.output_filename, urimdata, 'mementos',
                       sampled_urims)

    logger.info("Done sampling.")
Example #19
def raintale_story(args):

    import argparse
    import json
    import sys
    import hypercane.actions
    from hypercane.actions import get_logger, calculate_loglevel, \
        process_input_args
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    parser = argparse.ArgumentParser(
        description="Generate a story suitable as input to Raintale.",
        prog="hc synthesize raintale-story"
    )

    parser.add_argument('--title', dest='title',
        help='The title of the story', required=False, default=None
    )

    parser.add_argument('--imagedata', dest='imagedata_filename',
        help='A file containing image data, as produced by hc report image-data',
        required=False, default=None
    )

    parser.add_argument('--termdata', dest='termdata_filename',
        help='A file containing term data, as produced by hc report terms',
        required=False, default=None
    )

    parser.add_argument('--term-count', dest='term_count',
        help='The number of top terms to select from the term data.',
        required=False, default=5, type=int
    )

    parser.add_argument('--entitydata', dest='entitydata_filename',
        help='A file containing term data, as produced by hc report entities',
        required=False, default=None
    )

    parser.add_argument('--collection_metadata', dest='collection_metadata_filename',
        help='A file containing Archive-It collection metadata, as produced by hc report metadata',
        required=False, default=None
    )

    parser.add_argument('--entity-count', dest='entity_count',
        help='The number of top entities to select from the entity data.',
        required=False, default=5, type=int
    )

    parser.add_argument('--extradata', dest='extra_data',
        help='a JSON file containing extra data that will be included in the Raintale JSON, '
        'multiple filenames may follow this argument, '
        'the name of the file without the extension will be the JSON key', nargs='*',
        default=[]
    )

    args = hypercane.actions.process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting generation of files from input")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("discovered {} URI-Ms from the input".format(len(urimdata)))

    story_json = {
        'metadata': {}
    }

    if args.collection_metadata_filename is not None:
        with open(args.collection_metadata_filename) as f:
            jdata = json.load(f)

            if 'name' in jdata:
                story_json['title'] = jdata['name']

            for key in jdata:

                if key != 'seed_metadata':
                    story_json['metadata'][key] = jdata[key]

    if args.title is None:
        if args.collection_metadata_filename is None:
            logger.critical("Cannot continue, either supply a title with --title or a collection metadata file containing a title with --collection_metadata")
            sys.exit(255)
        else:
            # if we get here, the title should already be set
            pass
    else:
        story_json['title'] = args.title

        if args.title == "Archive-It Collection":

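            # NOTE: this branch assumes jdata was loaded from the file given by
            # --collection_metadata above; without that file, jdata is undefined.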
            if 'id' in jdata:
                story_json['title'] = args.title + " " + jdata['id']

            if 'name' in jdata:
                story_json['title'] = story_json['title'] + ': ' + jdata['name']


    story_json['elements'] = []

    if args.imagedata_filename is not None:
        with open(args.imagedata_filename) as f:
            jdata = json.load(f)
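            # the last field of the highest-ranked entry in 'ranked data' is
            # assumed to be the image URI used as the story image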
            story_json['story image'] = sorted(jdata['ranked data'], reverse=True)[0][-1]

    if args.termdata_filename is not None:
        import csv
        with open(args.termdata_filename) as f:
            reader = csv.DictReader(f, delimiter='\t')
            tf = []
            for row in reader:
                tf.append( ( int(row['Frequency in Corpus']), row['Term'] ) )

            story_json.setdefault('metadata', {})
            story_json['metadata']['terms'] = {}

            for term in sorted(tf, reverse=True)[0:args.term_count]:
                # story_json['metadata']['terms'].append(term[1])
                story_json['metadata'].setdefault('terms', {})
                story_json['metadata']['terms'][term[1]] = term[0]

    if args.entitydata_filename is not None:
        import csv
        with open(args.entitydata_filename) as f:
            reader = csv.DictReader(f, delimiter='\t')
            tf = []
            for row in reader:

                try:
                    tf.append( ( float(row['Corpus TF-IDF']), row['Entity'] ) )
                except TypeError:
                    logger.exception("row caused type error, skipping: {}".format(row))

            story_json.setdefault('metadata', {})
            story_json['metadata']['entities'] = {}

            for entity in sorted(tf, reverse=True)[0:args.entity_count]:
                # story_json['metadata']['entities'].append(entity[1])
                story_json['metadata'].setdefault('entities', {})
                story_json['metadata']['entities'][entity[1]] = entity[0]

    for urim in urimdata.keys():

        story_element = {
            "type": "link",
            "value": urim
        }

        story_json['elements'].append(story_element)

    for filename in args.extra_data:
        with open(filename) as f:
            edata = json.load(f)
            fname = filename.rsplit('.', 1)[0]
            story_json.setdefault('extra', {})
            story_json['extra'][fname] = edata

    logger.info("Writing Raintale JSON out to {}".format(
        args.output_filename
    ))

    with open(args.output_filename, 'w') as f:
        json.dump(story_json, f, indent=4)

    logger.info("Done generating Raintale JSON output at {}".format(args.output_filename))
Example #20
def run_sample_with_dsa1(parser, args):

    import sys
    from sys import platform
    import errno

    if platform == "win32":
        print(
            "Error: AlNoamany's Algorithm can only be executed via `hc sample` on Linux or macOS. Please see documentation for how to execute it on Windows and submit an issue to our Issue Tracker if you need Windows support."
        )
        sys.exit(errno.ENOTSUP)

    import argparse
    import subprocess
    import os
    import shlex
    from datetime import datetime
    from hypercane.actions import add_input_args, add_default_args
    from hypercane.actions import get_logger, calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_timemaps_by_input_type

    parser = add_input_args(parser)

    parser = add_default_args(parser)

    runtime_string = "{}".format(datetime.now()).replace(' ', 'T')

    parser.add_argument(
        '--working-directory',
        required=False,
        help="the directory to which this application should write output",
        default="/tmp/hypercane/working/{}".format(runtime_string),
        dest='working_directory')

    args = parser.parse_args(args)

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    logger.info(
        "Executing DSA1 (AlNoamany's) algorithm with working directory {}".
        format(args.working_directory))

    os.makedirs(args.working_directory, exist_ok=True)

    scriptdir = os.path.dirname(os.path.realpath(__file__))

    algorithm_script = "{}/../packaged_algorithms/dsa1.sh".format(scriptdir)

    logger.info("executing algorithm script from {}".format(algorithm_script))

    if type(args.logfile) != str:
        args.logfile = ""

    cp = subprocess.run([
        "/bin/bash", algorithm_script, args.input_type, args.input_arguments,
        args.cache_storage, args.working_directory, args.output_filename,
        args.logfile
    ])

    if cp.returncode != 0:
        logger.critical(
            "An error was encountered while executing DSA1 (AlNoamany's) algorithm"
        )
    else:
        logger.info("Done executing DSA1 (AlNoamany's) algorithm")

    return args
Example #21
def run_sample_with(parser, args, algorithm_name, algorithm_script):

    import sys
    from sys import platform
    import errno

    if platform == "win32":
        print(
            "Error: AlNoamany's Algorithm can only be executed via `hc sample` on Linux or macOS. Please see documentation for how to execute it on Windows and submit an issue to our Issue Tracker if you need Windows support."
        )
        sys.exit(errno.ENOTSUP)

    import argparse
    import subprocess
    import os
    import shlex
    from datetime import datetime
    from hypercane.actions import add_input_args, add_default_args
    from hypercane.actions import get_logger, calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_timemaps_by_input_type

    parser = add_input_args(parser)

    parser = add_default_args(parser)

    runtime_string = "{}".format(datetime.now()).replace(' ', 'T')

    parser.add_argument(
        '--working-directory',
        required=False,
        help="the directory to which this application should write output",
        default="/tmp/hypercane/working/{}".format(runtime_string),
        dest='working_directory')

    args = parser.parse_args(args)

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    logger.info("Executing the {} algorithm with working directory {}".format(
        algorithm_name, args.working_directory))

    logger.info("Using cache storage of '{}'".format(args.cache_storage))

    os.makedirs(args.working_directory, exist_ok=True)

    logger.info("executing algorithm script from {}".format(algorithm_script))

    logger.info("args: {}".format(args))

    if type(args.logfile) != str:
        args.logfile = ""

    if args.errorfilename is None:
        args.errorfilename = ""

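    # gather any remaining parsed arguments (skipping False-valued flags) into
    # one quoted string so they can be forwarded to the algorithm shell script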
    other_arglist = []

    for argname, argvalue in vars(args).items():
        if argname not in [
                'input_type', 'input_arguments', 'cache_storage',
                'working_directory', 'output_filename', 'logfile',
                'errorfilename'
        ]:
            if argvalue is not False:
                other_arglist.append("--{} {}".format(
                    argname.replace('_', '-'), argvalue))

    other_args = '"' + " ".join(other_arglist) + '"'

    logger.info("using other arguments: {}".format(other_args))

    cp = subprocess.run([
        "/bin/bash", algorithm_script, args.input_type, args.input_arguments,
        args.cache_storage, args.working_directory, args.output_filename,
        args.logfile, args.errorfilename, other_args
    ])

    if cp.returncode != 0:
        logger.critical(
            "An error was encountered while executing the {} algorithm".format(
                algorithm_name))
    else:
        logger.info("Done executing the {} algorithm".format(algorithm_name))

    return args
Example #22
def dsa1_scoring(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.score.dsa1_ranking import rank_by_dsa1_score

    parser = argparse.ArgumentParser(
        description="Score the input using the DSA1 scoring equation.",
        prog="hc score dsa1-scoring")

    parser.add_argument(
        '--memento-damage-url',
        dest='memento_damage_url',
        default=None,
        help="The URL of the Memento-Damage service to use for scoring.")

    parser.add_argument(
        '--damage-weight',
        dest='damage_weight',
        default=-0.40,
        type=float,
        help="The weight for the Memento-Damage score in the scoring.")

    parser.add_argument(
        '--category-weight',
        dest='category_weight',
        default=0.15,
        type=float,
        help="The weight for the URI-R category score in the scoring.")

    parser.add_argument(
        '--path-depth-weight',
        dest='path_depth_weight',
        default=0.45,
        type=float,
        help="The weight for the URI-R path depth score in the scoring.")

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Beginning the scoring by DSA1 scoring equation")

    if args.input_type == "mementos":
        urimdata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session, discover_mementos_by_input_type)
    else:
        # TODO: derive URI-Ms from input type
        raise NotImplementedError(
            "Input type of {} not yet supported for scoring".format(
                args.input_type))

    urimdata = rank_by_dsa1_score(urimdata,
                                  session,
                                  memento_damage_url=args.memento_damage_url,
                                  damage_weight=args.damage_weight,
                                  category_weight=args.category_weight,
                                  path_depth_weight=args.path_depth_weight)

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       list(urimdata.keys()))

    logger.info(
        "Finished ranking by DSA1 scoring equation, output is at {}".format(
            args.output_filename))
Example #23
def report_metadatastats(args):

    import sys

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type, discover_original_resources_by_input_type

    from hypercane.report.metadatastats import get_pct_seeds_with_metadata, \
        get_pct_seeds_with_specific_field, get_pct_seeds_with_title, \
        get_pct_seeds_with_description, get_mean_default_field_score, \
        get_metadata_compression_ratio, get_mean_raw_field_count

    import json

    parser = argparse.ArgumentParser(
        description=
        "Discover the collection metadata in a web archive collection. Only Archive-It is supported at this time.",
        prog="hc report metadata")

    args = process_input_args(args, parser)
    output_type = 'original-resources'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection metadata statistics run.")

    output = {}

    if args.input_type == 'archiveit':
        metadata = generate_collection_metadata(args.input_arguments, session)
        output['id'] = metadata['id']
        output['archived since'] = metadata['archived_since']
        output['# of seeds'] = len(metadata['seed_metadata']['seeds'])
        output['% of seeds with any metadata'] = get_pct_seeds_with_metadata(
            metadata)
        output['% title field use'] = get_pct_seeds_with_title(metadata)
        output['% description field use'] = get_pct_seeds_with_description(
            metadata)
        output[
            'mean default fields metadata score'] = get_mean_default_field_score(
                metadata)
        output[
            'mean non-normalized metadata count'] = get_mean_raw_field_count(
                metadata)
        output['metadata compression ratio'] = get_metadata_compression_ratio(
            metadata)
    else:
        logger.critical(
            "Metadata statistics are only supported for Archive-It collections"
        )
        sys.exit(255)

    with open(args.output_filename, 'w') as report_file:
        json.dump(output, report_file, indent=4)

    logger.info("Done with collection metadata discovery run.")
Example #24
def discover_mementos(args):

    import argparse

    from datetime import datetime

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    parser = argparse.ArgumentParser(
        description="Discover the mementos in a web archive collection.",
        prog="hc identify mementos")

    parser.add_argument(
        '--accept-datetime',
        '--desired-datetime',
        default=None,
        required=False,
        dest='accept_datetime',
        help='(only for original resource input type)\n'
        'discover mementos closest to this datetime in YYYY-mm-ddTHH:MM:SS format',
        type=lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S'))

    parser.add_argument(
        '--timegates',
        default=[
            "https://timetravel.mementoweb.org/timegate/",
            "https://web.archive.org/web/"
        ],
        required=False,
        dest='timegates',
        help='(only for original resource input type)\n'
        'use the given TimeGate endpoints to discover mementos',
        type=lambda s: [i.strip() for i in s.split(',')])

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting memento discovery run.")

    logger.info("Using {} for cache storage".format(args.cache_storage))

    urimdata = discover_resource_data_by_input_type(
        args.input_type,
        output_type,
        args.input_arguments,
        args.crawl_depth,
        session,
        discover_mementos_by_input_type,
        accept_datetime=args.accept_datetime,
        timegates=args.timegates)

    logger.info(
        "discovered {} mementos, preparing to write the list to {}".format(
            len(urimdata), args.output_filename))

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       list(urimdata.keys()))

    logger.info("Done with memento discovery run. Output is in {}".format(
        args.output_filename))
Example #25
def report_image_data(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type, discover_original_resources_by_input_type

    from hypercane.report.imagedata import generate_image_data, \
        rank_images, output_image_data_as_jsonl

    import json

    parser = argparse.ArgumentParser(
        description=
        "Provide a report on the images from in the mementos discovered in the input.",
        prog="hc report image-data")

    parser.add_argument(
        '--use-urirs',
        required=False,
        dest='use_urirs',
        action='store_true',
        help=
        "Regardless of headers, assume the input are URI-Rs and do not try to archive or convert them to URI-Ms."
    )

    parser.add_argument(
        '--output-format',
        required=False,
        dest="output_format",
        default="json",
        help="Choose the output format, valid formats are JSON and JSONL")

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection image data run")

    if args.use_urirs:
        uridata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session,
            discover_original_resources_by_input_type)
    else:
        uridata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session, discover_mementos_by_input_type)

    if args.output_format == 'json':

        metadata = {}
        metadata['image data'] = generate_image_data(uridata,
                                                     args.cache_storage)
        metadata['ranked data'] = rank_images(metadata['image data'])

        with open(args.output_filename, 'w') as metadata_file:
            json.dump(metadata, metadata_file, indent=4)

    elif args.output_format == 'jsonl':
        output_image_data_as_jsonl(uridata, args.output_filename,
                                   args.cache_storage)

    logger.info("Done with collection image data run, output is at {}".format(
        args.output_filename))
Example #26
def report_ranked_terms(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    import json

    parser = argparse.ArgumentParser(
        description=
        "Provide a report containing the terms from the collection and their associated frequencies.",
        prog="hc report terms")

    parser.add_argument('--ngram-length',
                        help="The size of the n-grams",
                        dest='ngram_length',
                        default=1,
                        type=int)

    parser.add_argument(
        '--sumgrams',
        '--use-sumgrams',
        help="If specified, generate sumgrams rather than n-grams.",
        action='store_true',
        default=False,
        dest='use_sumgrams')

    parser.add_argument('--added-stopwords',
                        help="If specified, add stopwords from this file.",
                        dest='added_stopword_filename',
                        default=None)

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection image data run")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type)

    added_stopwords = []

    if args.added_stopword_filename is not None:
        with open(args.added_stopword_filename) as f:
            for line in f:
                added_stopwords.append(line.strip())

    if args.use_sumgrams is True:

        from hypercane.report.sumgrams import generate_sumgrams
        from hypercane import package_directory

        ranked_terms = generate_sumgrams(list(urimdata.keys()),
                                         args.cache_storage,
                                         added_stopwords=added_stopwords)

        with open(args.output_filename, 'w') as f:

            f.write("Term\tFrequency in Corpus\tTerm Rate\n")

            for term, frequency, term_rate in ranked_terms:
                f.write("{}\t{}\t{}\n".format(term, frequency, term_rate))

    else:
        from hypercane.report.terms import generate_ranked_terms

        ranked_terms = generate_ranked_terms(list(urimdata.keys()),
                                             args.cache_storage,
                                             ngram_length=args.ngram_length,
                                             added_stopwords=added_stopwords)

        with open(args.output_filename, 'w') as f:

            f.write(
                "Term\tFrequency in Corpus\tProbability in Corpus\tDocument Frequency\tInverse Document Frequency\tCorpus TF-IDF\n"
            )

            for term, frequency, probability, df, idf, tfidf in ranked_terms:
                f.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    term, frequency, probability, df, idf, tfidf))

    logger.info(
        "Done with collection term frequency report, output is in {}".format(
            args.output_filename))
Example #27
def combine_files(args):

    import argparse
    import json
    import csv

    from hypercane.actions import get_logger, calculate_loglevel, \
        process_input_args

    parser = argparse.ArgumentParser(
        description="Combine the output from several Hypercane commands into one TSV file.",
        prog="hc synthesize combine"
    )

    parser.add_argument('--append-files', dest='append_files',
        help='the Hypercane files to append to the file specified by the -a command',
        nargs='*'
    )

    args = process_input_args(args, parser)

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting combination of files from input")

    if args.input_type == 'archiveit':
        msg = "Input type archiveit not yet implemented, choose mementos, timemaps, or orignal-resources instead"
        logger.exception(msg)
        raise NotImplementedError(msg)

    allfiles = []
    allfiles.append( args.input_arguments )
    allfiles.extend( args.append_files )

    fieldnames = []

    for filename in allfiles:

        with open(filename) as g:

            csvreader = csv.reader(g, delimiter='\t')
            fieldnames.extend( next(csvreader) )

    logger.info("detected fieldnames: {}".format(fieldnames))

    firstfield = None

    # choose the key column (URI-M, URI-T, or URI-R) shared by every input file;
    # a column present in only some of the files means the input types are mixed
    for input_field in ['URI-M', 'URI-T', 'URI-R']:

        input_field_count = fieldnames.count(input_field)

        if input_field_count == len(allfiles):
            firstfield = input_field
            break
        elif input_field_count > 0:
            msg = "All input files must contain the same input type, either mementos, timemaps, or original-resources"
            logger.critical(msg)
            raise RuntimeError(msg)

    output_fieldnames = list(set(fieldnames))
    output_fieldnames.remove(firstfield)
    output_fieldnames.insert(0, firstfield)

    with open(args.output_filename, 'w') as f:

        writer = csv.DictWriter(f, delimiter='\t', fieldnames=output_fieldnames)
        writer.writeheader()

        for filename in allfiles:

            with open(filename) as g:

                csvreader = csv.DictReader(g, delimiter='\t')

                for row in csvreader:

                    outputrow = {}
                    outputrow[firstfield] = row[firstfield]

                    for fieldname in output_fieldnames:

                        try:
                            outputrow[fieldname] = row[fieldname]
                        except KeyError:
                            outputrow[fieldname] = None

                    writer.writerow(outputrow)

    logger.info("Writing new file to {}".format(args.output_filename))
Ejemplo n.º 28
0
def report_entities(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.report.entities import generate_entities

    import json

    parser = argparse.ArgumentParser(
        description="Provide a report containing the named entities from the collection and their associated frequencies.",
        prog="hc report entities")

    default_entity_types = [
        'PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
        'WORK_OF_ART', 'LAW'
    ]

    parser.add_argument(
        '--entity-types',
        help="The types of entities to report, from https://spacy.io/api/annotation#named-entities",
        dest='entity_types',
        default=default_entity_types,
        # accept a comma-separated list so command-line values match the list default
        type=lambda value: [entity_type.strip() for entity_type in value.split(',')])

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection image data run")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type)

    ranked_terms = generate_entities(list(urimdata.keys()), args.cache_storage,
                                     args.entity_types)

    with open(args.output_filename, 'w') as f:

        f.write(
            "Entity\tFrequency in Corpus\tProbability in Corpus\tDocument Frequency\tInverse Document Frequency\tCorpus TF-IDF\n"
        )

        for term, frequency, probability, df, idf, tfidf in ranked_terms:
            f.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(term, frequency,
                                                      probability, df, idf,
                                                      tfidf))

    logger.info(
        "Done with collection entity report, output is in {}".format(
            args.output_filename))
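
The entity labels accepted by --entity-types come from spaCy's named-entity annotation scheme. A minimal spaCy sketch, assuming the en_core_web_sm model is installed, showing the kind of labels generate_entities would be counting:

import spacy

# assumes the small English model is available:
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

doc = nlp("NASA launched the telescope from Cape Canaveral, Florida.")

# each entity carries one of the labels listed in default_entity_types above,
# for example ORG for "NASA" and GPE for "Florida"
for ent in doc.ents:
    print(ent.text, ent.label_)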
Ejemplo n.º 29
0
def synthesize_bpfree_files(args):

    import os
    import argparse
    from hypercane.actions import get_logger, calculate_loglevel, \
        process_input_args
    from hypercane.utils import get_web_session, get_boilerplate_free_content
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hashlib import md5
    import hypercane.errors
    import otmt
    from justext import justext, get_stoplist
    import traceback

    parser = argparse.ArgumentParser(
        description="Save boilerplate-free copies of mementos as files from a web archive collection.",
        prog="hc synthesize bpfree-files"
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting generation of boilerplate-free files from input")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("discovered {} URI-Ms from the input".format(len(urimdata)))

    if not os.path.exists(args.output_directory):
        logger.info("Output directory {} does not exist, creating...".format(args.output_directory))
        os.makedirs(args.output_directory)

    # TODO: make this multithreaded
    with open("{}/metadata.tsv".format(args.output_directory), 'w') as metadatafile:

        for urim in urimdata.keys():

            try:

                bpfree = get_boilerplate_free_content(urim, cache_storage=args.cache_storage)

                m = md5()
                m.update(urim.encode('utf8'))
                urlhash = m.hexdigest()
                newfilename = urlhash + '.dat'

                logger.info("writing out data for URI-M {}".format(urim))
                with open("{}/{}".format(
                    args.output_directory, newfilename), 'wb') as newfile:
                    newfile.write(bpfree)

                metadatafile.write("{}\t{}\n".format(urim, newfilename))

            except Exception as exc:
                logger.exception('URI-M [{}] generated an exception: [{}], skipping...'.format(urim, repr(exc)))
                hypercane.errors.errorstore.add(urim, traceback.format_exc())

    logger.info("Done generating directory of boilerplate-free files, output is at {}".format(args.output_directory))
Ejemplo n.º 30
0
def report_growth_curve_stats(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_timemaps_by_input_type

    from hypercane.report.growth import get_last_memento_datetime, \
        get_first_memento_datetime, process_timemaps_for_mementos, \
        calculate_mementos_per_seed, calculate_memento_seed_ratio, \
        calculate_number_of_mementos, parse_data_for_mementos_list, \
        convert_mementos_list_into_mdts_pct_urim_pct_and_urir_pct, \
        draw_both_axes_pct_growth

    import json
    import datetime

    from sklearn.metrics import auc

    def dtconverter(o):
        # convert datetime values so json.dump can serialize the report below
        if isinstance(o, datetime.datetime):
            return o.__str__()

    parser = argparse.ArgumentParser(
        description="Provide a report containing statistics on the growth of mementos derived from the input.",
        prog="hc report growth")

    parser.add_argument(
        '--growth-curve-file',
        dest='growthcurve_filename',
        help="If present, draw a growth curve and write it to the filename specified.",
        default=None,
        required=False)

    args = process_input_args(args, parser)
    output_type = 'original-resources'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection original resource statistics run")

    urits = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_timemaps_by_input_type)

    timemap_data, errors_data = process_timemaps_for_mementos(urits, session)
    mementos_list = parse_data_for_mementos_list(timemap_data)
    mdts_pct, urims_pct, urirs_pct = \
        convert_mementos_list_into_mdts_pct_urim_pct_and_urir_pct(mementos_list)

    output = {}
    output['auc_memento_curve'] = auc(mdts_pct, urims_pct)
    output['auc_seed_curve'] = auc(mdts_pct, urirs_pct)
    output['auc_memento_minus_diag'] = output['auc_memento_curve'] - 0.5
    output['auc_seed_minus_diag'] = output['auc_seed_curve'] - 0.5
    output['auc_seed_minus_auc_memento'] = output['auc_seed_curve'] - output[
        'auc_memento_curve']
    output['memento_seed_ratio'] = calculate_memento_seed_ratio(timemap_data)
    output['mementos_per_seed'] = calculate_mementos_per_seed(timemap_data)
    output['first_memento_datetime'] = get_first_memento_datetime(timemap_data)
    output['last_memento_datetime'] = get_last_memento_datetime(timemap_data)
    output["number_of_original_resources"] = len(urits)
    output["number_of_mementos"] = calculate_number_of_mementos(timemap_data)
    output['lifespan_secs'] = (
        get_last_memento_datetime(timemap_data) -
        get_first_memento_datetime(timemap_data)).total_seconds()
    output['lifespan_mins'] = output['lifespan_secs'] / 60
    output['lifespan_hours'] = output['lifespan_secs'] / 60 / 60
    output['lifespan_days'] = output['lifespan_secs'] / 60 / 60 / 24
    output['lifespan_weeks'] = output['lifespan_secs'] / 60 / 60 / 24 / 7
    output['lifespan_years'] = output['lifespan_secs'] / 60 / 60 / 24 / 365

    with open(args.output_filename, 'w') as report_file:
        json.dump(output, report_file, indent=4, default=dtconverter)

    logger.info(
        "Done with collection growth statistics, report saved to {}".format(
            args.output_filename))

    if args.growthcurve_filename is not None:

        logger.info("Beginning to render collection growth curve...")

        draw_both_axes_pct_growth(mdts_pct, urims_pct, urirs_pct,
                                  args.growthcurve_filename)

        logger.info("Growth curve saved to {}".format(
            args.growthcurve_filename))
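
The curve statistics above apply sklearn.metrics.auc to percentage axes, so a collection that grew at a perfectly steady rate traces the diagonal with an area of 0.5. A small sketch on toy data, illustrating why the report also stores the area minus 0.5:

from sklearn.metrics import auc

# toy normalized growth data: x is the memento datetime as a fraction of the
# collection lifespan, y is the fraction of URI-Ms observed by that point
mdts_pct = [0.0, 0.25, 0.5, 0.75, 1.0]
urims_pct = [0.0, 0.6, 0.8, 0.9, 1.0]

area = auc(mdts_pct, urims_pct)

# a collection growing at a constant rate would trace the diagonal (area 0.5),
# so area - 0.5 measures how front-loaded the collection's growth was
print(area, area - 0.5)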