Example #1
def discover_timemaps(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_timemaps_by_input_type

    parser = argparse.ArgumentParser(
        description="Discover the timemaps in a web archive collection.",
        prog="hc identify timemaps")

    args = process_input_args(args, parser)
    output_type = 'timemaps'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting timemap discovery run.")
    logger.info("Using {} for cache storage".format(args.cache_storage))

    uritdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_timemaps_by_input_type)

    save_resource_data(args.output_filename, uritdata, 'timemaps',
                       list(uritdata.keys()))

    logger.info("Done with timemap discovery run. Output is in {}".format(
        args.output_filename))
Example #2
def start_language_processing(parser, args):

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    parser.add_argument('--lang', '--languages', dest='languages',
        help="The list of languages to match, separated by commas.",
        required=True
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting filtering of mementos by languages...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    return args, logger, urimdata
Example #3
def exclude_rank(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.utils import get_web_session
    from hypercane.utils import save_resource_data

    parser = argparse.ArgumentParser(
        description="Include only mementos containing a score meeting the given criteria.",
        prog="hc filter include-only score"
    )

    parser.add_argument('--criteria', default=1, dest='criteria',
        help="The numeric criteria to use when selecting which values to keep."
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of documents meeting the criteria for score ...")

    session = get_web_session(cache_storage=args.cache_storage)

    # TODO: add a note about no crawling for this filter
    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, 1,
        session, discover_mementos_by_input_type
    )

    rankkey = extract_rank_key_from_input(urimdata)

    filtered_urims = []

    for urim in urimdata:
        # keep only the URI-Ms whose rank value does NOT satisfy the given
        # criteria, e.g. --criteria ">2" drops mementos whose rank exceeds 2
        if not eval("{}{}".format(
            urimdata[urim][rankkey], args.criteria
            )):
            filtered_urims.append(urim)

    logger.info("Saving {} filtered URI-Ms to {}".format(
        len(filtered_urims), args.output_filename))

    save_resource_data(
        args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("Done filtering mementos by scor, output is saved to {}".format(
        args.output_filename
    ))
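The rank key used above comes from a helper that is not shown in this listing. A rough sketch of what such a helper could look like, assuming the rank or score is stored under a column name containing "score" or "rank" (Hypercane's real extract_rank_key_from_input may differ):

def extract_rank_key_from_input_sketch(urimdata):
    # return the first column whose name looks like a score or rank field
    for urim, columns in urimdata.items():
        for key in columns:
            if "score" in key.lower() or "rank" in key.lower():
                return key
    raise KeyError("no score or rank column found in the input data")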
Example #4
def pubdate_else_memento_datetime(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.order.dsa1_publication_alg import order_by_dsa1_publication_alg

    parser = argparse.ArgumentParser(
        description=
        "Order by publication date first, fall back to memento-datetime.",
        prog="hc order pubdate_else_memento_datetime")

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    logger.info(
        "Starting ordering of the documents by the DSA1 publication algorithm..."
    )

    session = get_web_session(cache_storage=args.cache_storage)

    if args.input_type == "mementos":
        # urims = extract_uris_from_input(args.input_arguments)
        urimdata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session, discover_mementos_by_input_type)
    else:
        # TODO: derive URI-Ms from input type
        raise NotImplementedError(
            "Input type of {} not yet supported for ordering".format(
                args.input_type))

    logger.info("extracted {} mementos from input".format(len(
        urimdata.keys())))

    ordered_urims = order_by_dsa1_publication_alg(list(urimdata.keys()),
                                                  args.cache_storage)

    logger.info("placed {} mementos in order".format(len(ordered_urims)))

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       ordered_urims)

    logger.info("Finished ordering documents, output is at {}".format(
        args.output_filename))
Example #5
def synthesize_warcs(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    import hypercane.errors
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from warcio.warcwriter import WARCWriter
    from warcio.statusandheaders import StatusAndHeaders
    import os
    from datetime import datetime
    import otmt
    from hashlib import md5
    import traceback

    parser = argparse.ArgumentParser(
        description="Create WARCs from the mementos in a web archive collection.",
        prog="hc synthesize files"
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting generation of files from input")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("discovered {} URI-Ms from the input".format(len(urimdata)))

    if not os.path.exists(args.output_directory):
        logger.info("Output directory {} does not exist, creating...".format(args.output_directory))
        os.makedirs(args.output_directory)

    from hypercane.synthesize.warcs import synthesize_warc

    # TODO: make this multithreaded
    for urim in urimdata.keys():
        try:
            synthesize_warc(urim, session, args.output_directory)
        except Exception:
            logger.exception("failed to generate WARC for URI-M {}".format(urim))
            hypercane.errors.errorstore.add(urim, traceback.format_exc())

    logger.info("Done generating directory of files, output is at {}".format(args.output_directory))
Example #6
def report_seedstats(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_original_resources_by_input_type

    from hypercane.report.seedstats import calculate_domain_diversity, \
        calculate_path_depth_diversity, most_frequent_seed_uri_path_depth, \
        calculate_top_level_path_percentage, calculate_percentage_querystring

    import json

    parser = argparse.ArgumentParser(
        description=
        "Provide a report containing statistics on the original-resources derived from the input.",
        prog="hc report seed-statistics")

    args = process_input_args(args, parser)
    output_type = 'original-resources'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection original resource statistics run")

    urirs = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_original_resources_by_input_type)

    output = {}
    output['number of original-resources'] = len(urirs)
    output['domain diversity'] = calculate_domain_diversity(urirs)
    output['path depth diversity'] = calculate_path_depth_diversity(urirs)
    output['most frequent path depth'] = most_frequent_seed_uri_path_depth(
        urirs)
    output[
        'percentage of top-level URIs'] = calculate_top_level_path_percentage(
            urirs)
    output['query string percentage'] = calculate_percentage_querystring(urirs)

    with open(args.output_filename, 'w') as report_file:
        json.dump(output, report_file, indent=4)

    logger.info(
        "Done with collection original resource statistics report, output is in {}"
        .format(args.output_filename))
Example #7
def remove_offtopic(parser, args):

    import argparse
    from hypercane.actions import get_logger, calculate_loglevel
    from hypercane.utils import get_web_session
    from pymongo import MongoClient
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_timemaps_by_input_type, download_urits_and_extract_urims
    from hypercane.hfilter.remove_offtopic import detect_off_topic
    from hypercane.utils import save_resource_data

    args = process_remove_offtopic_args(args, parser)
    processing_type = 'timemaps'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of off-topic documents...")

    session = get_web_session(cache_storage=args.cache_storage)
    dbconn = MongoClient(args.cache_storage)

    if args.input_type == 'mementos':
        logger.warning(
            "Beware that an input type of 'mementos' may cause unexpected behavior. Specific mementos will be converted to TimeMaps and thus provide more mementos for consideration of off-topic analysis than were submitted."
        )

    uritdata = discover_resource_data_by_input_type(
        args.input_type, processing_type, args.input_arguments, args.crawl_depth,
        session, discover_timemaps_by_input_type
    )

    urits = list(uritdata.keys())
    urims = download_urits_and_extract_urims(urits, session)

    ontopic_mementos = detect_off_topic(
        dbconn, session, urits, urims, args.timemap_measures,
        num_topics=args.num_topics)

    logger.info("discovered {} on-topic mementos".format(len(ontopic_mementos)))

    # when reading in TimeMap URIs and writing out mementos, the urimdata will not match
    urimdata = {}
    for urim in ontopic_mementos:
        urimdata[urim] = {}

    save_resource_data(args.output_filename, urimdata, 'mementos', ontopic_mementos)

    logger.info("done with off-topic run, on-topic mementos are in {}".format(args.output_filename))
Example #8
def time_slice(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.cluster.time_slice import execute_time_slice

    parser = argparse.ArgumentParser(
        description="Cluster the input into slices based on memento-datetime.",
        prog="hc cluster time-slice"
    )

    parser.add_argument('-k', dest='k',
        default=None, type=int,
        help='The number of clusters to create.'
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Beginning time slicing of collection...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("There were {} mementos discovered in the input".format(len(urimdata)))

    urimdata_with_slices = execute_time_slice(
        urimdata, args.cache_storage, number_of_slices=args.k)

    # we use urimdata and urimdata_with_slices because they should match, if they don't we will detect an error
    save_resource_data(args.output_filename, urimdata_with_slices, 'mementos', list(urimdata.keys()))

    logger.info("finished time slicing, output is available at {}".format(args.output_filename))
Example #9
def image_count_scoring(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.score.image_count import score_by_image_count

    parser = argparse.ArgumentParser(
        description=
        "Score the input using the number of images detected in each memento.",
        prog="hc score image-count")

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Beginning the scoring by image count")

    if args.input_type == "mementos":
        urimdata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session, discover_mementos_by_input_type)
    else:
        # TODO: derive URI-Ms from input type
        raise NotImplementedError(
            "Input type of {} not yet supported for scoring".format(
                args.input_type))

    logger.info("using session {}".format(session))
    logger.info("using cache storage: {}".format(args.cache_storage))

    urimdata = score_by_image_count(urimdata, session)

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       list(urimdata.keys()))

    logger.info("Finished scoring by image count, output is at {}".format(
        args.output_filename))
Example #10
def include_urir(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.containing_urir import filter_by_urir
    from hypercane.utils import save_resource_data

    parser = argparse.ArgumentParser(
        description="Include only mementos with an original resource matching the given pattern.",
        prog="hc filter include-only containing-url-pattern"
    )

    parser.add_argument('--url-pattern', '--urir-pattern', dest='urir_pattern',
        help="The regular expression pattern of the URL to match (as Python regex)",
        required=True
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of mementos whose original resource URL matches pattern {}...".format(args.urir_pattern))

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, 1,
        session, discover_mementos_by_input_type
    )

    urims = list(urimdata.keys())

    filtered_urims = filter_by_urir(urims, args.cache_storage, args.urir_pattern)

    save_resource_data(args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("Completed detection of mementos whose original resource URL matches pattern {}, output is in {}".format(
        args.urir_pattern, args.output_filename
    ))
Example #11
def include_largest_clusters(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.largest_cluster import return_largest_clusters
    from hypercane.utils import save_resource_data

    parser = argparse.ArgumentParser(
        description="Include only mementos from the largest clusters. Input must contain cluster information. If two clusters have the same size, the first listed in the input is returned.",
        prog="hc filter include-only largest-cluster"
    )

    parser.add_argument('--cluster-count', default=1, dest='cluster_count',
        help="The number of clusters' worth of mementos to returned, sorted descending by cluster size."
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of mementos in the largest cluster...")

    session = get_web_session(cache_storage=args.cache_storage)

    # TODO: add a note about no crawling for this filter
    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, 1,
        session, discover_mementos_by_input_type
    )

    filtered_urims = return_largest_clusters(urimdata, int(args.cluster_count))

    logger.info("returning largest cluster with {} mementos".format(len(filtered_urims)))

    save_resource_data(args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("Completed detection of mementos in the largest cluster, output is in {}".format(
        args.output_filename
    ))
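return_largest_clusters is not shown in this listing. A hypothetical sketch of the selection logic, assuming each entry in urimdata carries a 'Cluster' column (the real implementation may use a different key):

from collections import Counter

def return_largest_clusters_sketch(urimdata, cluster_count):
    # count cluster sizes; most_common is stable, so ties keep input order
    cluster_sizes = Counter(data['Cluster'] for data in urimdata.values())
    keep = set(c for c, size in cluster_sizes.most_common(cluster_count))
    return [urim for urim, data in urimdata.items() if data['Cluster'] in keep]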
Example #12
def discover_collection_metadata(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_original_resources_by_input_type

    import json

    parser = argparse.ArgumentParser(
        description=
        "Discover the collection metadata in a web archive collection. Only Archive-It is supported at this time.",
        prog="hc report metadata")

    args = process_input_args(args, parser)
    output_type = 'original-resources'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection metadata discovery run.")

    if args.input_type == 'archiveit':
        metadata = generate_collection_metadata(args.input_arguments, session)
    else:
        logger.warning(
            "Metadata reports are only supported for Archive-It collections, proceeding to create JSON output for URI-Rs."
        )

        urirdata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session,
            discover_original_resources_by_input_type)
        metadata = generate_blank_metadata(list(urirdata.keys()))

    with open(args.output_filename, 'w') as metadata_file:
        json.dump(metadata, metadata_file, indent=4)

    logger.info("Done with collection metadata discovery run.")
Example #13
def include_highest_score_per_cluster(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.highest_rank_per_cluster import return_highest_ranking_memento_per_cluster
    from hypercane.utils import save_resource_data

    parser = argparse.ArgumentParser(
        description="Include only mementos with the highest score from each cluster.",
        prog="hc filter include-only highest-score-per-cluster"
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of mementos with the highest score in each cluster...")

    session = get_web_session(cache_storage=args.cache_storage)

    # TODO: add a note about no crawling for this filter
    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, 1,
        session, discover_mementos_by_input_type
    )

    rankkey = extract_rank_key_from_input(urimdata)

    logger.info("using score key {}".format(rankkey))

    filtered_urims = return_highest_ranking_memento_per_cluster(urimdata, rankkey)

    save_resource_data(args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("Completed detection of mementos with the highest score in each cluster, output is in {}".format(
        args.output_filename
    ))
Example #14
def cluster_by_urir(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.cluster.original_resource import cluster_by_urir

    parser = argparse.ArgumentParser(
        description="Cluster the input based on domain name.",
        prog="hc cluster domainname"
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Beginning original resource URI clustering of collection...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("There were {} mementos discovered in the input".format(len(urimdata)))

    urimdata_with_clusters = cluster_by_urir(urimdata, args.cache_storage)

    # we use urimdata and urimdata_with_clusters because they should match, if they don't we will detect an error
    save_resource_data(args.output_filename, urimdata_with_clusters, 'mementos', list(urimdata.keys()))

    logger.info("finished clustering by original resource URI, output is available at {}".format(args.output_filename))
Example #15
def bm25_ranking(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.score.bm25 import rank_by_bm25

    parser = argparse.ArgumentParser(
        description="Score the input using a query and the BM25 algorithm.",
        prog="hc score bm25")

    parser.add_argument('--query',
                        dest='query',
                        required=True,
                        help="The query to use with BM25")

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Beginning the scoring by BM25")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type)

    urimdata = rank_by_bm25(urimdata, session, args.query, args.cache_storage)

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       list(urimdata.keys()))

    logger.info("Finished scoring by BM25, output is at {}".format(
        args.output_filename))
Example #16
def start_containing_pattern(parser, args, include):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import save_resource_data, get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.containing_pattern import filter_pattern

    parser.add_argument('--pattern', dest='pattern_string',
        help="The regular expression pattern to match (as Python regex)",
        required=True
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    session = get_web_session(cache_storage=args.cache_storage)

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting filter of mementos containing pattern...")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    urims = list(urimdata.keys())

    filtered_urims = filter_pattern(
        urims, args.cache_storage, args.pattern_string, include
    )

    save_resource_data(
        args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("done filtering mementos by pattern, output is in {}".format(args.output_filename))
Example #17
def remove_near_duplicates(parser, args):

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.near_duplicates import filter_near_duplicates
    from hypercane.utils import save_resource_data

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting detection of near-duplicate mementos...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.debug("urimdata: {}".format(urimdata))

    urims = list(urimdata.keys())

    filtered_urims = filter_near_duplicates(urims, args.cache_storage)

    logger.info("writing {} to {}".format(filtered_urims, args.output_filename))

    save_resource_data(args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("Completed detection of near-duplicates, output is saved to {}".format(args.output_filename))
Example #18
def sample_with_true_random(args):

    from hypercane.sample.true_random import select_true_random
    from hypercane.actions import get_logger, calculate_loglevel
    from hypercane.utils import get_web_session, save_resource_data
    import hypercane.errors
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    args = sample_with_true_random_args(args)

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    if args.errorfilename is not None:
        hypercane.errors.errorstore.type = hypercane.errors.FileErrorStore(
            args.errorfilename)

    session = get_web_session(cache_storage=args.cache_storage)
    output_type = 'mementos'

    logger.info("Starting random sampling of URI-Ms.")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type)

    logger.info("Executing select true random algorithm")
    sampled_urims = select_true_random(list(urimdata.keys()),
                                       int(args.sample_count))

    logger.info("Writing sampled URI-Ms out to {}".format(
        args.output_filename))
    save_resource_data(args.output_filename, urimdata, 'mementos',
                       sampled_urims)

    logger.info("Done sampling.")
Example #19
def cluster_by_dbscan(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data, \
        get_raw_simhash, get_tf_simhash

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.cluster.dbscan import cluster_by_simhash_distance, \
        cluster_by_memento_datetime

    parser = argparse.ArgumentParser(
        description="Cluster the input using the dbscan algorithm.",
        prog="hc cluster dbscan"
    )

    parser.add_argument('--feature', dest='feature',
        default='tf-simhash',
        help='The feature in which to cluster the documents.'
    )

    parser.add_argument('--eps', dest='eps',
        default=0.5,
        help='The maximum distance between two samples for one to be considered as in the neighborhood of the other. See: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html'
    )

    parser.add_argument('--min-samples', dest='min_samples',
        default=5,
        help="The number of samples in a neighbordhood for a point to be considered as a core point. See: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html"
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Beginning the clustering of the collection by dbscan...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("There were {} mementos discovered in the input".format(len(urimdata)))

    if args.feature == "raw-simhash":
        logger.info("Clustering URI-Ms by Raw Simhash")
        urimdata = cluster_by_simhash_distance(
            urimdata, args.cache_storage,
            simhash_function=get_raw_simhash,
            min_samples=int(args.min_samples),
            eps=float(args.eps))

    elif args.feature == "tf-simhash":
        logger.info("Clustering URI-Ms by Term Frequency Simhash")
        urimdata = cluster_by_simhash_distance(
            urimdata, args.cache_storage,
            simhash_function=get_tf_simhash,
            min_samples=int(args.min_samples),
            eps=float(args.eps))

    elif args.feature == "memento-datetime":
        logger.info("Clustering URI-Ms by Memento-Datetime")
        urimdata = cluster_by_memento_datetime(
            urimdata, args.cache_storage,
            min_samples=int(args.min_samples),
            eps=float(args.eps))

    else:
        raise NotImplementedError("Clustering feature of {} not yet supported.".format(args.feature))

    save_resource_data(args.output_filename, urimdata, 'mementos', list(urimdata.keys()))

    logger.info("Clustering of collection via DBSCAN on feature {} is complete, output is in {}".format(args.feature, args.output_filename))
Example #20
def dsa1_scoring(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.score.dsa1_ranking import rank_by_dsa1_score

    parser = argparse.ArgumentParser(
        description="Score the input using the DSA1 scoring equation.",
        prog="hc score dsa1-scoring")

    parser.add_argument(
        '--memento-damage-url',
        dest='memento_damage_url',
        default=None,
        help="The URL of the Memento-Damage service to use for scoring.")

    parser.add_argument(
        '--damage-weight',
        dest='damage_weight',
        default=-0.40,
        type=float,
        help="The weight for the Memento-Damage score in the scoring.")

    parser.add_argument(
        '--category-weight',
        dest='category_weight',
        default=0.15,
        type=float,
        help="The weight for the URI-R category score in the scoring.")

    parser.add_argument(
        '--path-depth-weight',
        dest='path_depth_weight',
        default=0.45,
        type=float,
        help="The weight for the URI-R path depth score in the scoring.")

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Beginning the scoring by DSA1 scoring equation")

    if args.input_type == "mementos":
        urimdata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session, discover_mementos_by_input_type)
    else:
        # TODO: derive URI-Ms from input type
        raise NotImplementedError(
            "Input type of {} not yet supported for scoring".format(
                args.input_type))

    urimdata = rank_by_dsa1_score(urimdata,
                                  session,
                                  memento_damage_url=args.memento_damage_url,
                                  damage_weight=args.damage_weight,
                                  category_weight=args.category_weight,
                                  path_depth_weight=args.path_depth_weight)

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       list(urimdata.keys()))

    logger.info(
        "Finished ranking by DSA1 scoring equation, output is at {}".format(
            args.output_filename))
Example #21
def include_near_datetime(args):

    import argparse
    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hypercane.hfilter.near_datetime import filter_by_memento_datetime
    from hypercane.utils import save_resource_data
    from datetime import datetime

    parser = argparse.ArgumentParser(
        description="Include mementos whose memento-datetimes fall within the range of start-datetime and end-datetime.",
        prog="hc filter include-only near-datetime"
    )

    parser.add_argument('--start-datetime', '--lower-datetime',
        dest='lower_datetime',
        help="The lower bound datetime in YYYY-mm-ddTHH:MM:SS format.",
        required=True
    )

    parser.add_argument('--end-datetime', '--upper-datetime',
        dest='upper_datetime',
        help="The upper bound datetime in YYYY-mm-ddTHH:MM:SS format.",
        required=True
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Starting filtering of mementos by memento-datetime...")

    lower_datetime = datetime.strptime(
        args.lower_datetime,
        "%Y-%m-%dT%H:%M:%S"
    )

    upper_datetime = datetime.strptime(
        args.upper_datetime,
        "%Y-%m-%dT%H:%M:%S"
    )

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    urims = list(urimdata.keys())

    filtered_urims = filter_by_memento_datetime(
        urims, args.cache_storage, lower_datetime, upper_datetime)

    save_resource_data(args.output_filename, urimdata, 'mementos', filtered_urims)

    logger.info("done filtering mementos by memento-datetime, output is in {}".format(args.output_filename))
Example #22
def cluster_by_lda(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.cluster.lda import cluster_with_lda

    parser = argparse.ArgumentParser(
        description="Cluster the input based on LDA topic modeling with gensim.",
        prog="hc cluster lda"
    )

    # TODO: add argument for top scoring cluster (default) or all of them

    parser.add_argument('--num_topics', dest='num_topics',
        default=20, required=False, type=int,
        help='The number of topics to cluster.'
    )

    parser.add_argument('--num_passes', dest='num_passes',
        default=2, required=False, type=int,
        help='The number of passes through the corpus during training. This corresponds to the Gensim LDA setting of the same name. See: https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html'
    )

    parser.add_argument('--num_iterations', dest='num_iterations',
        default=50, required=False, type=int,
        help='The number of iterations through each document during training. This corresponds to the Gensim LDA setting of the same name. See: https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html'
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Beginning LDA clustering of collection...")

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("There were {} mementos discovered in the input".format(len(urimdata)))

    urimdata_with_slices = cluster_with_lda(urimdata, args.cache_storage, args.num_topics, args.num_iterations, args.num_passes)

    # we use urimdata and urimdata_with_slices because they should match, if they don't we will detect an error
    save_resource_data(args.output_filename, urimdata_with_slices, 'mementos', list(urimdata.keys()))

    logger.info("finished clustering with LDA, output is available at {}".format(args.output_filename))
Example #23
def discover_mementos(args):

    import argparse

    from datetime import datetime

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    parser = argparse.ArgumentParser(
        description="Discover the mementos in a web archive collection.",
        prog="hc identify mementos")

    parser.add_argument(
        '--accept-datetime',
        '--desired-datetime',
        default=None,
        required=False,
        dest='accept_datetime',
        help='(only for original resource input type)\n'
        'discover mementos closest to this datetime in YYYY-mm-ddTHH:MM:SS format',
        type=lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S'))

    parser.add_argument(
        '--timegates',
        default=[
            "https://timetravel.mementoweb.org/timegate/",
            "https://web.archive.org/web/"
        ],
        required=False,
        dest='timegates',
        help='(only for original resource input type)\n'
        'use the given TimeGate endpoints to discover mementos',
        type=lambda s: [i.strip() for i in s.split(',')])

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting memento discovery run.")

    logger.info("Using {} for cache storage".format(args.cache_storage))

    urimdata = discover_resource_data_by_input_type(
        args.input_type,
        output_type,
        args.input_arguments,
        args.crawl_depth,
        session,
        discover_mementos_by_input_type,
        accept_datetime=args.accept_datetime,
        timegates=args.timegates)

    logger.info(
        "discovered {} mementos, preparing to write the list to {}".format(
            len(urimdata), args.output_filename))

    save_resource_data(args.output_filename, urimdata, 'mementos',
                       list(urimdata.keys()))

    logger.info("Done with memento discovery run. Output is in {}".format(
        args.output_filename))
Example #24
def cluster_by_kmeans(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session, save_resource_data, \
        get_raw_simhash, get_tf_simhash

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.cluster.kmeans import cluster_by_memento_datetime

    parser = argparse.ArgumentParser(
        description="Cluster the input using the dbscan algorithm.",
        prog="hc cluster kmeans"
    )

    parser.add_argument('--feature', dest='feature',
        default='memento-datetime',
        help='The feature in which to cluster the documents.'
    )

    parser.add_argument('-k', dest='k',
        default=28, type=int,
        help='The number of clusters to create.'
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    logger.info("Beginning the clustering of the collection by K-means with feature {}...".format(args.feature))

    session = get_web_session(cache_storage=args.cache_storage)

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("There were {} mementos discovered in the input".format(len(urimdata)))

    k = args.k

    if len(urimdata) < args.k:
        k = len(urimdata)

    if args.feature == 'memento-datetime':
        urimdata = cluster_by_memento_datetime(urimdata, args.cache_storage, k)
    else:
        raise NotImplementedError("Clustering feature of {} not yet supported.".format(args.feature))

    save_resource_data(args.output_filename, urimdata, 'mementos', list(urimdata.keys()))

    logger.info("Clustering of collection into {} clusters via K-means on feature {} is complete,"
        "output is available in {}".format(args.k, args.feature, args.output_filename))
Example #25
def report_growth_curve_stats(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_timemaps_by_input_type

    from hypercane.report.growth import get_last_memento_datetime, \
        get_first_memento_datetime, process_timemaps_for_mementos, \
        calculate_mementos_per_seed, calculate_memento_seed_ratio, \
        calculate_number_of_mementos, parse_data_for_mementos_list, \
        convert_mementos_list_into_mdts_pct_urim_pct_and_urir_pct, \
        draw_both_axes_pct_growth

    import json

    from sklearn.metrics import auc

    parser = argparse.ArgumentParser(
        description=
        "Provide a report containing statistics on the growth of mementos derived from the input.",
        prog="hc report growth")

    parser.add_argument(
        '--growth-curve-file',
        dest='growthcurve_filename',
        help=
        "If present, draw a growth curve and write it to the filename specified.",
        default=None,
        required=False)

    args = process_input_args(args, parser)
    output_type = 'original-resources'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection original resource statistics run")

    urits = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_timemaps_by_input_type)

    timemap_data, errors_data = process_timemaps_for_mementos(urits, session)
    mementos_list = parse_data_for_mementos_list(timemap_data)
    mdts_pct, urims_pct, urirs_pct = \
        convert_mementos_list_into_mdts_pct_urim_pct_and_urir_pct(
        mementos_list)

    output = {}
    output['auc_memento_curve'] = auc(mdts_pct, urims_pct)
    output['auc_seed_curve'] = auc(mdts_pct, urirs_pct)
    output['auc_memento_minus_diag'] = output['auc_memento_curve'] - 0.5
    output['auc_seed_minus_diag'] = output['auc_seed_curve'] - 0.5
    output['auc_seed_minus_auc_memento'] = output['auc_seed_curve'] - output[
        'auc_memento_curve']
    output['memento_seed_ratio'] = calculate_memento_seed_ratio(timemap_data)
    output['mementos_per_seed'] = calculate_mementos_per_seed(timemap_data)
    output['first_memento_datetime'] = get_first_memento_datetime(timemap_data)
    output['last_memento_datetime'] = get_last_memento_datetime(timemap_data)
    output["number_of_original_resources"] = len(urits)
    output["number_of_mementos"] = calculate_number_of_mementos(timemap_data)
    output['lifespan_secs'] = (
        get_last_memento_datetime(timemap_data) -
        get_first_memento_datetime(timemap_data)).total_seconds()
    output['lifespan_mins'] = output['lifespan_secs'] / 60
    output['lifespan_hours'] = output['lifespan_secs'] / 60 / 60
    output['lifespan_days'] = output['lifespan_secs'] / 60 / 60 / 24
    output['lifespan_weeks'] = output['lifespan_secs'] / 60 / 60 / 24 / 7
    output['lifespan_years'] = output['lifespan_secs'] / 60 / 60 / 24 / 365

    with open(args.output_filename, 'w') as report_file:
        json.dump(output, report_file, indent=4, default=dtconverter)

    logger.info(
        "Done with collection growth statistics, report saved to {}".format(
            args.output_filename))

    if args.growthcurve_filename is not None:

        logger.info("Beginning to render collection growth curve...")

        draw_both_axes_pct_growth(mdts_pct, urims_pct, urirs_pct,
                                  args.growthcurve_filename)

        logger.info("Growth curve saved to {}".format(
            args.growthcurve_filename))
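The json.dump call above passes default=dtconverter, a helper that is not shown in this listing. A minimal sketch of such a converter, assuming it only needs to serialize datetime values:

from datetime import datetime

def dtconverter_sketch(o):
    # render datetimes as ISO 8601 strings so json.dump can handle them
    if isinstance(o, datetime):
        return o.isoformat()
    raise TypeError("Object of type {} is not JSON serializable".format(type(o)))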
Example #26
def report_entities(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    from hypercane.report.entities import generate_entities

    import json

    parser = argparse.ArgumentParser(
        description=
        "Provide a report containing the named entities from the collection and their associated frequencies.",
        prog="hc report entities")

    default_entity_types = [
        'PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
        'WORK_OF_ART', 'LAW'
    ]

    parser.add_argument(
        '--entity-types',
        help=
        "The types of entities to report, from https://spacy.io/api/annotation#named-entities",
        dest='entity_types',
        default=default_entity_types,
        type=lambda s: [i.strip() for i in s.split(',')])

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection image data run")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type)

    ranked_terms = generate_entities(list(urimdata.keys()), args.cache_storage,
                                     args.entity_types)

    with open(args.output_filename, 'w') as f:

        f.write(
            "Entity\tFrequency in Corpus\tProbability in Corpus\tDocument Frequency\tInverse Document Frequency\tCorpus TF-IDF\n"
        )

        for term, frequency, probability, df, idf, tfidf in ranked_terms:
            f.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(term, frequency,
                                                      probability, df, idf,
                                                      tfidf))

    logger.info(
        "Done with collection entity report, output is in {}".format(
            args.output_filename))
Example #27
def raintale_story(args):

    import argparse
    import json
    import sys
    import hypercane.actions
    from hypercane.actions import get_logger, calculate_loglevel, \
        process_input_args
    from hypercane.utils import get_web_session
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    parser = argparse.ArgumentParser(
        description="Generate a story suitable as input to Raintale.",
        prog="hc synthesize raintale-story"
    )

    parser.add_argument('--title', dest='title',
        help='The title of the story', required=False, default=None
    )

    parser.add_argument('--imagedata', dest='imagedata_filename',
        help='A file containing image data, as produced by hc report image-data',
        required=False, default=None
    )

    parser.add_argument('--termdata', dest='termdata_filename',
        help='A file containing term data, as produced by hc report terms',
        required=False, default=None
    )

    parser.add_argument('--term-count', dest='term_count',
        help='The number of top terms to select from the term data.',
        required=False, default=5, type=int
    )

    parser.add_argument('--entitydata', dest='entitydata_filename',
        help='A file containing entity data, as produced by hc report entities',
        required=False, default=None
    )

    parser.add_argument('--collection_metadata', dest='collection_metadata_filename',
        help='A file containing Archive-It collection metadata, as produced by hc report metadata',
        required=False, default=None
    )

    parser.add_argument('--entity-count', dest='entity_count',
        help='The number of top entities to select from the entity data.',
        required=False, default=5, type=int
    )

    parser.add_argument('--extradata', dest='extra_data',
        help='a JSON file containing extra data that will be included in the Raintale JSON, '
        'multiple filenames may follow this argument, '
        'the name of the file without the extension will be the JSON key', nargs='*',
        default=[]
    )

    args = hypercane.actions.process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting generation of files from input")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("discovered {} URI-Ms from the input".format(len(urimdata)))

    story_json = {
        'metadata': {}
    }

    if args.collection_metadata_filename is not None:
        with open(args.collection_metadata_filename) as f:
            jdata = json.load(f)

            if 'name' in jdata:
                story_json['title'] = jdata['name']

            for key in jdata:

                if key != 'seed_metadata':
                    story_json['metadata'][key] = jdata[key]

    if args.title is None:
        if args.collection_metadata_filename is None:
            logger.critical("Cannot continue, either supply a title with --title or a collection metadata file containing a title with --collection_metadata")
            sys.exit(255)
        else:
            # if we get here, the title should already be set
            pass
    else:
        story_json['title'] = args.title

        if args.title == "Archive-It Collection":

            if 'id' in jdata:
                story_json['title'] = args.title + " " + jdata['id']

            if 'name' in jdata:
                story_json['title'] = story_json['title'] + ': ' + jdata['name']


    story_json['elements'] = []

    if args.imagedata_filename is not None:
        with open(args.imagedata_filename) as f:
            jdata = json.load(f)
            story_json['story image'] = sorted(jdata['ranked data'], reverse=True)[0][-1]

    if args.termdata_filename is not None:
        import csv
        with open(args.termdata_filename) as f:
            reader = csv.DictReader(f, delimiter='\t')
            tf = []
            for row in reader:
                tf.append( ( int(row['Frequency in Corpus']), row['Term'] ) )

            story_json.setdefault('metadata', {})
            story_json['metadata']['terms'] = {}

            for term in sorted(tf, reverse=True)[0:args.term_count]:
                # story_json['metadata']['terms'].append(term[1])
                story_json['metadata'].setdefault('terms', {})
                story_json['metadata']['terms'][term[1]] = term[0]

    if args.entitydata_filename is not None:
        import csv
        with open(args.entitydata_filename) as f:
            reader = csv.DictReader(f, delimiter='\t')
            tf = []
            for row in reader:

                try:
                    tf.append( ( float(row['Corpus TF-IDF']), row['Entity'] ) )
                except TypeError:
                    logger.exception("row caused type error, skipping: {}".format(row))

            story_json.setdefault('metadata', {})
            story_json['metadata']['entities'] = {}

            for tfidf, entity in sorted(tf, reverse=True)[0:args.entity_count]:
                story_json['metadata']['entities'][entity] = tfidf

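    # each discovered URI-M becomes a "link" element in the story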
    for urim in urimdata.keys():

        story_element = {
            "type": "link",
            "value": urim
        }

        story_json['elements'].append(story_element)

    import os

    # merge any extra JSON files into the story under the 'extra' key,
    # using each file's base name, without its extension, as the JSON key
    for filename in args.extra_data:
        with open(filename) as f:
            edata = json.load(f)
            fname = os.path.splitext(os.path.basename(filename))[0]
            story_json.setdefault('extra', {})
            story_json['extra'][fname] = edata

    logger.info("Writing Raintale JSON out to {}".format(
        args.output_filename
    ))

    with open(args.output_filename, 'w') as f:
        json.dump(story_json, f, indent=4)

    logger.info("Done generating Raintale JSON output at {}".format(args.output_filename))
Example #28
0
def report_image_data(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type, discover_original_resources_by_input_type

    from hypercane.report.imagedata import generate_image_data, \
        rank_images, output_image_data_as_jsonl

    import json

    parser = argparse.ArgumentParser(
        description=
        "Provide a report on the images from in the mementos discovered in the input.",
        prog="hc report image-data")

    parser.add_argument(
        '--use-urirs',
        required=False,
        dest='use_urirs',
        action='store_true',
        help=
        "Regardless of headers, assume the input are URI-Rs and do not try to archive or convert them to URI-Ms."
    )

    parser.add_argument(
        '--output-format',
        required=False,
        dest="output_format",
        default="json",
        help="Choose the output format, valid formats are JSON and JSONL")

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection image data run")

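    # with --use-urirs the input is treated as original resources (URI-Rs);
    # otherwise mementos (URI-Ms) are discovered from the input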
    if args.use_urirs:
        uridata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session,
            discover_original_resources_by_input_type)
    else:
        uridata = discover_resource_data_by_input_type(
            args.input_type, output_type, args.input_arguments,
            args.crawl_depth, session, discover_mementos_by_input_type)

    if args.output_format == 'json':

        metadata = {}
        metadata['image data'] = generate_image_data(uridata,
                                                     args.cache_storage)
        metadata['ranked data'] = rank_images(metadata['image data'])

        with open(args.output_filename, 'w') as metadata_file:
            json.dump(metadata, metadata_file, indent=4)

    elif args.output_format == 'jsonl':
        output_image_data_as_jsonl(uridata, args.output_filename,
                                   args.cache_storage)

    logger.info("Done with collection image data run, output is at {}".format(
        args.output_filename))
Example #29
0
def synthesize_bpfree_files(args):

    import os
    import argparse
    import hypercane.errors
    from hypercane.actions import process_input_args, get_logger, calculate_loglevel
    from hypercane.utils import get_web_session, get_boilerplate_free_content
    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type
    from hashlib import md5
    import otmt
    from justext import justext, get_stoplist
    import traceback

    parser = argparse.ArgumentParser(
        description="Save boilerplate-free copies of mementos as files from a web archive collection.",
        prog="hc synthesize bpfree-files"
    )

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__,
        calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile
    )

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting generation of boilerplate-free files from input")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type
    )

    logger.info("discovered {} URI-Ms from the input".format(len(urimdata)))

    if not os.path.exists(args.output_directory):
        logger.info("Output directory {} does not exist, creating...".format(args.output_directory))
        os.makedirs(args.output_directory)

    # TODO: make this multithreaded
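    # metadata.tsv records which output file holds the boilerplate-free text for each URI-M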
    with open("{}/metadata.tsv".format(args.output_directory), 'w') as metadatafile:

        for urim in urimdata.keys():

            try:

                bpfree = get_boilerplate_free_content(urim, cache_storage=args.cache_storage)

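                # name the output file after the MD5 hash of the URI-M so each memento
                # gets a unique, filesystem-safe filename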
                m = md5()
                m.update(urim.encode('utf8'))
                urlhash = m.hexdigest()
                newfilename = urlhash + '.dat'

                logger.info("writing out data for URI-M {}".format(urim))
                with open("{}/{}".format(
                    args.output_directory, newfilename), 'wb') as newfile:
                    newfile.write(bpfree)

                metadatafile.write("{}\t{}\n".format(urim, newfilename))

            except Exception as exc:
                logger.exception('URI-M [{}] generated an exception: [{}], skipping...'.format(urim, repr(exc)))
                hypercane.errors.errorstore.add(urim, traceback.format_exc())

    logger.info("Done generating directory of boilerplate-free files, output is at {}".format(args.output_directory))
Example #30
0
def report_ranked_terms(args):

    import argparse

    from hypercane.actions import process_input_args, get_logger, \
        calculate_loglevel

    from hypercane.utils import get_web_session

    from hypercane.identify import discover_resource_data_by_input_type, \
        discover_mementos_by_input_type

    import json

    parser = argparse.ArgumentParser(
        description=
        "Provide a report containing the terms from the collection and their associated frequencies.",
        prog="hc report terms")

    parser.add_argument('--ngram-length',
                        help="The size of the n-grams",
                        dest='ngram_length',
                        default=1,
                        type=int)

    parser.add_argument(
        '--sumgrams',
        '--use-sumgrams',
        help="If specified, generate sumgrams rather than n-grams.",
        action='store_true',
        default=False,
        dest='use_sumgrams')

    parser.add_argument('--added-stopwords',
                        help="If specified, add stopwords from this file.",
                        dest='added_stopword_filename',
                        default=None)

    args = process_input_args(args, parser)
    output_type = 'mementos'

    logger = get_logger(
        __name__, calculate_loglevel(verbose=args.verbose, quiet=args.quiet),
        args.logfile)

    session = get_web_session(cache_storage=args.cache_storage)

    logger.info("Starting collection image data run")

    urimdata = discover_resource_data_by_input_type(
        args.input_type, output_type, args.input_arguments, args.crawl_depth,
        session, discover_mementos_by_input_type)

    added_stopwords = []

    if args.added_stopword_filename is not None:
        with open(args.added_stopword_filename) as f:
            for line in f:
                added_stopwords.append(line.strip())

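    # two reporting modes: sumgrams (conjoined n-grams produced by the sumgram approach)
    # or plain ranked n-grams with corpus TF-IDF statistics, each written as a TSV report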
    if args.use_sumgrams is True:

        from hypercane.report.sumgrams import generate_sumgrams
        from hypercane import package_directory

        ranked_terms = generate_sumgrams(list(urimdata.keys()),
                                         args.cache_storage,
                                         added_stopwords=added_stopwords)

        with open(args.output_filename, 'w') as f:

            f.write("Term\tFrequency in Corpus\tTerm Rate\n")

            for term, frequency, term_rate in ranked_terms:
                f.write("{}\t{}\t{}\n".format(term, frequency, term_rate))

    else:
        from hypercane.report.terms import generate_ranked_terms

        ranked_terms = generate_ranked_terms(list(urimdata.keys()),
                                             args.cache_storage,
                                             ngram_length=args.ngram_length,
                                             added_stopwords=added_stopwords)

        with open(args.output_filename, 'w') as f:

            f.write(
                "Term\tFrequency in Corpus\tProbability in Corpus\tDocument Frequency\tInverse Document Frequency\tCorpus TF-IDF\n"
            )

            for term, frequency, probability, df, idf, tfidf in ranked_terms:
                f.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    term, frequency, probability, df, idf, tfidf))

    logger.info(
        "Done with collection term frequency report, output is in {}".format(
            args.output_filename))