Beispiel #1
0
def ngram_intersection(args, parser):
    """Run an intersection query and write its results to standard output.

    The data store and corpus are built from ``args`` via the ``utils``
    helpers, and the catalogue is loaded from ``args.catalogue``. The
    store is validated against the corpus and catalogue before the
    query is run.

    :param args: argument object handed to the ``utils`` helpers
    :param parser: not used by this function

    """
    data_store = utils.get_data_store(args)
    text_corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args.catalogue)
    # Validation comes first so the query runs only on a checked store.
    data_store.validate(text_corpus, cat)
    data_store.intersection(cat, sys.stdout)
Beispiel #2
0
def ngram_intersection(args, parser):
    """Run an intersection query and write its results to standard output.

    The data store, corpus and catalogue are all built from ``args``
    via the ``utils`` helpers. The store is validated against the
    corpus and catalogue before the query is run.

    :param args: argument object handed to the ``utils`` helpers
    :param parser: not used by this function

    """
    data_store = utils.get_data_store(args)
    text_corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args)
    # Validation comes first so the query runs only on a checked store.
    data_store.validate(text_corpus, cat)
    data_store.intersection(cat, sys.stdout)
Beispiel #3
0
def search_texts(args, parser):
    """Search for a set of n-grams and write the results to standard output.

    The data store, corpus and catalogue are built from ``args`` via
    the ``utils`` helpers and the store is validated against the corpus
    and catalogue. The n-grams to look for are then obtained from
    ``args.ngrams`` and passed to the store's ``search`` method.

    :param args: argument object handed to the ``utils`` helpers
    :param parser: not used by this function

    """
    data_store = utils.get_data_store(args)
    text_corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args)
    data_store.validate(text_corpus, cat)
    # The n-grams are loaded only after validation has succeeded.
    wanted_ngrams = utils.get_ngrams(args.ngrams)
    data_store.search(cat, wanted_ngrams, sys.stdout)
Beispiel #4
0
def generate_ngrams(args, parser):
    """Add n-grams from the corpus to the data store.

    The size bounds are taken from ``args.min_size`` and
    ``args.max_size``. A catalogue is loaded only when
    ``args.catalogue`` is truthy; otherwise ``None`` is passed to the
    store in its place.

    :param args: argument object handed to the ``utils`` helpers
    :param parser: not used by this function

    """
    data_store = utils.get_data_store(args)
    text_corpus = utils.get_corpus(args)
    # The catalogue is optional for this command.
    cat = utils.get_catalogue(args) if args.catalogue else None
    data_store.add_ngrams(text_corpus, args.min_size, args.max_size, cat)
Beispiel #5
0
def generate_ngrams(args, parser):
    """Add n-grams from the corpus to the data store.

    The size bounds are taken from ``args.min_size`` and
    ``args.max_size``. A catalogue is loaded from ``args.catalogue``
    only when that value is truthy; otherwise ``None`` is passed to the
    store in its place.

    :param args: argument object handed to the ``utils`` helpers
    :param parser: not used by this function

    """
    data_store = utils.get_data_store(args)
    text_corpus = utils.get_corpus(args)
    # The catalogue is optional for this command.
    cat = utils.get_catalogue(args.catalogue) if args.catalogue else None
    data_store.add_ngrams(text_corpus, args.min_size, args.max_size, cat)
Beispiel #6
0
def ngram_diff(args, parser):
    """Run a diff query and write its results to standard output.

    The catalogue is loaded from ``args.catalogue``. When
    ``args.asymmetric`` is truthy its value is passed to the store's
    ``diff_asymmetric`` method; otherwise the plain ``diff`` method is
    used.

    :param args: argument object handed to the ``utils`` helpers
    :param parser: not used by this function

    """
    data_store = utils.get_data_store(args)
    text_corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args.catalogue)
    tok = utils.get_tokenizer(args)
    data_store.validate(text_corpus, cat)
    if not args.asymmetric:
        data_store.diff(cat, tok, sys.stdout)
        return
    data_store.diff_asymmetric(cat, args.asymmetric, tok, sys.stdout)
Beispiel #7
0
def ngram_diff(args, parser):
    """Run a diff query and write its results to standard output.

    When ``args.asymmetric`` is truthy its value is passed to the
    store's ``diff_asymmetric`` method; otherwise the plain ``diff``
    method is used.

    :param args: argument object handed to the ``utils`` helpers
    :param parser: not used by this function

    """
    data_store = utils.get_data_store(args)
    text_corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args)
    tok = utils.get_tokenizer(args)
    data_store.validate(text_corpus, cat)
    if not args.asymmetric:
        data_store.diff(cat, tok, sys.stdout)
        return
    data_store.diff_asymmetric(cat, args.asymmetric, tok, sys.stdout)
Beispiel #8
0
def main():
    """Parse the command line and generate a JitC report.

    Logging is configured only when the parsed arguments carry a
    ``verbose`` attribute. The catalogue is checked against
    ``args.label`` and the store validated before the report is
    generated into the absolute form of ``args.output``. That directory
    is created if missing; if it already exists a warning is logged and
    it is used as is.

    """
    args = generate_parser().parse_args()
    if hasattr(args, 'verbose'):
        utils.configure_logging(args.verbose, logger)
    data_store = utils.get_data_store(args)
    text_corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args)
    tok = utils.get_tokenizer(args)
    check_catalogue(cat, args.label)
    data_store.validate(text_corpus, cat)
    target_dir = os.path.abspath(args.output)
    if os.path.exists(target_dir):
        logger.warning(
            'Output directory already exists; any results therein '
            'will be reused rather than regenerated.')
    # exist_ok keeps this from failing on the pre-existing directory.
    os.makedirs(target_dir, exist_ok=True)
    jitc = tacl.JitCReport(data_store, text_corpus, tok)
    jitc.generate(target_dir, cat, args.label)
Beispiel #9
0
def main():
    """Parse the command line and generate a JitC report.

    Logging is configured only when the parsed arguments carry a
    ``verbose`` attribute. The catalogue is loaded from
    ``args.catalogue`` and checked against ``args.label``, and the
    store validated, before the report is generated into the absolute
    form of ``args.output``. That directory is created if missing; if
    it already exists a warning is logged and it is used as is.

    """
    args = generate_parser().parse_args()
    if hasattr(args, 'verbose'):
        utils.configure_logging(args.verbose, logger)
    data_store = utils.get_data_store(args)
    text_corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args.catalogue)
    tok = utils.get_tokenizer(args)
    check_catalogue(cat, args.label)
    data_store.validate(text_corpus, cat)
    target_dir = os.path.abspath(args.output)
    if os.path.exists(target_dir):
        logger.warning(
            'Output directory already exists; any results therein '
            'will be reused rather than regenerated.')
    # exist_ok keeps this from failing on the pre-existing directory.
    os.makedirs(target_dir, exist_ok=True)
    jitc = tacl.JitCReport(data_store, text_corpus, tok)
    jitc.generate(target_dir, cat, args.label)
Beispiel #10
0
def validate_catalogue(args):
    """Check that the catalogue loads and that its works exist in the corpus.

    The catalogue is loaded from ``args.catalogue``. If loading raises
    ``MalformedCatalogueError`` the error is printed and the process
    exits with status 1. Otherwise every name obtained by iterating the
    catalogue is looked up in the corpus; each name for which the
    corpus yields no witnesses is reported, and the process exits with
    status 1 if any such name was found.

    :param args: argument object providing ``catalogue`` and handed to
                 ``utils.get_corpus``

    """
    try:
        cat = utils.get_catalogue(args.catalogue)
    except tacl.exceptions.MalformedCatalogueError as err:
        print("Error: {}".format(err))
        print("Other errors may be present; re-run this validation after "
              "correcting the above problem.")
        sys.exit(1)
    text_corpus = utils.get_corpus(args)
    missing_found = False
    for work_name in cat:
        # Only the existence of a first witness matters, so iteration
        # stops as soon as one is produced.
        if any(True for _ in text_corpus.get_witnesses(work_name)):
            continue
        missing_found = True
        print("Error: Catalogue references work {} that does not "
              "exist in the corpus".format(work_name))
    if missing_found:
        sys.exit(1)
Beispiel #11
0
def results(args, parser):
    """Apply the requested manipulations to a results set and write it as CSV.

    The results are read from ``args.results``, where ``"-"`` means
    standard input, decoded as UTF-8. Each truthy option on ``args``
    triggers the corresponding ``tacl.Results`` operation, in a fixed
    order, and the final results are written to standard output.

    Inconsistent option combinations are reported through
    ``parser.error`` before any input is read or any processing is
    done. A results file opened by this function is closed before
    returning, including on error; standard input is never closed here.

    :param args: argument object providing the options read below
    :param parser: parser whose ``error`` method reports invalid
                   option combinations

    """
    # Fail fast on invalid option combinations so no work is wasted.
    if args.bifurcated_extend and not args.bifurcated_extend_size:
        parser.error("The bifurcated extend option requires that the "
                     "--max-be-count option also be supplied")
    if args.zero_fill and not args.catalogue:
        parser.error("The zero-fill option requires that the -c option "
                     "also be supplied.")
    from_stdin = args.results == "-"
    if from_stdin:
        results_fh = io.TextIOWrapper(
            sys.stdin.buffer, encoding="utf-8", newline="")
    else:
        results_fh = open(args.results, "r", encoding="utf-8", newline="")
    try:
        tokenizer = utils.get_tokenizer(args)
        # Named result_set so the local does not shadow this function.
        result_set = tacl.Results(results_fh, tokenizer)
        if args.extend:
            corpus = tacl.Corpus(args.extend, tokenizer)
            result_set.extend(corpus)
        if args.bifurcated_extend:
            corpus = tacl.Corpus(args.bifurcated_extend, tokenizer)
            result_set.bifurcated_extend(corpus, args.bifurcated_extend_size)
        if args.reduce:
            result_set.reduce()
        if args.reciprocal:
            result_set.reciprocal_remove()
        if args.zero_fill:
            corpus = tacl.Corpus(args.zero_fill, tokenizer)
            catalogue = utils.get_catalogue(args.catalogue)
            result_set.zero_fill(corpus, catalogue)
        if args.ngrams:
            with open(args.ngrams, encoding="utf-8") as fh:
                ngrams = fh.read().split()
            result_set.prune_by_ngram(ngrams)
        if args.min_works or args.max_works:
            result_set.prune_by_work_count(args.min_works, args.max_works)
        if args.min_size or args.max_size:
            result_set.prune_by_ngram_size(args.min_size, args.max_size)
        if args.min_count or args.max_count:
            result_set.prune_by_ngram_count(args.min_count, args.max_count)
        if args.min_count_work or args.max_count_work:
            result_set.prune_by_ngram_count_per_work(
                args.min_count_work, args.max_count_work)
        if args.remove:
            result_set.remove_label(args.remove)
        if args.sort:
            result_set.sort()
        result_set.csv(sys.stdout)
    finally:
        # Close only the handle opened here; the standard input wrapper
        # is left alone so that sys.stdin is not closed underneath the
        # caller.
        if not from_stdin:
            results_fh.close()