Ejemplo n.º 1
0
def excise(args, parser):
    logger = colorlog.getLogger('tacl')
    tokenizer = utils.get_tokenizer(args)
    corpus = tacl.Corpus(args.corpus, tokenizer)
    with open(args.ngrams) as fh:
        ngrams = [line.strip() for line in fh.readlines()]
    # It is no issue if the output directory already exists; it is a
    # reasonable use case to create an excised corpus from multiple
    # excise operations.
    try:
        os.mkdir(args.output)
    except FileExistsError:
        pass
    for work in args.works:
        # It is worth warning about writing in existing work
        # directories, however, since that might be unintended. Do not
        # prevent this, however, since it is a reasonable use case.
        try:
            os.mkdir(os.path.join(args.output, work))
        except FileExistsError:
            logger.warn(constants.EXCISE_OVERWRITE_WORK_WARNING.format(work))
        for witness in corpus.get_witnesses(work):
            path = os.path.join(args.output, witness.get_filename())
            content = witness.excise(ngrams, args.replacement)
            with open(path, 'w') as fh:
                fh.write(content)
Ejemplo n.º 2
0
def highlight_text(args, parser):
    """Outputs the result of highlighting a text."""
    tokenizer = utils.get_tokenizer(args)
    corpus = utils.get_corpus(args)
    output_dir = os.path.abspath(args.output)
    if os.path.exists(output_dir):
        parser.exit(status=3,
                    message='Output directory already exists, '
                    'aborting.\n')
    os.makedirs(output_dir, exist_ok=True)
    if args.ngrams:
        if args.label is None or len(args.label) != len(args.ngrams):
            parser.error('There must be as many labels as there are files '
                         'of n-grams')
        report = tacl.NgramHighlightReport(corpus, tokenizer)
        ngrams = []
        for ngram_file in args.ngrams:
            ngrams.append(utils.get_ngrams(ngram_file))
        minus_ngrams = []
        if args.minus_ngrams:
            minus_ngrams = utils.get_ngrams(args.minus_ngrams)
        report.generate(args.output, args.base_name, ngrams, args.label,
                        minus_ngrams)
    else:
        report = tacl.ResultsHighlightReport(corpus, tokenizer)
        report.generate(args.output, args.base_name, args.results)
Ejemplo n.º 3
0
def results(args, parser):
    if args.results == '-':
        results_fh = io.TextIOWrapper(sys.stdin.buffer,
                                      encoding='utf-8',
                                      newline='')
    else:
        results_fh = open(args.results, 'r', encoding='utf-8', newline='')
    tokenizer = utils.get_tokenizer(args)
    results = tacl.Results(results_fh, tokenizer)
    if args.extend:
        corpus = tacl.Corpus(args.extend, tokenizer)
        results.extend(corpus)
    if args.bifurcated_extend:
        if not args.bifurcated_extend_size:
            parser.error('The bifurcated extend option requires that the '
                         '--max-be-count option also be supplied')
        corpus = tacl.Corpus(args.bifurcated_extend, tokenizer)
        results.bifurcated_extend(corpus, args.bifurcated_extend_size)
    if args.reduce:
        results.reduce()
    if args.reciprocal:
        results.reciprocal_remove()
    if args.excise:
        results.excise(args.excise)
    if args.zero_fill:
        corpus = tacl.Corpus(args.zero_fill, tokenizer)
        results.zero_fill(corpus)
    if args.ngrams:
        with open(args.ngrams, encoding='utf-8') as fh:
            ngrams = fh.read().split()
        results.prune_by_ngram(ngrams)
    if args.min_works or args.max_works:
        results.prune_by_work_count(args.min_works, args.max_works)
    if args.min_size or args.max_size:
        results.prune_by_ngram_size(args.min_size, args.max_size)
    if args.min_count or args.max_count:
        results.prune_by_ngram_count(args.min_count, args.max_count)
    if args.min_count_work or args.max_count_work:
        results.prune_by_ngram_count_per_work(args.min_count_work,
                                              args.max_count_work)
    if args.remove:
        results.remove_label(args.remove)
    if args.sort:
        results.sort()
    # Run format-changing operations last.
    if args.add_label_count:
        results.add_label_count()
    if args.add_label_work_count:
        results.add_label_work_count()
    if args.group_by_ngram:
        catalogue = tacl.Catalogue()
        catalogue.load(args.group_by_ngram)
        results.group_by_ngram(catalogue.ordered_labels)
    if args.group_by_witness:
        results.group_by_witness()
    if args.collapse_witnesses:
        results.collapse_witnesses()
    results.csv(sys.stdout)
Ejemplo n.º 4
0
def align_results(args, parser):
    if args.results == "-":
        results = io.TextIOWrapper(sys.stdin.buffer, encoding="utf-8", newline="")
    else:
        results = open(args.results, "r", encoding="utf-8", newline="")
    tokenizer = utils.get_tokenizer(args)
    corpus = tacl.Corpus(args.corpus, tokenizer)
    report = tacl.SequenceReport(corpus, tokenizer, results)
    report.generate(args.output, args.minimum)
Ejemplo n.º 5
0
def ngram_diff(args, parser):
    """Outputs the results of performing a diff query."""
    store = utils.get_data_store(args)
    corpus = utils.get_corpus(args)
    catalogue = utils.get_catalogue(args.catalogue)
    tokenizer = utils.get_tokenizer(args)
    store.validate(corpus, catalogue)
    if args.asymmetric:
        store.diff_asymmetric(catalogue, args.asymmetric, tokenizer, sys.stdout)
    else:
        store.diff(catalogue, tokenizer, sys.stdout)
Ejemplo n.º 6
0
def align_results(args, parser):
    if args.results == '-':
        results = io.TextIOWrapper(sys.stdin.buffer,
                                   encoding='utf-8',
                                   newline='')
    else:
        results = open(args.results, 'r', encoding='utf-8', newline='')
    tokenizer = utils.get_tokenizer(args)
    corpus = tacl.Corpus(args.corpus, tokenizer)
    report = tacl.SequenceReport(corpus, tokenizer, results)
    report.generate(args.output, args.minimum)
Ejemplo n.º 7
0
def ngram_diff(args, parser):
    """Outputs the results of performing a diff query."""
    store = utils.get_data_store(args)
    corpus = utils.get_corpus(args)
    catalogue = utils.get_catalogue(args)
    tokenizer = utils.get_tokenizer(args)
    store.validate(corpus, catalogue)
    if args.asymmetric:
        store.diff_asymmetric(catalogue, args.asymmetric, tokenizer,
                              sys.stdout)
    else:
        store.diff(catalogue, tokenizer, sys.stdout)
Ejemplo n.º 8
0
def main():
    parser = generate_parser()
    args = parser.parse_args()
    if hasattr(args, 'verbose'):
        utils.configure_logging(args.verbose, logger)
    store = utils.get_data_store(args)
    corpus = utils.get_corpus(args)
    catalogue = utils.get_catalogue(args)
    tokenizer = utils.get_tokenizer(args)
    check_catalogue(catalogue, args.label)
    store.validate(corpus, catalogue)
    output_dir = os.path.abspath(args.output)
    if os.path.exists(output_dir):
        logger.warning('Output directory already exists; any results therein '
                       'will be reused rather than regenerated.')
    os.makedirs(output_dir, exist_ok=True)
    report = tacl.JitCReport(store, corpus, tokenizer)
    report.generate(output_dir, catalogue, args.label)
Ejemplo n.º 9
0
def main():
    parser = generate_parser()
    args = parser.parse_args()
    if hasattr(args, 'verbose'):
        utils.configure_logging(args.verbose, logger)
    store = utils.get_data_store(args)
    corpus = utils.get_corpus(args)
    catalogue = utils.get_catalogue(args.catalogue)
    tokenizer = utils.get_tokenizer(args)
    check_catalogue(catalogue, args.label)
    store.validate(corpus, catalogue)
    output_dir = os.path.abspath(args.output)
    if os.path.exists(output_dir):
        logger.warning('Output directory already exists; any results therein '
                       'will be reused rather than regenerated.')
    os.makedirs(output_dir, exist_ok=True)
    report = tacl.JitCReport(store, corpus, tokenizer)
    report.generate(output_dir, catalogue, args.label)
Ejemplo n.º 10
0
def results(args, parser):
    if args.results == "-":
        results_fh = io.TextIOWrapper(sys.stdin.buffer, encoding="utf-8", newline="")
    else:
        results_fh = open(args.results, "r", encoding="utf-8", newline="")
    tokenizer = utils.get_tokenizer(args)
    results = tacl.Results(results_fh, tokenizer)
    if args.extend:
        corpus = tacl.Corpus(args.extend, tokenizer)
        results.extend(corpus)
    if args.bifurcated_extend:
        if not args.bifurcated_extend_size:
            parser.error("The bifurcated extend option requires that the " "--max-be-count option also be supplied")
        corpus = tacl.Corpus(args.bifurcated_extend, tokenizer)
        results.bifurcated_extend(corpus, args.bifurcated_extend_size)
    if args.reduce:
        results.reduce()
    if args.reciprocal:
        results.reciprocal_remove()
    if args.zero_fill:
        if not args.catalogue:
            parser.error("The zero-fill option requires that the -c option " "also be supplied.")
        corpus = tacl.Corpus(args.zero_fill, tokenizer)
        catalogue = utils.get_catalogue(args.catalogue)
        results.zero_fill(corpus, catalogue)
    if args.ngrams:
        with open(args.ngrams, encoding="utf-8") as fh:
            ngrams = fh.read().split()
        results.prune_by_ngram(ngrams)
    if args.min_works or args.max_works:
        results.prune_by_work_count(args.min_works, args.max_works)
    if args.min_size or args.max_size:
        results.prune_by_ngram_size(args.min_size, args.max_size)
    if args.min_count or args.max_count:
        results.prune_by_ngram_count(args.min_count, args.max_count)
    if args.min_count_work or args.max_count_work:
        results.prune_by_ngram_count_per_work(args.min_count_work, args.max_count_work)
    if args.remove:
        results.remove_label(args.remove)
    if args.sort:
        results.sort()
    results.csv(sys.stdout)
Ejemplo n.º 11
0
def highlight_text(args, parser):
    """Outputs the result of highlighting a text."""
    tokenizer = utils.get_tokenizer(args)
    corpus = utils.get_corpus(args)
    output_dir = os.path.abspath(args.output)
    if os.path.exists(output_dir):
        parser.exit(status=3, message="Output directory already exists, " "aborting.\n")
    os.makedirs(output_dir, exist_ok=True)
    if args.ngrams:
        if args.label is None or len(args.label) != len(args.ngrams):
            parser.error("There must be as many labels as there are files " "of n-grams")
        report = tacl.NgramHighlightReport(corpus, tokenizer)
        ngrams = []
        for ngram_file in args.ngrams:
            ngrams.append(utils.get_ngrams(ngram_file))
        minus_ngrams = []
        if args.minus_ngrams:
            minus_ngrams = utils.get_ngrams(args.minus_ngrams)
        report.generate(args.output, args.base_name, ngrams, args.label, minus_ngrams)
    else:
        report = tacl.ResultsHighlightReport(corpus, tokenizer)
        report.generate(args.output, args.base_name, args.results)
Ejemplo n.º 12
0
def collapse_witnesses(args):
    results = tacl.Results(args.results, utils.get_tokenizer(args))
    results.collapse_witnesses()
    results.csv(sys.stdout)
Ejemplo n.º 13
0
def label_count(args):
    results = tacl.Results(args.results, utils.get_tokenizer(args))
    results.add_label_count()
    results.csv(sys.stdout)
Ejemplo n.º 14
0
def supplied_diff(args, parser):
    labels = args.labels
    results = args.supplied
    store = utils.get_data_store(args)
    tokenizer = utils.get_tokenizer(args)
    store.diff_supplied(results, labels, tokenizer, sys.stdout)
Ejemplo n.º 15
0
def generate_statistics(args, parser):
    corpus = utils.get_corpus(args)
    tokenizer = utils.get_tokenizer(args)
    report = tacl.StatisticsReport(corpus, tokenizer, args.results)
    report.generate_statistics()
    report.csv(sys.stdout)
Ejemplo n.º 16
0
def generate_statistics(args, parser):
    corpus = utils.get_corpus(args)
    tokenizer = utils.get_tokenizer(args)
    report = tacl.StatisticsReport(corpus, tokenizer, args.results)
    report.generate_statistics()
    report.csv(sys.stdout)
Ejemplo n.º 17
0
def supplied_diff(args, parser):
    labels = args.labels
    results = args.supplied
    store = utils.get_data_store(args)
    tokenizer = utils.get_tokenizer(args)
    store.diff_supplied(results, labels, tokenizer, sys.stdout)