def excise(args, parser):
    """Write excised copies of witnesses of works to an output directory.

    Each witness of each work in `args.works` has the n-grams listed in
    `args.ngrams` excised (replaced with `args.replacement`) and the
    result written under `args.output`/<work>/.

    :param args: parsed command-line arguments
    :param parser: argument parser (unused here; kept for CLI consistency)

    """
    logger = colorlog.getLogger('tacl')
    tokenizer = utils.get_tokenizer(args)
    corpus = tacl.Corpus(args.corpus, tokenizer)
    with open(args.ngrams) as fh:
        # Iterate the file object directly; readlines() is redundant.
        ngrams = [line.strip() for line in fh]
    # It is no issue if the output directory already exists; it is a
    # reasonable use case to create an excised corpus from multiple
    # excise operations.
    try:
        os.mkdir(args.output)
    except FileExistsError:
        pass
    for work in args.works:
        # It is worth warning about writing in existing work
        # directories, however, since that might be unintended. Do not
        # prevent this, however, since it is a reasonable use case.
        try:
            os.mkdir(os.path.join(args.output, work))
        except FileExistsError:
            # Logger.warn is a deprecated alias; use warning() instead.
            logger.warning(
                constants.EXCISE_OVERWRITE_WORK_WARNING.format(work))
        for witness in corpus.get_witnesses(work):
            path = os.path.join(args.output, witness.get_filename())
            content = witness.excise(ngrams, args.replacement)
            with open(path, 'w') as fh:
                fh.write(content)
def highlight_text(args, parser):
    """Outputs the result of highlighting a text."""
    tokenizer = utils.get_tokenizer(args)
    corpus = utils.get_corpus(args)
    out_dir = os.path.abspath(args.output)
    if os.path.exists(out_dir):
        parser.exit(status=3,
                    message='Output directory already exists, aborting.\n')
    os.makedirs(out_dir, exist_ok=True)
    if not args.ngrams:
        # No n-gram files supplied: highlight from a results file.
        report = tacl.ResultsHighlightReport(corpus, tokenizer)
        report.generate(args.output, args.base_name, args.results)
        return
    if args.label is None or len(args.label) != len(args.ngrams):
        parser.error('There must be as many labels as there are files '
                     'of n-grams')
    report = tacl.NgramHighlightReport(corpus, tokenizer)
    ngrams = [utils.get_ngrams(ngram_file) for ngram_file in args.ngrams]
    minus_ngrams = []
    if args.minus_ngrams:
        minus_ngrams = utils.get_ngrams(args.minus_ngrams)
    report.generate(args.output, args.base_name, ngrams, args.label,
                    minus_ngrams)
def results(args, parser):
    """Apply the requested manipulations to a results file, in order,
    and write the resulting CSV to stdout.

    Operations that change the output format are run last.

    """
    # '-' means read the results from standard input.
    if args.results == '-':
        fh = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', newline='')
    else:
        fh = open(args.results, 'r', encoding='utf-8', newline='')
    tokenizer = utils.get_tokenizer(args)
    res = tacl.Results(fh, tokenizer)
    if args.extend:
        res.extend(tacl.Corpus(args.extend, tokenizer))
    if args.bifurcated_extend:
        if not args.bifurcated_extend_size:
            parser.error('The bifurcated extend option requires that the '
                         '--max-be-count option also be supplied')
        res.bifurcated_extend(
            tacl.Corpus(args.bifurcated_extend, tokenizer),
            args.bifurcated_extend_size)
    if args.reduce:
        res.reduce()
    if args.reciprocal:
        res.reciprocal_remove()
    if args.excise:
        res.excise(args.excise)
    if args.zero_fill:
        res.zero_fill(tacl.Corpus(args.zero_fill, tokenizer))
    if args.ngrams:
        with open(args.ngrams, encoding='utf-8') as ngrams_fh:
            res.prune_by_ngram(ngrams_fh.read().split())
    if args.min_works or args.max_works:
        res.prune_by_work_count(args.min_works, args.max_works)
    if args.min_size or args.max_size:
        res.prune_by_ngram_size(args.min_size, args.max_size)
    if args.min_count or args.max_count:
        res.prune_by_ngram_count(args.min_count, args.max_count)
    if args.min_count_work or args.max_count_work:
        res.prune_by_ngram_count_per_work(args.min_count_work,
                                          args.max_count_work)
    if args.remove:
        res.remove_label(args.remove)
    if args.sort:
        res.sort()
    # Run format-changing operations last.
    if args.add_label_count:
        res.add_label_count()
    if args.add_label_work_count:
        res.add_label_work_count()
    if args.group_by_ngram:
        catalogue = tacl.Catalogue()
        catalogue.load(args.group_by_ngram)
        res.group_by_ngram(catalogue.ordered_labels)
    if args.group_by_witness:
        res.group_by_witness()
    if args.collapse_witnesses:
        res.collapse_witnesses()
    res.csv(sys.stdout)
def align_results(args, parser):
    """Generate a sequence report from a results file."""
    # A results value of "-" means read from standard input.
    if args.results == "-":
        fh = io.TextIOWrapper(sys.stdin.buffer, encoding="utf-8", newline="")
    else:
        fh = open(args.results, "r", encoding="utf-8", newline="")
    tokenizer = utils.get_tokenizer(args)
    corpus = tacl.Corpus(args.corpus, tokenizer)
    report = tacl.SequenceReport(corpus, tokenizer, fh)
    report.generate(args.output, args.minimum)
def ngram_diff(args, parser):
    """Outputs the results of performing a diff query."""
    data_store = utils.get_data_store(args)
    corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args.catalogue)
    tokenizer = utils.get_tokenizer(args)
    # Ensure the catalogue's works exist in the corpus before querying.
    data_store.validate(corpus, cat)
    if args.asymmetric:
        data_store.diff_asymmetric(cat, args.asymmetric, tokenizer,
                                   sys.stdout)
    else:
        data_store.diff(cat, tokenizer, sys.stdout)
def align_results(args, parser):
    """Produce a sequence alignment report from a results file."""
    # '-' selects standard input as the source of results.
    if args.results == '-':
        source = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8',
                                  newline='')
    else:
        source = open(args.results, 'r', encoding='utf-8', newline='')
    tokenizer = utils.get_tokenizer(args)
    work_corpus = tacl.Corpus(args.corpus, tokenizer)
    report = tacl.SequenceReport(work_corpus, tokenizer, source)
    report.generate(args.output, args.minimum)
def ngram_diff(args, parser):
    """Outputs the results of performing a diff query."""
    store = utils.get_data_store(args)
    work_corpus = utils.get_corpus(args)
    catalogue = utils.get_catalogue(args)
    tokenizer = utils.get_tokenizer(args)
    # Check the catalogue against the corpus before running the query.
    store.validate(work_corpus, catalogue)
    if args.asymmetric:
        store.diff_asymmetric(catalogue, args.asymmetric, tokenizer,
                              sys.stdout)
    else:
        store.diff(catalogue, tokenizer, sys.stdout)
def main():
    """Command-line entry point: generate a JitC report."""
    parser = generate_parser()
    args = parser.parse_args()
    if hasattr(args, 'verbose'):
        utils.configure_logging(args.verbose, logger)
    data_store = utils.get_data_store(args)
    corpus = utils.get_corpus(args)
    catalogue = utils.get_catalogue(args)
    tokenizer = utils.get_tokenizer(args)
    check_catalogue(catalogue, args.label)
    data_store.validate(corpus, catalogue)
    out_dir = os.path.abspath(args.output)
    if os.path.exists(out_dir):
        # Existing output is reused, not an error.
        logger.warning('Output directory already exists; any results therein '
                       'will be reused rather than regenerated.')
    os.makedirs(out_dir, exist_ok=True)
    report = tacl.JitCReport(data_store, corpus, tokenizer)
    report.generate(out_dir, catalogue, args.label)
def main():
    """Command-line entry point: generate a JitC report."""
    arg_parser = generate_parser()
    args = arg_parser.parse_args()
    if hasattr(args, 'verbose'):
        utils.configure_logging(args.verbose, logger)
    store = utils.get_data_store(args)
    corpus = utils.get_corpus(args)
    catalogue = utils.get_catalogue(args.catalogue)
    tokenizer = utils.get_tokenizer(args)
    check_catalogue(catalogue, args.label)
    store.validate(corpus, catalogue)
    output_dir = os.path.abspath(args.output)
    if os.path.exists(output_dir):
        # An existing output directory is not fatal; its contents are reused.
        logger.warning('Output directory already exists; any results therein '
                       'will be reused rather than regenerated.')
    os.makedirs(output_dir, exist_ok=True)
    tacl.JitCReport(store, corpus, tokenizer).generate(
        output_dir, catalogue, args.label)
def results(args, parser):
    """Apply the requested manipulations to a results file, in order,
    and write the resulting CSV to stdout.

    """
    # "-" means read the results from standard input.
    if args.results == "-":
        fh = io.TextIOWrapper(sys.stdin.buffer, encoding="utf-8", newline="")
    else:
        fh = open(args.results, "r", encoding="utf-8", newline="")
    tokenizer = utils.get_tokenizer(args)
    res = tacl.Results(fh, tokenizer)
    if args.extend:
        res.extend(tacl.Corpus(args.extend, tokenizer))
    if args.bifurcated_extend:
        if not args.bifurcated_extend_size:
            parser.error("The bifurcated extend option requires that the "
                         "--max-be-count option also be supplied")
        res.bifurcated_extend(
            tacl.Corpus(args.bifurcated_extend, tokenizer),
            args.bifurcated_extend_size)
    if args.reduce:
        res.reduce()
    if args.reciprocal:
        res.reciprocal_remove()
    if args.zero_fill:
        if not args.catalogue:
            parser.error("The zero-fill option requires that the -c option "
                         "also be supplied.")
        zf_corpus = tacl.Corpus(args.zero_fill, tokenizer)
        res.zero_fill(zf_corpus, utils.get_catalogue(args.catalogue))
    if args.ngrams:
        with open(args.ngrams, encoding="utf-8") as ngrams_fh:
            res.prune_by_ngram(ngrams_fh.read().split())
    if args.min_works or args.max_works:
        res.prune_by_work_count(args.min_works, args.max_works)
    if args.min_size or args.max_size:
        res.prune_by_ngram_size(args.min_size, args.max_size)
    if args.min_count or args.max_count:
        res.prune_by_ngram_count(args.min_count, args.max_count)
    if args.min_count_work or args.max_count_work:
        res.prune_by_ngram_count_per_work(args.min_count_work,
                                          args.max_count_work)
    if args.remove:
        res.remove_label(args.remove)
    if args.sort:
        res.sort()
    res.csv(sys.stdout)
def highlight_text(args, parser):
    """Outputs the result of highlighting a text."""
    tokenizer = utils.get_tokenizer(args)
    corpus = utils.get_corpus(args)
    output_dir = os.path.abspath(args.output)
    if os.path.exists(output_dir):
        parser.exit(status=3,
                    message="Output directory already exists, aborting.\n")
    os.makedirs(output_dir, exist_ok=True)
    if not args.ngrams:
        # Without n-gram files, highlight from a results file instead.
        report = tacl.ResultsHighlightReport(corpus, tokenizer)
        report.generate(args.output, args.base_name, args.results)
        return
    if args.label is None or len(args.label) != len(args.ngrams):
        parser.error("There must be as many labels as there are files "
                     "of n-grams")
    report = tacl.NgramHighlightReport(corpus, tokenizer)
    ngram_lists = [utils.get_ngrams(path) for path in args.ngrams]
    minus_ngrams = []
    if args.minus_ngrams:
        minus_ngrams = utils.get_ngrams(args.minus_ngrams)
    report.generate(args.output, args.base_name, ngram_lists, args.label,
                    minus_ngrams)
def collapse_witnesses(args):
    """Collapse witnesses in a results file and print the CSV output."""
    tokenizer = utils.get_tokenizer(args)
    res = tacl.Results(args.results, tokenizer)
    res.collapse_witnesses()
    res.csv(sys.stdout)
def label_count(args):
    """Add label counts to a results file and print the CSV output."""
    tokenizer = utils.get_tokenizer(args)
    res = tacl.Results(args.results, tokenizer)
    res.add_label_count()
    res.csv(sys.stdout)
def supplied_diff(args, parser):
    """Run a diff query over the supplied results and labels, writing the
    output to stdout."""
    labels = args.labels
    supplied_results = args.supplied
    store = utils.get_data_store(args)
    tokenizer = utils.get_tokenizer(args)
    store.diff_supplied(supplied_results, labels, tokenizer, sys.stdout)
def generate_statistics(args, parser):
    """Generate a statistics report for a results file and print it as CSV."""
    corpus = utils.get_corpus(args)
    tokenizer = utils.get_tokenizer(args)
    stats = tacl.StatisticsReport(corpus, tokenizer, args.results)
    stats.generate_statistics()
    stats.csv(sys.stdout)