def ngram_intersection(args, parser):
    """Run an intersection query and write its results to standard output.

    Builds the data store, corpus and catalogue from the parsed
    command-line *args*, validates the corpus against the catalogue,
    then streams the intersection results to stdout.
    """
    # NOTE(review): elsewhere in this file utils.get_catalogue is called
    # with the whole args namespace; here it gets args.catalogue — confirm
    # which signature utils.get_catalogue actually expects.
    data_store = utils.get_data_store(args)
    work_corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args.catalogue)
    data_store.validate(work_corpus, cat)
    data_store.intersection(cat, sys.stdout)
def ngram_intersection(args, parser):
    """Write the results of an intersection query to standard output.

    The data store, corpus and catalogue are all derived from the parsed
    command-line arguments; the corpus is validated against the catalogue
    before the query runs.
    """
    # NOTE(review): a sibling definition passes args.catalogue to
    # utils.get_catalogue instead of args — verify the expected argument.
    data_store = utils.get_data_store(args)
    work_corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args)
    data_store.validate(work_corpus, cat)
    data_store.intersection(cat, sys.stdout)
def search_texts(args, parser):
    """Search the corpus texts for the presence of supplied n-grams.

    Validates the corpus against the catalogue, loads the n-grams from
    the path given in *args.ngrams*, and writes matches to stdout.
    """
    data_store = utils.get_data_store(args)
    work_corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args)
    data_store.validate(work_corpus, cat)
    query_ngrams = utils.get_ngrams(args.ngrams)
    data_store.search(cat, query_ngrams, sys.stdout)
def generate_ngrams(args, parser):
    """Generate n-gram data from the corpus and add it to the data store.

    A catalogue is optional; when *args.catalogue* is unset, n-grams are
    generated for the whole corpus (catalogue passed as None).
    """
    data_store = utils.get_data_store(args)
    work_corpus = utils.get_corpus(args)
    # Conditional expression in place of the original if/else assignment.
    cat = utils.get_catalogue(args) if args.catalogue else None
    data_store.add_ngrams(work_corpus, args.min_size, args.max_size, cat)
def generate_ngrams(args, parser):
    """Add n-gram data (sizes min_size..max_size) to the data store.

    When no catalogue path is supplied the catalogue argument to
    ``add_ngrams`` is None, covering the entire corpus.
    """
    # NOTE(review): this variant passes args.catalogue to
    # utils.get_catalogue; a sibling definition passes args — confirm
    # which is the intended signature.
    data_store = utils.get_data_store(args)
    work_corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args.catalogue) if args.catalogue else None
    data_store.add_ngrams(work_corpus, args.min_size, args.max_size, cat)
def ngram_diff(args, parser):
    """Run a diff query and write its results to standard output.

    When *args.asymmetric* is set, an asymmetric diff restricted to that
    label is performed; otherwise a full symmetric diff is run.
    """
    data_store = utils.get_data_store(args)
    work_corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args.catalogue)
    tok = utils.get_tokenizer(args)
    data_store.validate(work_corpus, cat)
    # Guard-clause form: handle the asymmetric case and return early.
    if args.asymmetric:
        data_store.diff_asymmetric(cat, args.asymmetric, tok, sys.stdout)
        return
    data_store.diff(cat, tok, sys.stdout)
def ngram_diff(args, parser):
    """Output the results of a diff query to stdout.

    Chooses between an asymmetric diff (when *args.asymmetric* carries a
    label) and the standard symmetric diff.
    """
    # NOTE(review): a sibling definition passes args.catalogue to
    # utils.get_catalogue instead of args — verify the expected argument.
    data_store = utils.get_data_store(args)
    work_corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args)
    tok = utils.get_tokenizer(args)
    data_store.validate(work_corpus, cat)
    if args.asymmetric:
        data_store.diff_asymmetric(cat, args.asymmetric, tok, sys.stdout)
        return
    data_store.diff(cat, tok, sys.stdout)
def main():
    """Entry point: parse arguments and generate a JitC report.

    Sets up logging (when a verbosity flag is present), validates the
    corpus against the catalogue, prepares the output directory, and
    writes the report there. An existing output directory is reused,
    with a warning, rather than regenerated.
    """
    arg_parser = generate_parser()
    args = arg_parser.parse_args()
    if hasattr(args, 'verbose'):
        utils.configure_logging(args.verbose, logger)
    data_store = utils.get_data_store(args)
    work_corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args)
    tok = utils.get_tokenizer(args)
    check_catalogue(cat, args.label)
    data_store.validate(work_corpus, cat)
    out_dir = os.path.abspath(args.output)
    if os.path.exists(out_dir):
        logger.warning('Output directory already exists; any results therein '
                       'will be reused rather than regenerated.')
    # exist_ok=True makes this safe whether or not the warning fired.
    os.makedirs(out_dir, exist_ok=True)
    report = tacl.JitCReport(data_store, work_corpus, tok)
    report.generate(out_dir, cat, args.label)
def main():
    """Command-line entry point for generating a JitC report.

    Parses arguments, configures logging when requested, validates the
    inputs, ensures the output directory exists (warning when results
    may be reused), and generates the report.
    """
    # NOTE(review): this variant passes args.catalogue to
    # utils.get_catalogue; a sibling definition passes args — confirm
    # which is the intended signature.
    arg_parser = generate_parser()
    args = arg_parser.parse_args()
    if hasattr(args, 'verbose'):
        utils.configure_logging(args.verbose, logger)
    data_store = utils.get_data_store(args)
    work_corpus = utils.get_corpus(args)
    cat = utils.get_catalogue(args.catalogue)
    tok = utils.get_tokenizer(args)
    check_catalogue(cat, args.label)
    data_store.validate(work_corpus, cat)
    out_dir = os.path.abspath(args.output)
    if os.path.exists(out_dir):
        logger.warning('Output directory already exists; any results therein '
                       'will be reused rather than regenerated.')
    # exist_ok=True makes this safe whether or not the warning fired.
    os.makedirs(out_dir, exist_ok=True)
    report = tacl.JitCReport(data_store, work_corpus, tok)
    report.generate(out_dir, cat, args.label)
def validate_catalogue(args):
    """Validate the catalogue file and its references against the corpus.

    Exits with status 1 if the catalogue is malformed, or if it names a
    work for which the corpus has no witnesses. All missing works are
    reported before exiting.
    """
    try:
        cat = utils.get_catalogue(args.catalogue)
    except tacl.exceptions.MalformedCatalogueError as e:
        print("Error: {}".format(e))
        print("Other errors may be present; re-run this validation after "
              "correcting the above problem.")
        sys.exit(1)
    work_corpus = utils.get_corpus(args)
    missing_found = False
    for work_name in cat:
        # any() consumes at most one witness, exactly like the original
        # count-then-break loop: we only care whether one exists.
        if not any(True for _ in work_corpus.get_witnesses(work_name)):
            missing_found = True
            print("Error: Catalogue references work {} that does not "
                  "exist in the corpus".format(work_name))
    if missing_found:
        sys.exit(1)
def results(args, parser):
    """Apply the selected post-processing operations to a results file.

    Reads results from *args.results* ("-" means stdin), applies each
    requested transformation in a fixed order (extend, bifurcated
    extend, reduce, reciprocal remove, zero fill, n-gram pruning,
    count/size pruning, label removal, sort), and writes the final CSV
    to standard output. Invalid option combinations abort via
    ``parser.error``.
    """
    if args.results == "-":
        # Re-wrap stdin's byte stream so decoding and newline handling
        # match a file opened in text mode.
        in_fh = io.TextIOWrapper(sys.stdin.buffer, encoding="utf-8",
                                 newline="")
    else:
        in_fh = open(args.results, "r", encoding="utf-8", newline="")
    tok = utils.get_tokenizer(args)
    report = tacl.Results(in_fh, tok)
    if args.extend:
        report.extend(tacl.Corpus(args.extend, tok))
    if args.bifurcated_extend:
        if not args.bifurcated_extend_size:
            parser.error("The bifurcated extend option requires that the "
                         "--max-be-count option also be supplied")
        report.bifurcated_extend(tacl.Corpus(args.bifurcated_extend, tok),
                                 args.bifurcated_extend_size)
    if args.reduce:
        report.reduce()
    if args.reciprocal:
        report.reciprocal_remove()
    if args.zero_fill:
        if not args.catalogue:
            parser.error("The zero-fill option requires that the -c option "
                         "also be supplied.")
        fill_corpus = tacl.Corpus(args.zero_fill, tok)
        cat = utils.get_catalogue(args.catalogue)
        report.zero_fill(fill_corpus, cat)
    if args.ngrams:
        with open(args.ngrams, encoding="utf-8") as ngrams_fh:
            keep_ngrams = ngrams_fh.read().split()
        report.prune_by_ngram(keep_ngrams)
    if args.min_works or args.max_works:
        report.prune_by_work_count(args.min_works, args.max_works)
    if args.min_size or args.max_size:
        report.prune_by_ngram_size(args.min_size, args.max_size)
    if args.min_count or args.max_count:
        report.prune_by_ngram_count(args.min_count, args.max_count)
    if args.min_count_work or args.max_count_work:
        report.prune_by_ngram_count_per_work(args.min_count_work,
                                             args.max_count_work)
    if args.remove:
        report.remove_label(args.remove)
    if args.sort:
        report.sort()
    report.csv(sys.stdout)