def run():
    """Build left-to-right and right-to-left chunk tries and emit the report.

    Reads words from the dx1 file named on the command line (module-level
    ``args``), keeps the lower-cased words of at least ``args.minlength``
    characters in sorted order, builds the two tries via ``make_tries``, and
    writes the result to ``args.output_file`` if given, otherwise to stdout.
    """
    min_length = args.minlength
    eligible = [w.lower() for w in dx1.read_file(args.file) if len(w) >= min_length]
    eligible.sort()
    trie_ltr, trie_rtl = make_tries(eligible, min_length)
    # Guard clause: no output file requested means we report to stdout.
    if not args.output_file:
        produce_output(trie_ltr, trie_rtl, sys.stdout)
        return
    with open(args.output_file, 'w') as out:
        produce_output(trie_ltr, trie_rtl, out)
def run_lt(strings, min_count, verbose=False):
    """Extract repeated sequences from *strings* and print them.

    :param strings: iterable of strings to analyze
    :param min_count: minimum sequence count to include in the output
    :param verbose: trigger verbose mode
    """
    # BUG FIX: verbose was previously hard-coded to False here, so the
    # caller's flag was silently ignored; now it is propagated.
    seqs = get_sequences(strings, min_count, verbose=verbose)
    output_sequences(seqs)


def run_ngrams(tokens_fp, min_count, verbose=False):
    """
    Run with string tokens instead of characters.

    :param tokens_fp: path to a binary object with string tokens
    :param min_count: minimum n-gram count to include in the output
    :param verbose: trigger verbose mode
    :return:
    """
    # NOTE(review): pickle.load executes arbitrary code on malicious input —
    # only use this with trusted token files.
    with open(tokens_fp, 'rb') as tokens_file:
        tokens = pickle.load(tokens_file)
    run_lt(tokens, min_count, verbose=verbose)


if __name__ == '__main__':
    arg_parser = ArgumentParser()
    arg_parser.add_argument('file', help='dx1 file with strings')
    arg_parser.add_argument('--min-count', type=int, default=5)
    # New optional flag (backward-compatible): exposes the verbose mode that
    # run_lt already supports but the CLI previously could not reach.
    arg_parser.add_argument('--verbose', action='store_true',
                            help='verbose mode')
    args = arg_parser.parse_args()
    import dx1
    strings = dx1.read_file(args.file)
    run_lt(strings, args.min_count, verbose=args.verbose)
# NOTE(review): this chunk begins mid-function — the enclosing definition
# (and the origin of print_to_file, out_file, signatures_affixes, words) is
# not visible here, so the true indentation of the first statements is
# unknown; formatting below is a best-effort reconstruction — confirm
# against the full file.
print_to_file('signatures for affixes')
pprint.pprint(sort_by_size(signatures_affixes), stream=out_file)
# output every word to a separate file
words_file = Path(corpus_name + '_words.txt')
with words_file.open('w') as out_file:
    for word in words:
        print(word, file=out_file)

if __name__ == '__main__':
    arg_parser = ArgumentParser()
    arg_parser.add_argument('file', help='dx1 file for input')
    arg_parser.add_argument('--min-length', type=int, default=5,
                            help='minimum substring length')
    arg_parser.add_argument('--num-words', type=int, default=200,
                            help='number of most frequently occurring strings to get')
    arg_parser.add_argument('--verbose', action='store_true', help='verbose mode')
    args = arg_parser.parse_args()
    data_file = Path(args.file)
    # The corpus name (input file stem) keys both the log and output files.
    corpus_name = data_file.stem
    add_to_log = write_log(corpus_name)
    data = dx1.read_file(data_file)
    result = run(data, args.min_length, args.num_words, args.verbose)
    # output_result(result, corpus_name)
    # NOTE(review): presumably add_to_log(None) closes/flushes the log —
    # confirm against write_log's definition.
    add_to_log(None)