def combine_files_in_the_pdf_directory(filepath):
    """Concatenate the tokens of every ``*.processed`` page file under *filepath*.

    Parameters
    ----------
    filepath : str
        Directory prefix that the page files live under. ``glob`` appends
        ``*.processed`` directly to it, so it is expected to end with a
        path separator — NOTE(review): confirm callers pass a trailing "/".

    Returns
    -------
    list
        Tokens of all pages, appended in lexicographically sorted file
        order (which presumably matches page order — verify naming scheme).
    """
    document = []
    # Sort so pages are appended in a deterministic order across runs.
    for page_filepath in sorted(glob.glob(filepath + "*.processed")):
        with open(page_filepath, "r") as f:
            # extend() avoids the throwaway intermediate list the old
            # `document += [x for x in tokenize(...)]` built for every page.
            document.extend(tokenize(" ".join(f.readlines())))
    return document
args = parser.parse_args() logging.basicConfig(filename=args.log_filepath, format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) if args.command == "construct_vocab": dictionary = corpora.Dictionary() count = 0 line = sys.stdin.readline() while line: tokens = tokenize(line) dictionary.add_documents([tokens], prune_at=None) count += 1 if count % 100000 == 0: print_err("line %d %d" % (count, len(dictionary))) line = sys.stdin.readline() dictionary.save(args.vocabulary_filename) dictionary.save_as_text(args.vocabulary_filename + ".txt") elif args.command == "construct_corpus": # use glob to recurse under data/TXTs directory
def preprocess_text(self, text):
    """Turn raw *text* into a token list via the module-level tokenize helper."""
    tokens = tokenize(text)
    return tokens