def combine_files_in_the_pdf_directory(filepath):
    """Read every '*.processed' page file matching *filepath* and return
    their concatenated token list.

    Parameters:
        filepath: string prefix for the glob pattern. It is concatenated
            directly with "*.processed" — callers that mean a directory
            must include the trailing path separator themselves.
            TODO(review): confirm; os.path.join / pathlib would be safer.

    Returns:
        list: tokens from all matched files, files processed in sorted
        path order, each file's lines joined with a single space before
        tokenization.

    NOTE(review): the original body contained ~30 lines of unreachable
    code after the return statement (argparse/logging/gensim CLI handling
    referencing names never bound in this scope — `parser`, `args`,
    `corpora`, `print_err`). It appears to be a misplaced fragment of a
    separate main() routine and has been removed; check version control
    to recover it if it was meant to live elsewhere.
    """
    sorted_pagefilepaths = sorted(glob.glob(filepath + "*.processed"))

    document = []
    for page_filepath in sorted_pagefilepaths:
        with open(page_filepath, "r") as f:
            # tokenize() is defined elsewhere in this project.
            document += list(tokenize(" ".join(f.readlines())))

    return document
 def preprocess_text(self, text):
     return tokenize(text)