def preprocess(filename, stopword_filename=None, extra_stopwords=None,
               delimiter=','):
    """
    Preprocesses a CSV file and returns the resulting frozen Corpus.

    For every non-empty row, the first field is handed to Corpus.add
    together with the result of tokenize() applied to the row's last
    field and the combined stopword set.  Once all rows are consumed
    the corpus is frozen and returned.

    Arguments:

    filename -- name of CSV file

    Keyword arguments:

    stopword_filename -- name of file containing stopwords
    extra_stopwords -- list of additional stopwords
    delimiter -- field separator used in the file (default ',')
    """

    stopwords = create_stopword_list(stopword_filename)
    stopwords.update(create_stopword_list(extra_stopwords))

    corpus = Corpus()

    # Use a context manager so the file handle is closed deterministically,
    # even if tokenize() or Corpus.add() raises part-way through.
    with open(filename) as csv_file:
        for fields in reader(csv_file, delimiter=delimiter):
            if not fields:
                # csv.reader yields an empty list for a blank line;
                # indexing it would raise IndexError, so skip it.
                continue
            corpus.add(fields[0], tokenize(fields[-1], stopwords))

    corpus.freeze()

    return corpus
def preprocess(filename, stopword_filename=None, extra_stopwords=None,
               delimiter='\t'):
    """
    Preprocesses a tab-separated file and returns the resulting frozen
    Corpus.

    For every non-empty row, the first field is handed to Corpus.add
    together with the result of tokenize() applied to the row's last
    field and the combined stopword set.  Once all rows are consumed
    the corpus is frozen and returned.

    Arguments:

    filename -- name of tab-separated file

    Keyword arguments:

    stopword_filename -- name of file containing stopwords
    extra_stopwords -- list of additional stopwords
    delimiter -- field separator used in the file (default is a tab)
    """

    stopwords = create_stopword_list(stopword_filename)
    stopwords.update(create_stopword_list(extra_stopwords))

    corpus = Corpus()

    # Use a context manager so the file handle is closed deterministically,
    # even if tokenize() or Corpus.add() raises part-way through.
    with open(filename) as tsv_file:
        for fields in reader(tsv_file, delimiter=delimiter):
            if not fields:
                # csv.reader yields an empty list for a blank line;
                # indexing it would raise IndexError, so skip it.
                continue
            corpus.add(fields[0], tokenize(fields[-1], stopwords))

    corpus.freeze()

    return corpus
# Drop every path listed in `removed` from the corpus before adding new ones.
for path in removed:
    corpus.remove(path)

# Print the current process id (presumably a debugging aid -- confirm).
print('pid=', os.getpid())
# multiprocessing.set_start_method('forkserver')

print_progress('parsing files.')
# One Mail object per path in `created`; these are the items handed to the
# worker processes below.
mails = [ Mail(path) for path in created ]
nr_total = len(mails)
nr_done = 0
with concurrent.futures.ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Map each submitted future back to its Mail so the result (or the
    # failure) can be attributed to the right path once it completes.
    future_to_mail = { executor.submit(get_words_from_mail, maildir, mail): mail for mail in mails }
    # Results are consumed in completion order, not submission order.
    for future in concurrent.futures.as_completed(future_to_mail):
        mail = future_to_mail[future]
        try:
            # future.result() re-raises any exception raised in the worker.
            corpus.add(mail.path, future.result())
            nr_done += 1
        except Exception as e:
            # Best effort: report the offending path and traceback, count the
            # mail as processed, and carry on with the remaining futures.
            print(mail.path)
            traceback.print_exc()
            nr_done += 1
        # Progress counter is written to /dev/tty (not stdout) and ends with
        # '\r' so each update overwrites the previous one on the same line.
        # NOTE(review): /dev/tty is reopened for every completed future and
        # will raise if no such device can be opened -- confirm that is OK.
        with open('/dev/tty', 'w') as f:
            print(f'{nr_done} / {nr_total}', end='\r', flush=True, file=f)

print_progress('saving.')
# Everything is first saved under '<name>.new' and then renamed onto the
# final name (presumably so a failed save cannot clobber the existing files).
voca.save(f'{FILENAME_VOCA}.new')
corpus.save(f'{FILENAME_PATHS}.new', f'{FILENAME_INDEX}.new', f'{FILENAME_MATRIX}.new')
os.rename(f'{FILENAME_VOCA}.new', FILENAME_VOCA)
os.rename(f'{FILENAME_PATHS}.new', FILENAME_PATHS)
os.rename(f'{FILENAME_INDEX}.new', FILENAME_INDEX)
# NOTE(review): no rename of f'{FILENAME_MATRIX}.new' is visible in this
# excerpt -- confirm that it follows, otherwise the matrix stays in '.new'.