Exemple #1
0
def preprocess(filename, stopword_filename=None, extra_stopwords=None):
    """
    Preprocesses a CSV file and returns the resulting frozen Corpus.

    Every non-empty row of the file contributes one entry: the row's
    first field and the tokenized form of its last field are passed to
    Corpus.add().  The corpus is frozen before it is returned.

    Arguments:

    filename -- name of CSV file

    Keyword arguments:

    stopword_filename -- name of file containing stopwords
    extra_stopwords -- list of additional stopwords
    """

    # Merge both stopword sources into a single collection.
    stopwords = create_stopword_list(stopword_filename)
    stopwords.update(create_stopword_list(extra_stopwords))

    corpus = Corpus()

    # Use a context manager so the file handle is closed even if a row
    # fails to parse or tokenize.
    with open(filename) as csv_file:
        for fields in reader(csv_file, delimiter=','):
            if not fields:
                # The csv reader yields an empty list for blank lines;
                # there is nothing to add for those.
                continue
            corpus.add(fields[0], tokenize(fields[-1], stopwords))

    corpus.freeze()

    return corpus
Exemple #2
0
def preprocess(filename, stopword_filename=None, extra_stopwords=None):
    """
    Preprocesses a tab-delimited file and returns the resulting frozen
    Corpus.

    Every non-empty row of the file contributes one entry: the row's
    first field and the tokenized form of its last field are passed to
    Corpus.add().  The corpus is frozen before it is returned.

    Arguments:

    filename -- name of tab-delimited file

    Keyword arguments:

    stopword_filename -- name of file containing stopwords
    extra_stopwords -- list of additional stopwords
    """

    # Merge both stopword sources into a single collection.
    stopwords = create_stopword_list(stopword_filename)
    stopwords.update(create_stopword_list(extra_stopwords))

    corpus = Corpus()

    # Use a context manager so the file handle is closed even if a row
    # fails to parse or tokenize.
    with open(filename) as tsv_file:
        for fields in reader(tsv_file, delimiter='\t'):
            if not fields:
                # The csv reader yields an empty list for blank lines;
                # there is nothing to add for those.
                continue
            corpus.add(fields[0], tokenize(fields[-1], stopwords))

    corpus.freeze()

    return corpus
Exemple #3
0
    for path in removed:
        corpus.remove(path)

    print('pid=', os.getpid())
    # multiprocessing.set_start_method('forkserver')

    print_progress('parsing files.')
    mails = [ Mail(path) for path in created ]
    nr_total = len(mails)
    nr_done = 0
    with concurrent.futures.ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_mail = { executor.submit(get_words_from_mail, maildir, mail): mail for mail in mails }
        for future in concurrent.futures.as_completed(future_to_mail):
            mail = future_to_mail[future]
            try:
                corpus.add(mail.path, future.result())
                nr_done += 1
            except Exception as e:
                print(mail.path)
                traceback.print_exc()
                nr_done += 1
            with open('/dev/tty', 'w') as f:
                print(f'{nr_done} / {nr_total}', end='\r', flush=True, file=f)
    
    print_progress('saving.')
    voca.save(f'{FILENAME_VOCA}.new')
    corpus.save(f'{FILENAME_PATHS}.new', f'{FILENAME_INDEX}.new', f'{FILENAME_MATRIX}.new')

    os.rename(f'{FILENAME_VOCA}.new', FILENAME_VOCA)
    os.rename(f'{FILENAME_PATHS}.new', FILENAME_PATHS)
    os.rename(f'{FILENAME_INDEX}.new', FILENAME_INDEX)