import six
from scipy.sparse import coo_matrix


def get_document_names(path_prefix, file_paths):
    documents = []
    new_file_paths = []  # Only the paths whose names decode cleanly.
    for file_path in file_paths:
        try:
            # Slice off the path prefix and the '.eml' suffix; rstrip/lstrip
            # would strip *characters*, not substrings.
            document_name = six.text_type(file_path)[len(path_prefix):-len('.eml')]
            documents.append(document_name)
            new_file_paths.append(file_path)
        except UnicodeDecodeError:  # The name cannot be decoded; skip the file.
            print('Skipping file:', file_path)
    return documents, new_file_paths


print('Loading documents.')

if use_small_dataset:
    docs, input_type, tokenizer = dataset_small()
    paths = docs
else:
    unchecked_paths, input_type, tokenizer = dataset_mails(doc_path)
    docs, paths = get_document_names(doc_path, unchecked_paths)

print('Loaded', len(docs), 'files from path:', doc_path)

print('Extracting features.')
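
# NOTE: create_vectorizer is a project helper that is not shown in this
# snippet. A minimal sketch of a compatible implementation, assuming it
# wraps sklearn's TfidfVectorizer (the second example below uses
# TfidfVectorizer with the same parameters directly):
from sklearn.feature_extraction.text import TfidfVectorizer


def create_vectorizer(input_type, tokenizer=None, ngram_range=(1, 1)):
    return TfidfVectorizer(input=input_type, tokenizer=tokenizer,
                           ngram_range=ngram_range, stop_words='english')
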
vectorizer = create_vectorizer(input_type, tokenizer=tokenizer,
                               ngram_range=(1, 1))

data = vectorizer.fit_transform(paths)
features = vectorizer.get_feature_names()

data = coo_matrix(data)
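
# NOTE: apply_threshold is another project helper not shown here. A minimal
# sketch, assuming it simply drops TF-IDF weights below the cutoff:
def apply_threshold(matrix, threshold):
    mask = matrix.data >= threshold  # Keep only sufficiently strong entries.
    return coo_matrix(
        (matrix.data[mask], (matrix.row[mask], matrix.col[mask])),
        shape=matrix.shape)
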

data = apply_threshold(data, 0.1)  # Filter out weights that are too weak.
Example No. 2
from gensim import matutils
from gensim.models import HdpModel, LdaModel
from sklearn.feature_extraction.text import TfidfVectorizer

from tools.datasets import dataset_mails

# TODO: move to scripts


def fit_lda(corpus, vocabulary, n_topics=10, passes=1):
    return LdaModel(corpus, num_topics=n_topics, passes=passes,
                    id2word={i: s for i, s in enumerate(vocabulary)})


def fit_hdp_lda(corpus, vocabulary):
    return HdpModel(corpus, {i: s for i, s in enumerate(vocabulary)})
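
# Example usage with a tiny bag-of-words corpus (two documents over a
# three-word toy vocabulary), kept in comments so the module stays
# import-safe:
#
#     toy_corpus = [[(0, 1), (1, 2)], [(1, 1), (2, 3)]]
#     toy_model = fit_lda(toy_corpus, ['apple', 'banana', 'cherry'],
#                         n_topics=2)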


if __name__ == '__main__':
    content, input_type, tokenizer = dataset_mails(
        '/Users/yanchith/workspace/won-corpora/processed')

    # content, input_type, tokenizer = dataset_newsgroups()

    vectorizer = TfidfVectorizer(min_df=3, input=input_type, ngram_range=(1, 1),
                                 stop_words='english', tokenizer=tokenizer)
    X = vectorizer.fit_transform(content)
    features = vectorizer.get_feature_names()

    print('Number of features:', len(features))
    print('Bag of words shape:', X.shape)
    print(features)

    # Beware: Sparse2Corpus treats columns as documents by default, while
    # sklearn produces one row per document, hence documents_columns=False.
    model = fit_hdp_lda(matutils.Sparse2Corpus(X, documents_columns=False),
                        features)
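
    # A quick look at the fitted model; show_topics returns formatted
    # "weight*token" descriptions of the most significant topics.
    for topic in model.show_topics(num_topics=10, num_words=8):
        print(topic)

    # To fit a fixed-size LDA instead of the nonparametric HDP:
    # model = fit_lda(matutils.Sparse2Corpus(X, documents_columns=False),
    #                 features, n_topics=10)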