Code example #1 — file: models.py, project: AdrienGuille/EGC-Cup-2016
def build_unsup_nmf_topics(level="word", ngrams=(1, 3), n_topics=14, n_features=1000):
    """Build an unsupervised topic-extraction pipeline (TF-IDF -> NMF).

    Args:
        level: ``analyzer`` passed to ``TfidfVectorizer`` ("word" or "char").
        ngrams: ``ngram_range`` for the vectorizer, inclusive (min_n, max_n).
        n_topics: number of NMF components (topics) to extract.
        n_features: vocabulary cap for the vectorizer (was previously
            hard-coded to 1000; exposed as a parameter with the same default).

    Returns:
        An unfitted sklearn ``Pipeline`` with steps "vectorize" (TfidfVectorizer)
        and "clust" (NMF).
    """
    # Stop words are loaded from a project-local list; french_tokenizer is a
    # project helper — presumably a French-language tokenizer (TODO confirm).
    vectorizer = TfidfVectorizer(ngram_range=ngrams, analyzer=level, max_df=0.80, min_df=4, max_features=n_features,
                                 use_idf=True, stop_words=load_stopword_list("../ngrams/stopwords.txt"),
                                 tokenizer=french_tokenizer)
    # Fixed random_state keeps NMF factorization reproducible across runs.
    nmf = NMF(n_components=n_topics, random_state=1000)
    pipeline = Pipeline([("vectorize", vectorizer), ("clust", nmf)])

    return pipeline
Code example #2 — file: models.py, project: AdrienGuille/EGC-Cup-2016
def build_unsup_nmf_locations(level="word", ngrams=(1, 3), n_topics=40, n_features=1000):
    """Build an unsupervised NMF pipeline for clustering location/affiliation text.

    Args:
        level: ``analyzer`` passed to ``TfidfVectorizer`` ("word" or "char").
        ngrams: ``ngram_range`` for the vectorizer, inclusive (min_n, max_n).
        n_topics: number of NMF components; was previously hard-coded to 40,
            now a parameter with the same default (signature stays consistent
            with ``build_unsup_nmf_topics``).
        n_features: vocabulary cap for the vectorizer (previously hard-coded
            to 1000).

    Returns:
        An unfitted sklearn ``Pipeline`` with steps "vectorize" (TfidfVectorizer)
        and "clust" (NMF).
    """
    # Address boilerplate tokens (street/lab/city/email fragments) that would
    # otherwise dominate affiliation strings are filtered out in addition to
    # the project-wide stop-word list.
    local_stop = ["cedex", "rue", "umr", "cnrs", "paris", "gmail", "com"]
    vectorizer = TfidfVectorizer(ngram_range=ngrams, analyzer=level, max_df=0.85, min_df=2, max_features=n_features,
                                 token_pattern=r"(?u)\b\w\w+\b",
                                 use_idf=True, stop_words=load_stopword_list("../ngrams/stopwords.txt") + local_stop,
                                 )
    # Fixed random_state keeps NMF factorization reproducible across runs.
    nmf = NMF(n_components=n_topics, random_state=1000)
    pipeline = Pipeline([("vectorize", vectorizer), ("clust", nmf)])
    return pipeline
Code example #3 — file: models.py, project: AdrienGuille/EGC-Cup-2016 (duplicate of example #1)
def build_unsup_nmf_topics(level="word", ngrams=(1, 3), n_topics=14):
    """Assemble a two-stage sklearn pipeline for topic extraction.

    Stage "vectorize" turns documents into TF-IDF features (capped at 1000
    terms, French tokenizer, project stop-word list); stage "clust" factorizes
    them into ``n_topics`` NMF components. The returned pipeline is unfitted.
    """
    max_vocab = 1000

    # Seeded NMF so repeated fits yield the same factorization.
    decomposer = NMF(n_components=n_topics, random_state=1000)

    tfidf = TfidfVectorizer(
        analyzer=level,
        ngram_range=ngrams,
        tokenizer=french_tokenizer,
        stop_words=load_stopword_list("../ngrams/stopwords.txt"),
        max_features=max_vocab,
        max_df=0.80,
        min_df=4,
        use_idf=True,
    )

    return Pipeline([("vectorize", tfidf), ("clust", decomposer)])
Code example #4 — file: models.py, project: AdrienGuille/EGC-Cup-2016 (duplicate of example #2)
def build_unsup_nmf_locations(level="word", ngrams=(1, 3)):
    """Assemble an unfitted TF-IDF + NMF pipeline for location strings.

    Forty NMF components; the vectorizer drops address boilerplate
    ("cedex", "rue", ...) on top of the project stop-word list.
    """
    max_vocab = 1000
    component_count = 40

    # Extra stop words: street/lab/city/email fragments common in affiliations.
    address_noise = ["cedex", "rue", "umr", "cnrs", "paris", "gmail", "com"]
    all_stops = load_stopword_list("../ngrams/stopwords.txt") + address_noise

    tfidf = TfidfVectorizer(
        analyzer=level,
        ngram_range=ngrams,
        token_pattern=r"(?u)\b\w\w+\b",
        stop_words=all_stops,
        max_features=max_vocab,
        max_df=0.85,
        min_df=2,
        use_idf=True,
    )

    # Seeded NMF so repeated fits yield the same factorization.
    decomposer = NMF(n_components=component_count, random_state=1000)

    return Pipeline([("vectorize", tfidf), ("clust", decomposer)])