Example #1
import re
import pickle
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from gsdmm import MovieGroupProcess


def train_mgp(spacy_lang='pt_core_news_sm', train_file='train.csv', category_column='category', title_column='title', number_regex='[0-9]', number_code='NUMBER', model_file='mgp.model', scores_file='scores.npy'):
    nlp = spacy.load(spacy_lang)
    tokenizer = nlp.tokenizer  # nlp.Defaults.create_tokenizer() is spaCy 2.x API, removed in 3.x
    df = pd.read_csv(train_file)
    len_categories = len(df[category_column].drop_duplicates().values.tolist())
    mgp = MovieGroupProcess(K=len_categories+100, alpha=0.1, beta=0.1, n_iters=10)
    docs = df[title_column].values.tolist()
    tokens = []
    for item in docs:
        processed_item = re.sub(number_regex, number_code, item.lower())
        tmp = tokenizer(processed_item)
        tokens.append([str(x) for x in tmp if not (x.is_punct or x.is_stop)])

    # Drop rare words: keep only tokens that occur more than once across the corpus
    tokens_freq_dict = dict(
        Counter([x for y in tokens for x in y]).most_common())
    for idx, item in enumerate(tokens):
        tokens[idx] = list(
            filter(lambda x: tokens_freq_dict[x] > 1, tokens[idx]))

    vocab_size = len(set(x for y in tokens for x in y))
    y = mgp.fit(tokens, vocab_size)
    scores = []
    for item in tokens:
        scores.append(np.array(mgp.score(item)))
    scores = np.array(scores)
    with open(model_file, 'wb') as f:
        pickle.dump(mgp, f)
    np.save(scores_file, scores)
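A minimal usage sketch for the function above, assuming a train.csv with the default category and title columns and an installed pt_core_news_sm spaCy model; the reload step shows how the pickled model and saved score matrix might be consumed afterwards.

# Hypothetical usage; arguments mirror the defaults of train_mgp above.
train_mgp(train_file='train.csv', model_file='mgp.model', scores_file='scores.npy')

# Reload the persisted GSDMM model and the per-document score matrix.
import pickle
import numpy as np

with open('mgp.model', 'rb') as f:
    mgp = pickle.load(f)
scores = np.load('scores.npy')        # shape: (n_documents, K)
best_cluster = scores.argmax(axis=1)  # most probable cluster per title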
Example #2
import numpy as np
from gsdmm import MovieGroupProcess


def sentences_to_gsdmm(sentences, num_topics):
    # return dictionary, topic ndarray, and scores for each document
    dictionary, corpus = get_dict_corpus(sentences)
    # corpus is the list of documents in token, BOW format.
    # a sequence of tuples where the first entry is the token ID,
    # and the second is the count of that token

    # Keep only the token IDs; GSDMM expects plain token sequences per document
    corpus_tokens = [[a[0] for a in sent] for sent in corpus]
    max_token = max([max(a) for a in corpus_tokens if a])
    mgp = MovieGroupProcess(K=num_topics, alpha=0.1, beta=0.5, n_iters=50)
    # Token IDs are zero-based, so the vocabulary size is max_token + 1
    mgp.fit(corpus_tokens, max_token + 1)
    topics = mgp.cluster_word_distribution
    # array of BOW dicts of token ids
    # Score each document against every cluster, using the same token-ID representation the model was fit on
    scores = [mgp.score(sentence) for sentence in corpus_tokens]
    # return topics represented as dicts of word => val
    # return scores represented as array (len docs) of arrays (len topics)
    # Reverse map from token ID back to token string (not used further below)
    mapping = {v: k for k, v in dictionary.token2id.items()}

    # create ndarray from topic map
    topic_ndarray = np.zeros((len(topics), max_token + 1))
    for i, topic in enumerate(topics):
        for k, v in topic.items():
            topic_ndarray[i][k] = v

    return dictionary, topic_ndarray, scores
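get_dict_corpus is not shown in this example, but the comments above describe gensim-style bag-of-words tuples, so a minimal stand-in built with gensim.corpora.Dictionary might look like the following; the helper's exact behaviour in the original project is an assumption.

# Hypothetical version of the get_dict_corpus helper assumed by sentences_to_gsdmm:
# builds a gensim Dictionary and a corpus of (token_id, count) tuples.
from gensim.corpora import Dictionary

def get_dict_corpus(sentences):
    dictionary = Dictionary(sentences)                         # sentences: list of token lists
    corpus = [dictionary.doc2bow(sent) for sent in sentences]  # (token_id, count) pairs per doc
    return dictionary, corpus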
Example #3
import numpy as np
from gsdmm import MovieGroupProcess

mgp = MovieGroupProcess(K=10, alpha=0.73, beta=0.97, n_iters=30)

vocab = set(x for doc in tokenized_series for x in doc)
n_terms = len(vocab)
n_docs = len(tokenized_series)

# Fit the model on the data with the chosen hyperparameters
print('Training GSDMM model....')

y = mgp.fit(tokenized_series, n_terms)

# Average of each document's highest cluster probability (how confidently docs are assigned)
doc_scores = [max(mgp.score(doc)) for doc in tokenized_series]
print(sum(doc_scores) / len(doc_scores))

doc_count = np.array(mgp.cluster_doc_count)

# Topics sorted by the number of documents they are allocated to
top_index = doc_count.argsort()[-10:][::-1]

#top_index = doc_count.argsort()[-8:][::-1]

TopicsRunner(mgp.cluster_word_distribution, top_index, 10)
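TopicsRunner comes from elsewhere in the original project; a minimal stand-in, with the name and signature inferred only from the call above, could simply print the most frequent words of each of the top clusters.

# Hypothetical stand-in for TopicsRunner: prints the top words of the most
# populated clusters from GSDMM's cluster_word_distribution.
def TopicsRunner(cluster_word_distribution, top_index, num_words):
    for cluster in top_index:
        word_counts = cluster_word_distribution[cluster]
        top_words = sorted(word_counts.items(), key=lambda kv: kv[1], reverse=True)[:num_words]
        print(f"Cluster {cluster}: " + ", ".join(word for word, _ in top_words))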