Example #1
import os

import numpy as np
import pandas as pd
from gensim.corpora import Dictionary


def make_item_descriptions(max_sentence_length=None):
    """Read item descriptions and return (item ids, padded word-index arrays, vocabulary size)."""
    descriptions = pd.read_csv(os.path.join(
        'data', 'descriptions.csv')).rename(columns={'movie': 'item'})
    texts = descriptions.description
    texts = texts.apply(lambda x: x.strip().split())
    dictionary = Dictionary(texts.values)
    dictionary.filter_extremes()
    eos_id = len(dictionary.keys())

    # to index list
    texts = texts.apply(
        lambda x: dictionary.doc2idx(x, unknown_word_index=eos_id))
    texts = texts.apply(lambda x: np.array([a for a in x if a != eos_id]))
    lengths = texts.apply(len)
    if max_sentence_length is None:
        max_sentence_length = lengths.max()
    else:
        max_sentence_length = min(lengths.max(), max_sentence_length)

    # padding
    texts = texts.apply(lambda x: x[:max_sentence_length])
    texts = texts.apply(lambda x: np.pad(x, (0, max_sentence_length - len(x)),
                                         'constant',
                                         constant_values=(0, eos_id)))

    # change types
    texts = texts.apply(lambda x: x.astype(np.int32))
    descriptions.id = descriptions.id.astype(np.int32)

    return descriptions.id.values, texts.values, len(dictionary.keys()) + 1
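
A minimal usage sketch; it assumes that data/descriptions.csv (with at least 'id', 'movie', and 'description' columns) exists, which the function implies but the snippet does not show:

item_ids, padded_texts, n_words = make_item_descriptions(max_sentence_length=50)
print(item_ids.shape)        # one id per description row
print(padded_texts[0][:10])  # first ten word indices of the first description
print(n_words)               # vocabulary size including the padding/EOS id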
Example #2
from typing import Optional

import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from sklearn.model_selection import train_test_split


class DataGenerator(object):
    def __init__(self,
                 positive_dataset: pd.DataFrame,
                 negative_dataset: pd.DataFrame,
                 test_size: float,
                 random_state: int = 123,
                 max_sentence_length: Optional[int] = None):
        self.dataset = pd.concat([positive_dataset, negative_dataset], axis=0)
        self.dataset['review'] = self.dataset['review'].apply(
            lambda x: x.strip().split())
        self.dictionary = Dictionary(self.dataset['review'].values)
        self.dataset['review'] = self.dataset['review'].apply(
            self.dictionary.doc2idx)
        self.max_sentence_length = max_sentence_length
        if self.max_sentence_length is not None:
            self.dataset['review'] = self.dataset['review'].apply(
                lambda x: x[:self.max_sentence_length])
        else:
            self.max_sentence_length = max(self.dataset['review'].apply(len))

        # padding
        eos_id = len(self.dictionary.keys())
        self.dataset['review'] = self.dataset['review'].apply(
            lambda x: np.pad(x, (0, self.max_sentence_length - len(x)),
                             'constant',
                             constant_values=(0, eos_id)))

        # change type
        self.dataset['review'] = self.dataset['review'].apply(
            lambda x: x.astype(np.int32))
        self.dataset['label'] = self.dataset['label'].astype(np.int32)

        # split
        self.train, self.test = train_test_split(self.dataset,
                                                 test_size=test_size,
                                                 random_state=random_state)

    def get_train_dataset(self):
        return list(
            zip(self.train['review'].values, self.train['label'].values))

    def get_test_dataset(self):
        return list(zip(self.test['review'].values, self.test['label'].values))

    def get_max_sentence_length(self):
        return self.max_sentence_length

    def get_n_word(self):
        return len(self.dictionary.keys()) + 1
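
A minimal usage sketch with made-up rows; the 'review' and 'label' column names follow the class above, everything else is hypothetical:

positive = pd.DataFrame({'review': ['great movie , loved it',
                                    'wonderful acting and a clever plot'],
                         'label': [1, 1]})
negative = pd.DataFrame({'review': ['boring and far too long',
                                    'weak script , flat characters'],
                         'label': [0, 0]})
gen = DataGenerator(positive, negative, test_size=0.25, max_sentence_length=20)
print(gen.get_max_sentence_length(), gen.get_n_word())
train_pairs = gen.get_train_dataset()  # list of (padded index array, label) pairs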
Example #3
import gensim
import numpy as np
from gensim.corpora import Dictionary

# fit an LDA model with num_topics=5; "documents" is assumed to be a list of tokenized texts
news_dictionary = Dictionary(documents)
news_dictionary.filter_extremes(no_below=5,
                                no_above=0.5,
                                keep_n=5000,
                                keep_tokens=None)
corpus = [news_dictionary.doc2bow(text) for text in documents]
lda = gensim.models.LdaModel(corpus, num_topics=5, id2word=news_dictionary)

lda.show_topics()

# convert gensim corpus to a sparse document-term matrix for coherence measure
corpus_dense = gensim.matutils.corpus2csc(
    corpus, num_terms=len(news_dictionary.keys()))
corpus_dense = corpus_dense.astype(int).transpose()  # shape: (n_docs, n_terms)
print(corpus_dense.shape)


# implements the UMass coherence in Mimno et al. 2011 - Optimizing Semantic Coherence in Topic Models
def cooccur_df_ws(w1, w2, corpus_dense, w2ids):
    """
    Returns the co-document frequency of two words
    """
    w1_id, w2_id = w2ids.token2id.get(w1), w2ids.token2id.get(w2)
    co_freq_array = (corpus_dense[:, [w1_id, w2_id]] > 0).sum(axis=1).A1
    return np.count_nonzero(co_freq_array == 2)
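
Only the per-pair co-document frequency is implemented above; a sketch of how it could feed the full UMass score of Mimno et al. 2011, i.e. the sum over a topic's top-word pairs of log((D(w_m, w_l) + 1) / D(w_l)). The helper name umass_coherence and the use of show_topic to obtain the top words are assumptions:

def umass_coherence(topic_id, lda_model, corpus_dense, w2ids, topn=10):
    # Top words of the topic, most probable first (assumed source of the word list)
    top_words = [w for w, _ in lda_model.show_topic(topic_id, topn=topn)]
    score = 0.0
    for m in range(1, len(top_words)):
        for l in range(m):
            w_l_id = w2ids.token2id.get(top_words[l])
            # D(w_l): document frequency; > 0 here because filter_extremes(no_below=5) was applied
            d_wl = int((corpus_dense[:, [w_l_id]] > 0).sum())
            # D(w_m, w_l): co-document frequency from the helper above
            d_co = cooccur_df_ws(top_words[m], top_words[l], corpus_dense, w2ids)
            score += np.log((d_co + 1.0) / d_wl)
    return score

print(umass_coherence(0, lda, corpus_dense, news_dictionary))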

Example #4
from gensim.corpora import Dictionary


class InfoGain(object):
    """
    Computes the information gain of tags from a corpus whose samples are
    labelled with categories: first compute the global entropy, then the
    conditional entropy of each tag; their difference is the information gain.
    Example:
        >>> ig = InfoGain(corpus_file)
        >>> ig.compute()
        >>> ig.save(ig_file)
        >>> print(ig['word'])  # look up the information gain of one word
    """
    def __init__(self, corpus_file):
        """
        Args:
            corpus_file -- corpus file; the first column is the category,
                the remaining columns are tags
        """
        corpus = []
        categories = []
        self._category_distribution = {}  # number of samples per category
        self._words_cate = {}  # per-word (tag/feature) sample counts by category
        self._words_sample_count = {}
        self._info_gain = {}
        with open(corpus_file, 'r') as documents:
            for line in documents:
                words = line.strip().split()
                if len(words) <= 1:
                    continue
                categories.append(words[0])
                corpus.append(words[1:])
                if words[0] not in self._category_distribution:
                    self._category_distribution[words[0]] = 0
                self._category_distribution[words[0]] += 1

                # count word (tag/feature) / category co-occurrences, used for the conditional entropy
                for word in set(words[1:]):
                    if word not in self._words_cate:
                        self._words_cate[word] = {}
                        self._words_sample_count[word] = 0
                    if words[0] not in self._words_cate[word]:
                        self._words_cate[word][words[0]] = 0
                    self._words_cate[word][words[0]] += 1
                    self._words_sample_count[word] += 1

        self._common_dictionary = Dictionary(corpus)
        self._corpus = corpus
        self._categories = categories

    def compute(self):
        """
        Compute the information gain of every word (tag/feature),
        starting from the global entropy.
        """
        system_entropy = compute_entropy(len(self._corpus),
                                         self._category_distribution)
        # conditional entropy of each word
        for word in self._common_dictionary.token2id:
            category_distribution = {}
            if word not in self._words_cate:
                continue
            # entropy of the category distribution among samples containing the word
            entropy1 = compute_entropy(self._words_sample_count[word],
                                       self._words_cate[word])
            for cate in self._category_distribution:
                category_distribution[cate] = self._category_distribution[cate]
                if cate in self._words_cate[word]:
                    category_distribution[cate] -= self._words_cate[word][cate]
            # entropy of the category distribution among samples without the word
            entropy2 = compute_entropy(
                len(self._corpus) - self._words_sample_count[word],
                category_distribution)
            # conditional entropy of the word
            condition_entropy = (
                self._words_sample_count[word] * entropy1 / len(self._corpus)
                + (len(self._corpus) - self._words_sample_count[word])
                * entropy2 / len(self._corpus))
            # information gain
            self._info_gain[word] = system_entropy - condition_entropy


    def save(self, ig_file_name, sort=False):
        """
        Save to a file in the format: word information_gain
        Args:
            ig_file_name -- output file path
            sort -- if True, write entries sorted by information gain in
                descending order; defaults to unsorted
        """
        with open(ig_file_name, 'w') as ig_file:
            if not sort:
                for word in self._info_gain:
                    ig_file.write("%s %.2f\n" % (word, self._info_gain[word]))
            else:
                for item in sorted(self._info_gain.items(), key=lambda x: x[1], reverse=True):
                    ig_file.write("%s %.2f\n" % (item[0], item[1]))

    def __getitem__(self, word):
        if word not in self._info_gain:
            return 0.0
        return self._info_gain[word]
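
compute() relies on a compute_entropy helper that is not part of the snippet; a minimal sketch of what it plausibly looks like, assuming it takes the total sample count plus a {category: count} mapping and returns the Shannon entropy H = -sum(p * log2 p):

import math

def compute_entropy(total, distribution):
    # Hypothetical helper: entropy of the category distribution; zero counts are skipped.
    if total <= 0:
        return 0.0
    entropy = 0.0
    for count in distribution.values():
        if count > 0:
            p = count / total
            entropy -= p * math.log2(p)
    return entropy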
Example #5
                continue
            if row[0] not in eids:
                eids.add(row[0])
                if len(row) == 8:
                    row.append(",".join(topic_word_list_result[row[0]]))
                else:
                    row[8] = ",".join(topic_word_list_result[row[0]])
                csvData.append(row)
    with open(filename, 'w', encoding='utf-8') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerows(csvData)
        print("Write to database successfully")


print("corpus length:", len(pubs_corpus))
print("dict length:", len(pubs_dictionary.keys()))
topic_word_list_result = dict()
topic_dict = topic_to_lemmatized_word_list(lda)
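
select_highest_prob_topic (and topic_to_lemmatized_word_list) are called in the loop below but not shown; a minimal sketch of the former, matching the -1 "no topic" convention the loop checks for (the implementation is an assumption):

def select_highest_prob_topic(candidate_topics):
    # candidate_topics: (topic_id, probability) pairs from lda.get_document_topics()
    if not candidate_topics:
        return -1
    return max(candidate_topics, key=lambda pair: pair[1])[0]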

for id in pubs_eids:
    cur_corpus = id_to_corpus.get(id, None)
    if cur_corpus is not None:
        candidate_topics = lda.get_document_topics(
            cur_corpus)  # list all topic index
        best_topic_index = select_highest_prob_topic(
            candidate_topics)  # select the index with highest prob
        if best_topic_index == -1:
            print("no topic document:", id)
            topic_word_list_result[id] = ["unknown"]
        else:
            topic_word_list_result[id] = topic_dict[best_topic_index]
Example #6
    docs_per_topic = defaultdict(list)

    # Create a dictionary with docs per topic
    for topic_id in topics:
        # Uncomment the following line if you want to return the actual docs
        #docs_per_topic[ topic_id ].append(X[np.where(argmax == topic_id)])
        docs_per_topic[ topic_id ].append(np.where(argmax == topic_id))
    return docs_per_topic


import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import common_texts

# Initialization of the term dictionary; in this case for the 9 short documents in gensim's common_texts
print(common_texts)
common_dictionary = Dictionary(common_texts)
print(common_dictionary)         # a Dictionary with 12 unique tokens
print(common_dictionary.keys())  # the 12 token ids

# Create a corpus from a list of texts
# BoW: Bag-of-Words representation for each document: (token_id, token_count) integer tuples
X = [common_dictionary.doc2bow(text) for text in common_texts]
print("X: ", X)
# A sparse (n_docs, n_terms) matrix could also be used here if wrapped with gensim.matutils.Sparse2Corpus; that may be handier

lda = LdaModel(corpus=X, num_topics=10, alpha='symmetric')

# Full topic-term matrix, shape (num_topics, vocabulary_size)
print(lda.get_topics())

# TODO figure out how to use LdaState; need to initialize it first I guess
# Get posterior probabilities over topics
#print(lda.LdaState().get_lambda())
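
The TODO above asks for posterior probabilities over topics; a sketch that uses the public get_document_topics API rather than LdaState (lda and X are the objects defined above; doc_topic and argmax are new names, and argmax is the array the docs_per_topic fragment above expects):

doc_topic = np.zeros((len(X), lda.num_topics))
for doc_idx, bow in enumerate(X):
    # keep even very low-probability topics so every row is fully populated
    for topic_id, prob in lda.get_document_topics(bow, minimum_probability=0.0):
        doc_topic[doc_idx, topic_id] = prob
print(doc_topic.shape)             # (n_docs, num_topics)
argmax = doc_topic.argmax(axis=1)  # most probable topic per document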
Example #7
    ])  # Bigrams and trigrams are joined by underscores
# -

# ### Remove rare and common tokens, and limit vocabulary

# +
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from gensim.corpora import Dictionary

dictionary = Dictionary(all_tokens)
dictionary.filter_extremes(no_below=30, no_above=0.5, keep_n=20000)

# Look at the top 100 and bottom 100 tokens

temp = dictionary[0]  # accessing one item forces dictionary.id2token to be populated

token_counts = pd.DataFrame(np.array(
    [[token_id, dictionary.id2token[token_id], dictionary.cfs[token_id]]
     for token_id in dictionary.keys() if token_id in dictionary.cfs.keys()
     and token_id in dictionary.id2token.keys()]),
                            columns=['id', 'token', 'count'])

token_counts['count'] = token_counts['count'].astype('int')
token_counts['count'].describe()
token_counts = token_counts.sort_values('count')

plt.rcParams.update({'figure.figsize': (5, 3.5), 'figure.dpi': 200})
token_counts['count'].head(5000).hist(bins=100)
plt.suptitle("Counts for 5,000 least frequent included words")
plt.show()
display(token_counts.head(50))

plt.rcParams.update({'figure.figsize': (5, 3.5), 'figure.dpi': 200})
token_counts['count'].tail(1000).hist(bins=100)