import os

from gensim.corpora import Dictionary
from gensim.models import LdaModel


def cluster_questions(topic_num,
                      res_path,
                      q_path='datasets/DialogQA/Qall.txt',
                      a_path='datasets/DialogQA/Aall.txt'):
    # Train an LDA model on the answers, then assign each question to the dominant
    # topic of its corresponding answer and write one file per topic.
    with open(a_path, 'r', encoding='utf-8') as f:
        common_texts = [text.split() for text in f.readlines()]

    with open(q_path, 'r', encoding='utf-8') as f:
        questions = f.readlines()

    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

    lda = LdaModel(common_corpus, num_topics=topic_num)

    questions_clustered = [[] for _ in range(topic_num)]
    print('Questions : ', len(questions))
    perp = lda.log_perplexity(common_corpus)
    for i, q in enumerate(questions):
        other_corpus = [common_dictionary.doc2bow(common_texts[i])]
        vector = lda[other_corpus]
        # Pick the topic with the highest probability for this answer.
        topic = 0
        max_prob = 0
        for (idx, prob) in vector[0]:
            if prob > max_prob:
                topic = idx
                max_prob = prob
        questions_clustered[topic].append(q)
    if not os.path.exists(res_path):
        os.makedirs(res_path)
    for top in range(topic_num):
        with open(res_path + str(top) + '.txt', 'w', encoding='utf-8') as f:
            for quest in questions_clustered[top]:
                f.write(quest)

    return perp
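# Usage sketch (not from the original source): cluster the DialogQA questions into
# 8 topics; the topic count and output directory below are placeholders to adapt.
if __name__ == '__main__':
    perplexity = cluster_questions(topic_num=8,
                                   res_path='results/clusters/',
                                   q_path='datasets/DialogQA/Qall.txt',
                                   a_path='datasets/DialogQA/Aall.txt')
    print('log perplexity:', perplexity)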
Example no. 2
def calcCoherence(lemmatizedTexts, passes=100, nTopics=5, workers=1):
    # Train an LdaMulticore model and score it with the c_v coherence measure.
    id2word = Dictionary(lemmatizedTexts)
    corp = [id2word.doc2bow(text) for text in lemmatizedTexts]

    ldaModel = gensim.models.LdaMulticore(
        corpus=corp,
        id2word=id2word,
        num_topics=nTopics,
        passes=passes,
        random_state=100,
        per_word_topics=False,
        alpha=0.01,
        eta=0.9,
        workers=workers
    )

    coherenceModel = CoherenceModel(
        model=ldaModel, texts=lemmatizedTexts, dictionary=id2word, coherence='c_v', processes=0
    )

    return coherenceModel.get_coherence()
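# Usage sketch (assumed, not part of the original example): sweep the topic count on an
# already-lemmatized corpus and keep the most coherent setting.
# `lemmatized_texts` stands for your own list of token lists.
scores = {n: calcCoherence(lemmatized_texts, passes=20, nTopics=n) for n in range(2, 11)}
best_n = max(scores, key=scores.get)
print('best number of topics:', best_n, 'c_v =', scores[best_n])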
Example no. 3
def build_model(raw_file, ret_file):
    """
    :param raw_file:
    :param retfile:
    :return:
    """
    all_tweets = load_all_tweets(raw_file)
    # The single digit right after 'tweets_lda_' (11 characters) encodes the topic count.
    k = int(ret_file[ret_file.find('tweets_lda_') + 11])
    print('k={}'.format(k))
    idx2tweetid = []
    common_texts = []
    for key, tweet in all_tweets.items():
        idx2tweetid.append(key)
        tokens = tweet['cleaned'].split(' ')
        text = [token for token in tokens if token not in punc_words]
        common_texts.append(text)

    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    print('begin to train')
    lda_model = LdaModel(common_corpus,
                         id2word=common_dictionary,
                         num_topics=k,
                         random_state=13)
    pprint(lda_model.print_topics(num_words=20))
    print('\nPerplexity: ', lda_model.log_perplexity(common_corpus))

    with open(ret_file, 'w', encoding='utf-8') as fout:
        for i, tweetid in enumerate(idx2tweetid):
            # Topic distribution for this tweet, stored as {topic_id: probability}.
            lda_score = {str(topic_id): float(prob)
                         for topic_id, prob in lda_model[common_corpus[i]]}
            all_tweets[tweetid]['lda' + str(k)] = lda_score
            fout.write(json.dumps(all_tweets[tweetid]))
            fout.write('\n')
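# Hedged example call; `load_all_tweets`, `punc_words` and the naming convention (the digit
# right after 'tweets_lda_' selects k) come from the surrounding project, so the paths below
# are placeholders only.
build_model('data/tweets_cleaned.json', 'output/tweets_lda_5.jsonl')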
Example no. 4
    def __init__(self,
                 fname,
                 processes=None,
                 dictionary=None,
                 filter_namespaces=('0', )):
        """
        Initialize the corpus.

        Unless a dictionary is provided, an empty `Dictionary` is created here and
        filled in later when the corpus is scanned.
        """
        self.fname = fname
        self.filter_namespaces = filter_namespaces
        self.metadata = False
        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes
        if dictionary is None:
            self.dictionary = Dictionary([[]])
        else:
            self.dictionary = dictionary
Example no. 5

def main():
    client = MongoClient('localhost', 27017)
    db = client["discursoDB"]
    discursos = db["discursos"]

    corpus = []

    for disc in discursos.find():
        discurso_text = disc["Conteudo"]
        corpus.append(discurso_text)

    print(len(corpus))

    # Create a gensim corpus from the speeches (simple whitespace tokenisation)
    common_texts = [discurso.split() for discurso in corpus]
    common_dictionary = Dictionary(common_texts)

    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

    # Train the model on the corpus.
    lda = LdaMulticore(common_corpus, num_topics=10)
Example no. 6
def create_dictionaries(data, model, feature):
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
    w2idx = {v: k + 1 for k, v in gensim_dict.items()}
    w2idxl = {v.lower(): k + 1 for k, v in gensim_dict.items()}
    #w2vec = {word: model[word.lower()] for word in w2idx.keys()}
    w2vec = {}
    for word in w2idx.keys():
        if feature == 'bow':
            try:
                w2vec[word.lower()] = model[word]
            except KeyError:
                w2vec[word.lower()] = [0] * model.vector_size
        else:
            try:
                w2vec[word] = model[word]
            except KeyError:
                w2vec[word] = [0] * model.vector_size

    def parse_dataset(data, feature):
        for key in data.keys():
            if feature == 'bow':
                txt = data[key].lower().replace('\n', '').split()
            else:
                txt = data[key].replace('\n', '').split()
            new_txt = []
            for word in txt:
                try:
                    if feature == 'bow':
                        new_txt.append(w2idxl[word])
                    else:
                        new_txt.append(w2idx[word])
                except KeyError:
                    new_txt.append(0)
            data[key] = new_txt
        return data

    out = parse_dataset(data, feature)
    return w2idx, w2vec, out
Example no. 7

def create_dictionaries(model=None, combined=None):
    ''' This function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the training and testing dictionaries
        4- Returns the concatenated result of all word vectors
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        # Collect the vocabulary keys of the word2vec model
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Build the word => index mapping
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # Build the word => word-vector mapping
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                sentences = sentence.split(' ')
                for word in sentences:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # Pad with zeros so every sequence has the same length
        combined = sequence.pad_sequences(combined)
        global input_length
        input_length = len(combined[0])
        return w2indx, w2vec, combined
    else:
        print('error: the model or the combined data set is empty')
Example no. 8
def get_train_data(filename_list, target_qq=[], min_len=4):
    for filename in filename_list:
        if not os.path.isfile(filename):
            return
    
    current_qq = '--'
    is_target = False
    chat_record = []
    for filename in filename_list:
        with open(filename, 'r', encoding='utf-8') as history:
            for line in history:
                line = line.strip().replace('\n', '')
                for word in stopword:
                    line = line.replace(word, '')
                line = line.lower()
                if line.find('http') >= 0:
                    continue
                header = HEADER.match(line)
                if not header:
                    header = HEADER_MAIL.match(line)
                if header:
                    _, current_qq = header.groups()
                    if not target_qq:
                        is_target = current_qq in qq_filter
                    else:
                        is_target = current_qq in target_qq
                elif is_target and line:
                    line = standalize(line)
                    record = list(jieba.cut(line)) + ['<eos>']
                    record = [word for word in record if word != ' ']
                    if len(record) <= min_len:
                        continue
                    chat_record.append(record)
                    if len(chat_record) >= 100000:
                        break
    
    dictionary = Dictionary(chat_record)

    return dictionary, chat_record
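# Sketch of the intended downstream use (assumed): train word vectors on the extracted chat
# records. The file name and QQ id are placeholders; `HEADER`, `stopword`, `standalize` and
# `jieba` are expected from the surrounding module.
from gensim.models import Word2Vec

dictionary, records = get_train_data(['qq_history_2020.txt'], target_qq=['12345678'])
w2v_model = Word2Vec(sentences=records, min_count=2)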
Example no. 9
def create_dictionaries(model=None, combined=None):
    ''' This function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the training and testing dictionaries

    '''
    if (combined is not None) and (model is not None):
        # gensim Dictionary built from the word2vec vocabulary, model.vocab.keys()
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
        # Words with frequency below 10 map to index 0, hence the k + 1 offset
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # index of every word with frequency above 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vector of every word with frequency above 10, (word->model(word))

        def parse_dataset(combined):  # local helper (closure)
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)  # frequency below 10 -> 0
                data.append(new_txt)
            return data  # word => index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # each sentence becomes the indices of its words; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
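# Typical follow-up (a sketch, assuming a trained gensim Word2Vec `model`, tokenised sentences
# in `combined`, and `maxlen` defined at module level as the function expects): build the
# Keras embedding matrix from the returned mappings.
import numpy as np

w2indx, w2vec, padded = create_dictionaries(model=model, combined=combined)
n_symbols = len(w2indx) + 1                       # index 0 is reserved for out-of-vocabulary words
embedding_weights = np.zeros((n_symbols, model.vector_size))
for word, index in w2indx.items():
    embedding_weights[index, :] = w2vec[word]     # one row per known word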
Example no. 10
def create_dictionaries(model=None, combined=None):
    ''' This function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the training and testing dictionaries

    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Words with frequency below 10 map to index 0, hence the k + 1 offset
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # index of every word with frequency above 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vector of every word with frequency above 10, (word->model(word))

        def parse_dataset(combined):  # local helper (closure)
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)  # frequency below 10 -> 0
                data.append(new_txt)
            return data  # word => index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # each sentence becomes the indices of its words; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example no. 11
def create_dictionaries(model=None, combined=None):
    """
    This function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the training and testing dictionaries

    """
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()

        gensim_dict.doc2bow(reduce(lambda x, y: x + y, combined),
                            allow_update=True)

        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # index of every word with frequency above 10
        w2vec = {word: model[word]
                 for word in w2indx.keys()}  # word vector of every word with frequency above 10

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # each sentence becomes the indices of its words; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example no. 12
def create_dictionaries(model=None, combined=None):
    ''' This function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the training and testing dictionaries
        4- Returns the concatenated result of all word vectors
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        # Collect the vocabulary keys of the word2vec model
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # index of every word with frequency above 10
        w2vec = {word: model[word]
                 for word in w2indx.keys()}  # word vector of every word with frequency above 10

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                sentences = sentence.split(' ')
                for word in sentences:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined)  # each sentence becomes the indices of its words; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example no. 13
    def build_vocab(
            self,
            documents_tokens: List[List[str]]) -> Tuple[List[List[str]], Dict]:
        """
        Build vocabulary.

        :param documents_tokens: documents as list of tokens, e.g. [
            ['the', 'brown', 'fox'],
            ['another', 'word', ..],
            ...
        ]

        :returns: a tuple consisting of list of documents as word counts (Bag-of-words),
        and Id2Word dictionary.
        """
        LOGGER.info('Fitting bigram model..')
        bigram = Phrases(documents_tokens,
                         min_count=self.min_df,
                         threshold=100,
                         progress_per=100,
                         common_terms=self.stop_words)

        self.bigram_model = Phraser(bigram)

        LOGGER.info('Fitting trigram model..')
        self.trigram_model = Phraser(
            Phrases(bigram[documents_tokens], threshold=100))

        documents_trigrams = []

        LOGGER.info('Creating trigrams..')
        for index in range(len(documents_tokens) - 1, -1, -1):
            documents_trigrams.append(
                self.create_trigrams(documents_tokens[index]))
            documents_tokens.pop()

        id2word = Dictionary(documents_trigrams)
        return [id2word.doc2bow(text) for text in documents_trigrams], id2word
Example no. 14
def create_dictionaries(model=None,
                        combined=None):
    ''' This function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(),
                            allow_update=True)
        # the index of a word that has a word vector is never 0
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # integrate all the corresponding word vectors into the word vector matrix
        w2vec = {word: model[word] for word in w2indx.keys()}

        # a word without a word vector is indexed 0; return the index of each word
        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in list(sentence):
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # unify the length of the sentence with the pad_sequences function of keras
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        # return index, word vector matrix and the sentence with an unifying length and indexed
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example no. 15
    def buildDict(self, no_below=3, no_above=0.7, keep_n=5000):
        from gensim.corpora.dictionary import Dictionary
        if 'GENSIM_DICT' in self.config:
            no_below = int(self.config['GENSIM_DICT'].get('no_below', 3))
            no_above = float(self.config['GENSIM_DICT'].get('no_above', 0.7))
            keep_n = int(self.config['GENSIM_DICT'].get('keep_n', 5000))

        ori_pp_mode = copy.deepcopy(self.postProcessor.postProcessMethod)
        ori_go_postprocess = copy.deepcopy(self.goPoseprocessor)
        self.postProcessor.postProcessMethod = 'postProcess4Dict'
        self.goPoseprocessor = True
        self._reset_iter()
        #print(next(self))
        self.gensim_dict = Dictionary(self)
        self._reset_iter()
        self.postProcessor.postProcessMethod = ori_pp_mode
        self.goPoseprocessor = ori_go_postprocess
        self.gensim_dict.filter_extremes(no_below=no_below,
                                         no_above=no_above,
                                         keep_n=keep_n)

        if self.postProcessor:
            self.postProcessor.gensim_dict = self.gensim_dict
Example no. 16
def preprocessingGensim(observations):
    logging.info('Begin preprocessingGensim')
    
    
    observations['tf-idf'] = ""

    # Create a dictionary and a bag-of-words corpus from the lemmatized texts
    dictionary = Dictionary(observations["lemmatized"].tolist())
    corpus = [dictionary.doc2bow(text) for text in observations['lemmatized'].tolist()]

    # Create a new TfidfModel using the corpus
    tfidf = TfidfModel(corpus)

    for index, row in observations.iterrows():
        tfidf_weights = tfidf[corpus[index]]
        sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)
        # .at assigns the whole list of (token_id, weight) pairs to a single cell
        observations.at[index, 'tf-idf'] = sorted_tfidf_weights
        

    logging.info('End preprocessingGensim')
    return observations
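# Hedged usage sketch: `observations` is assumed to be a pandas DataFrame with a 'lemmatized'
# column of token lists and a default RangeIndex (the loop above indexes `corpus` by the
# DataFrame index); `logging`, `Dictionary` and `TfidfModel` come from the surrounding module.
import pandas as pd

observations = pd.DataFrame({'lemmatized': [['good', 'flight', 'crew'], ['late', 'flight', 'again']]})
observations = preprocessingGensim(observations)
print(observations['tf-idf'].iloc[0])     # [(token_id, weight), ...] sorted by weight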
Example no. 17
def lda_vector(dataset: list, refer_dictionary=None, refer_lda_model=None):

    if refer_dictionary is None:
        refer_docs = [
          [token for (i, token) in enumerate(sample['essay_lemma']) if sample['essay_is_stop'][i] is False
           and token not in [',', '.', '?']] for sample in dataset
        ]
        refer_dictionary = Dictionary(refer_docs)
        refer_doc2bow = [refer_dictionary.doc2bow(text) for text in refer_docs]
        refer_lda_model = LdaModel(corpus=refer_doc2bow, id2word=refer_dictionary, num_topics=10, dtype=np.float64, passes=10, minimum_probability=0.0)

    doc = [
        [token for (i, token) in enumerate(sample['essay_lemma']) if sample['essay_is_stop'][i] is False
         and token not in [',', '.', '?']] for sample in dataset
    ]
    doc_bow_s = [refer_dictionary.doc2bow(text) for text in doc]
    doc_vecs = [refer_lda_model[doc_bow] for doc_bow in doc_bow_s]

    for (sample, doc_vec) in zip(dataset, doc_vecs):
        for topic_prob in doc_vec:
            sample['topic'+str(topic_prob[0] + 1)] = topic_prob[1]

    return refer_dictionary, refer_lda_model
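# Minimal usage sketch (invented sample; the field names follow the function above): the first
# call fits the reference dictionary/LDA model and annotates each sample with topic1..topic10.
dataset = [
    {'essay_lemma': ['the', 'student', 'write', 'a', 'long', 'essay', '.'],
     'essay_is_stop': [True, False, False, True, False, False, False]},
]
refer_dictionary, refer_lda_model = lda_vector(dataset)
print({k: v for k, v in dataset[0].items() if k.startswith('topic')})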
Example no. 18
    def __init__(self,
                 fname,
                 processes=None,
                 lemmatize=utils.HAS_PATTERN,
                 dictionary=None):
        """
        Initialize the corpus. Unless a dictionary is provided, this scans the
        corpus once, to determine its vocabulary.

        If `pattern` package is installed, use fancier shallow parsing to get
        token lemmas. Otherwise, use simple regexp tokenization. You can override
        this automatic logic by forcing the `lemmatize` parameter explicitly.

        """
        self.fname = fname
        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes
        self.lemmatize = lemmatize
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
        else:
            self.dictionary = dictionary
Example no. 19
def topic_list(text):
    print('Topic modeling...')

    tokenizer = RegexpTokenizer(r'\w+')
    document = []
    for token in tokenizer.tokenize(text):
        word = token.lower()
        if word not in stop_words:
            document.append(word)
    documents = [document]

    dic = Dictionary(documents)
    corpus = [dic.doc2bow(doc) for doc in documents]

    lda = LdaModel(corpus, num_topics=5)

    topics = [
        dic[int(id)] for topic in lda.show_topics(formatted=False)
        for prob, id in topic
    ][:5]

    print(topics)
    return topics
Example no. 20
def create_corpus(documents,
                  field='text',
                  normalizing='lemmatize',
                  language=DEFAULTLANGUAGE):
    """
    :param documents: an iterable of documents (dictionaries)
    :param field: the field from which to extract data
    :param normalizing: if 'lemmatize' then perfoms word net lemmatization with the default pos noun ('n') NOTE: only supported for english
                        if 'stem' perform stemming with the porter stemmer
                        else uses the input words as they are.
    """
    print('Creating corpus ...')
    print('caching token represetation from documents ...')
    token_lists = [[
        word for word in generate_word(
            doc_data, normalize=normalizing, language=language)
    ] for doc_data in get_data_generator(documents, field=field)]

    vocabulary = Dictionary(token_lists)
    corpus = [vocabulary.doc2bow(token_list) for token_list in token_lists]
    #    gensim.corpora.MmCorpus.serialize('/tmp/lda.mm', corpus)

    return vocabulary, corpus
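# Hedged follow-up sketch: fit an LDA model on the returned vocabulary and corpus.
# `documents` stands for your own iterable of dictionaries with a 'text' field.
from gensim.models import LdaModel

vocabulary, corpus = create_corpus(documents, field='text')
lda = LdaModel(corpus=corpus, id2word=vocabulary, num_topics=10, passes=5)
print(lda.print_topics(num_words=8))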
Example no. 21
def get_corpus_dictionary():
    """Crafts a toy corpus and the dictionary associated."""
    corpus = [
        ['carrot', 'salad', 'tomato'],
        ['carrot', 'salad', 'dish'],
        ['tomato', 'dish'],
        ['tomato', 'salad'],

        ['car', 'break', 'highway'],
        ['highway', 'accident', 'car'],
        ['moto', 'break'],
        ['accident', 'moto', 'car']
    ]
    dictionary = Dictionary(corpus)

    # Transforming corpus with dictionary.
    corpus = [dictionary.doc2bow(doc) for doc in corpus]

    # Building reverse index.
    for (token, uid) in dictionary.token2id.items():
        dictionary.id2token[uid] = token

    return corpus, dictionary
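# Usage sketch: the toy corpus above is small enough to inspect the two expected themes
# (food vs. traffic). num_topics=2 and the seed are illustrative choices, not from the source.
from gensim.models import LdaModel

corpus, dictionary = get_corpus_dictionary()
lda = LdaModel(corpus, id2word=dictionary, num_topics=2, random_state=0, passes=10)
print(lda.print_topics(num_words=4))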
Example no. 22
    def buildDic(self, model=None, words=None):
        '''
        Build the dictionary.
        :param model:   the word2vec model
        :param words:   all texts after jieba word segmentation
        :return:        the index of every word (word -> index), the word vectors (word -> vector),
                        and the word indices corresponding to every sentence
        '''
        if (model is not None) and (words is not None):
            # Initialise a gensim dictionary
            gensim_dict = Dictionary()
            # model.wv.vocab.keys() holds every word known to word2vec; with allow_update=True
            # each occurrence of a word increases its frequency by one.
            # Convert to a bag-of-words model
            gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
            # Rebuild the mapping: the key is the word and the value its index (k is the index, v the word)
            w2indx = {v: k + 1 for k, v in gensim_dict.items()}
            # The key is the word, the value the corresponding word vector
            w2vec = {word: model[word] for word in w2indx.keys()}

            # Get the word indices corresponding to one sentence
            def parseDataset(words):
                data = []
                for sentence in words:
                    new_txt = []
                    for word in sentence:
                        try:
                            new_txt.append(w2indx[word])
                        except KeyError:
                            new_txt.append(0)
                    data.append(new_txt)
                return data

            combined = parseDataset(words)
            # Pad the variable-length sequences to a single length
            combined = sequence.pad_sequences(combined, maxlen=self.maxlen)
            return w2indx, w2vec, combined
        else:
            print("Failed to load the model or the data")
Example no. 23
def preprocess(tweets):
    # Get only negative ones (for this task)
    newTweets = tweets.copy()

    newTweets = remove_airline_tags(newTweets)
    newTweets.text = remove_links(newTweets.text)
    newTweets.text = lt_gt_conversion(
        ampersand_conversion(arrow_conversion(newTweets.text)))
    newTweets.text = with_without_conversion(newTweets.text)
    newTweets.text = hashtag_to_words(newTweets.text)
    newTweets = translate_all_emoji(newTweets)
    newTweets.text = remove_contractions(newTweets.text)
    newTweets.text = remove_punctuation(newTweets.text)
    newTweets.text = lemmatize_texts(newTweets.text)
    newTweets.text = remove_stopwords(newTweets.text)
    newTweets.text = newTweets.text.str.lower()
    texts = newTweets["text"].values

    # Tokenize and remove short words or filtered words
    tokenized_texts = []
    for text in texts:
        split_text = text.split()
        split_text = [
            word for word in split_text
            if len(word) > 2 and word not in FILTERED_WORDS
        ]
        tokenized_texts.append(split_text)

    # Create a dictionary for each word, and a bag of words
    text_dictionary = Dictionary(tokenized_texts)

    # Remove words that appear in more than 50% of tweets or in fewer than 5 tweets,
    # and keep at most half of the remaining vocabulary
    text_dictionary.filter_extremes(no_below=5,
                                    no_above=0.5,
                                    keep_n=len(text_dictionary) // 2)
    text_corpus = [text_dictionary.doc2bow(text) for text in tokenized_texts]
    return (text_dictionary, text_corpus)
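# Hedged sketch of the next step: `tweets` is assumed to be a pandas DataFrame with a 'text'
# column, and the cleaning helpers above (remove_links, lemmatize_texts, ...) come from the
# surrounding project.
from gensim.models import LdaModel

text_dictionary, text_corpus = preprocess(tweets)
lda = LdaModel(text_corpus, id2word=text_dictionary, num_topics=8, passes=10, random_state=1)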
Example no. 24

def create_dictionaries(model=None, combined=None):
    ''' This function does three things:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the training and testing dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Words with frequency below 10 map to 0, hence v -> k + 1
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # index of every word with frequency above 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vector of every word with frequency above 10, (word->model(word))

        def parse_dataset(combined):  # local helper: turn the words in combined into their indices
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)  # frequency below 10 -> 0
                data.append(new_txt)
            return data  # word => index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # each sentence becomes the indices of its words; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example no. 25
def create_dictionaries(model=None, combined=None):

    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Words with frequency below 10 map to index 0, hence the k + 1 offset
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # index of every word with frequency above 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vector of every word with frequency above 10, (word->model(word))

        def parse_dataset(combined):  # local helper (closure)
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)  # frequency below 10 -> 0
                data.append(new_txt)
            return data  # word => index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # each sentence becomes the indices of its words; words with frequency below 10 get index 0

        # f12 is a module-level file handle used for debugging output
        f12.write(str(combined))
        f12.write('\n')

        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example no. 26

def main():
    
    articles_path = '/texts_corrected/*.txt'
    stopword_path = '/stopwords.txt'
    resultspath = '/results/'
    location_path = '/locations.txt'
    tot_topic_vectors_path = resultspath + 'time200msc_topic_vectors_beta0_1.csv'
    tot_topic_mixtures_path = resultspath + 'time200msc_topic_mixtures_beta0_1.csv'
    tot_topic_shapes_path = resultspath + 'time200msc_topic_shapes_beta0_1.csv'
    tot_pickle_path = resultspath + 'time200iter_beta0_1.pickle'
    coherence_pickle_path = resultspath + 'coherence.pickle'
    seed_file = resultspath + '/seedwords.txt'
    
    tot = stot_model()
 
    
    articles,date,vocab = tot.initDataset(articles_path, stopword_path, location_path)
    
    ##save variable for coherence measures
    dictionary = Dictionary(articles)
    corpus = [dictionary.doc2bow(article) for article in articles]
    
    coherence_pickle = open(coherence_pickle_path, 'wb')
    pickle.dump(dictionary, coherence_pickle)
    pickle.dump(corpus, coherence_pickle)
    coherence_pickle.close()
    
    #resume with modelling process
    tot.init_seedwords(seed_file, vocab)  
    param = tot.initParam(articles, date, vocab)
    theta,phi,psi = tot.TopicsOverTimeGibbsSampling(param)
    np.savetxt(tot_topic_vectors_path, phi, delimiter=',')
    np.savetxt(tot_topic_mixtures_path, theta, delimiter=',')
    np.savetxt(tot_topic_shapes_path, psi, delimiter=',')
    tot_pickle = open(tot_pickle_path, 'wb')
    pickle.dump(param, tot_pickle)
    tot_pickle.close()
Example no. 27

def create_dictionaries(train=None, test=None, model=None):
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data):
            for key in data.keys():
                txt = data[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data[key] = new_txt
            return data

        train = parse_dataset(train)
        test = parse_dataset(test)
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')
Example no. 28

def create_dictionaries(model=None, combined=None):
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # index of every word with frequency above 10, (k->v)=>(v->k)
        f = open("word2index.txt", 'w', encoding='utf8')
        for key in w2indx:
            f.write(str(key))
            f.write(' ')
            f.write(str(w2indx[key]))
            f.write('\n')
        f.close()
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vector of every word with frequency above 10, (word->model(word))

        def parse_dataset(combined):
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data.append(new_txt)
            return data  # word => index

        combined = parse_dataset(combined)  # [[1,2,3...],[]]
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # each sentence becomes the indices of its words; words with frequency below 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example no. 29
    def buildDict(self):
        batchiter = BatchIterBert(self.trainDataIter,
                                  filling_last_batch=False,
                                  postProcessor=xonlyBatchProcessor,
                                  batch_size=1)
        common_dictionary = Dictionary(batchiter)
        print(len(common_dictionary))
        if self.testReaderargs:
            print('update vocab from test set')
            batchiter = BatchIterBert(self.testDataIter,
                                      filling_last_batch=False,
                                      postProcessor=xonlyBatchProcessor,
                                      batch_size=1)
            common_dictionary.add_documents(batchiter)
            print(len(common_dictionary))

        common_dictionary.filter_extremes(no_below=self.dict_no_below,
                                          no_above=self.dict_no_above,
                                          keep_n=self.dict_keep_n)
        self.dictProcess = DictionaryProcess(common_dictionary)
        self.postProcessor.dictProcess = self.dictProcess
        self.vocab_dim = len(self.dictProcess)
        self.have_dict = True

        if 1:  # report the average number of tokens per training document
            count_list = []
            self.trainDataIter._reset_iter()
            batchiter = BatchIterBert(self.trainDataIter,
                                      filling_last_batch=False,
                                      postProcessor=xonlyBatchProcessor,
                                      batch_size=1)
            for item in batchiter:
                current_count = sum(item)
                count_list.append(current_count)
                #print(current_count)
            print(sum(count_list) / len(count_list))
Example no. 30
    def __init__(self, data=None, dictionary=None):
        """ Initialize; `data` should be provided except when unpickling a class object. """
        self.data = data
        self.model = None
        self.num_topics = None
        self.iterations = None
        self.random_state = None
        self.dictionary = dictionary
        if self.data is not None:
            if self.dictionary is None:
                self.dictionary = Dictionary(self.data)
            self.corpus = [self.dictionary.doc2bow(text) for text in self.data]
        else:
            self.dictionary = None
            self.corpus = None
        # LdaModel hyperparameters, filled in later
        self.distributed = None
        self.chuncksize = None
        self.passes = None
        self.update_every = None
        self.alpha = None
        self.eta = None
        self.decay = None
        self.offset = None
        self.eval_every = None
        self.gamma_threshold = None
        self.minimum_probability = None
        self.ns_conf = None
        self.minimum_phi_value = None
        self.per_word_topics = None
        self.coherence_model = None
        self.coherence = None
        self.coherence_type = None