def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    corpus_fn = opts.corpus_file or sys.exit()
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file or sys.exit()

    with BZ2File(corpus_fn) as f:
        corpus = SublexicalizedCorpus(WikiCorpus(corpus_fn, processes=n_proc_parse, dictionary=Dictionary()),
                                      order=(3, 6), clean_func=normalize_whitespace, n_proc=n_proc_sublex,
                                      create_dictionary=False)

        if vocab_fn and os.path.exists(vocab_fn):
            logging.info("Loading vocabulary from %s" % vocab_fn)
            vocab = Dictionary.load(vocab_fn)
        else:
            logging.info("Creating vocabulary")

            start = time.clock()
            vocab = Dictionary(corpus.get_texts())
            end = time.clock()
            logging.info("Vocabulary created in %d seconds" % (end - start))

            if vocab_fn:
                logging.info("Saving dictionary to %s" % vocab_fn)
                vocab.save(vocab_fn)

        corpus.dictionary = vocab

        corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
        corpus.dictionary.compactify()

        if tfidf_fn and os.path.exists(tfidf_fn):
            logging.info("Reading TF-IDF model from %s" % tfidf_fn)
            tfidf = TfidfModel.load(tfidf_fn)
        else:
            logging.info("creating TF-IDF model")
            tfidf = TfidfModel(corpus)

            if tfidf_fn:
                logging.info("Saving TFF-IDF model to %s" % tfidf_fn)
                tfidf.save(tfidf_fn)

        bow_corpus = (tfidf[art] for art in corpus)

        model = LsiModel(corpus=bow_corpus, num_topics=10, id2word=corpus.dictionary)

        model.save(model_fn)
Example #2
def train_lda():
	"""
	Usage: python Wechat_LDA.py wechat.csv
	"""
	with open(sys.argv[1], 'r') as wx:
		for f in wx:
			seg = jieba.cut(f)
			seg = [word for word in seg if word not in stopwords]
			with codecs.open('wechat_seg.txt', encoding='utf-8', mode='ab') as wx_seg:
				wx_seg.write(' '.join(seg))

	documents = open('wechat_seg.txt', 'r')
	dictionary = corpora.Dictionary(LineSentence(documents))
	corpus = [dictionary.doc2bow(text) for text in LineSentence(documents)]
	tfidf_model = TfidfModel(corpus, id2word=dictionary, normalize=True)
	tfidf_model.save('wechat_seg.txt.tfidf_model')
	# corpora.MmCorpus.serialize('wechat_seg.txt.tfidf_model.mm', tfidf_model[corpus])
	lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=cpu_count()-1)
	lda_model.save('wechat_lda_model.pkl')

	topics = []
	for doc in corpus:
		topics.append(lda_model[doc])

	counts = np.zeros(100)
	for top_doc in topics:
		for ti, _ in top_doc:
			counts[ti] += 1

	words = lda_model.show_topic(counts.argmax(), 64)
	with open('top_words.txt', 'w') as tw:
		writer = UnicodeWriter(tw)
		for w in words:
			writer.writerow((w[0], int(float(w[1])*1000)))
def main(argv=None):
    if argv is None:
        argv = sys.argv

    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000, )
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')
    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #4
def make_corpus():
    corpus = MyCorpus()
    tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    num_terms = 400
    lsi_model = LsiModel(corpus_idf, id2word=corpus.dictionary, num_topics=num_terms)
    # corpora.MmCorpus.serialize('wiki_en_corpus.mm', corpus) # store to disk, for later use
    corpus.dictionary.save(os.path.join(HERE, "sogou.dict"))  # store the dictionary, for future reference
    tfidf_model.save(os.path.join(HERE, "sogou.model"))
    lsi_model.save(os.path.join(HERE, "sogou.lsi"))
    print "save dictionary and tfidf model"
    """    
def apply_tfidf(dictionary_path, mm_corpus_path):
    dictionary = Dictionary.load_from_text(dictionary_path)
    mm = MmCorpus(mm_corpus_path)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    MmCorpus.serialize('/home/andre/Develop/corpora/lsamodel_tfidf.mm',
                       tfidf[mm],
                       progress_cnt=10000)
Example #6
def process_records(records, fields, target, textmodel=None):
	tokenize = CountVectorizer().build_analyzer()

	input = None
	X = None
	y_labels = []

	for i, record in enumerate(records):
		nums = []
		strs = []
		y_labels.append(record.get(target))

		for field in fields:
			if is_number(record.get(field)):
				nums.append(record[field])
			else:
				strs.append(str(record.get(field) or "").lower())
		if strs:
			if input is None:
				input = StringIO.StringIO()
			print >> input, " ".join(tokenize(" ".join(strs)))
		if nums:
			if X is None:
				X = sp.lil_matrix((len(records),len(nums)))
			X[i] = np.array(nums, dtype=np.float64)

	if input is not None:
		if X is not None:
			X_2 = X.tocsr()
		else:
			X_2 = None

		if isinstance(textmodel,basestring):
			if textmodel == 'lsi':
				corpus = TextCorpus(input)
				textmodel = LsiModel(corpus, chunksize=1000)
			elif textmodel == 'tfidf':
				corpus = TextCorpus(input)
				textmodel = TfidfModel(corpus)
			elif textmodel == 'hashing':
				textmodel = None
				hasher = FeatureHasher(n_features=2 ** 18, input_type="string")
				input.seek(0)
				X = hasher.transform(tokenize(line.strip()) for line in input)
		if textmodel:
			num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs',[]))
			X = corpus2csc(textmodel[corpus], num_terms).transpose()

		if X_2 is not None:
			# print >> sys.stderr, "X SHAPE:", X.shape
			# print >> sys.stderr, "X_2 SHAPE:", X_2.shape
			X = sp.hstack([X, X_2], format='csr')

	elif X is not None:
		textmodel = None
		X = X.tocsr()

	print >> sys.stderr, "X SHAPE:", X.shape

	return X, y_labels, textmodel
Example #7
def create_movie_profile(movie_dataset):
    '''
    Use TF-IDF to analyse the movie tags and extract the top-N keywords for each movie.
    :param movie_dataset:
    :return:
    '''
    dataset = movie_dataset["tags"].values

    from gensim.corpora import Dictionary
    # Build the vocabulary from the dataset, counting word frequencies; every word
    # goes into a single dictionary and is looked up by its index
    dct = Dictionary(dataset)
    # For each record, return the corresponding word indices and frequencies
    corpus = [dct.doc2bow(line) for line in dataset]
    # Train the TF-IDF model, i.e. compute the TF-IDF weights
    model = TfidfModel(corpus)

    _movie_profile = []
    for i, data in enumerate(movie_dataset.itertuples()):
        mid = data[0]
        title = data[1]
        genres = data[2]
        vector = model[corpus[i]]
        movie_tags = sorted(vector, key=lambda x: x[1], reverse=True)[:30]
        topN_tags_weights = dict(map(lambda x: (dct[x[0]], x[1]), movie_tags))
        # Add the genre words as well, with a fixed weight of 1.0
        for g in genres:
            topN_tags_weights[g] = 1.0
        topN_tags = [i[0] for i in topN_tags_weights.items()]
        _movie_profile.append((mid, title, topN_tags, topN_tags_weights))

    movie_profile = pd.DataFrame(
        _movie_profile, columns=["movieId", "title", "profile", "weights"])
    movie_profile.set_index("movieId", inplace=True)
    return movie_profile
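
# A rough usage sketch for create_movie_profile above; the toy DataFrame is invented
# and only mimics the expected shape (index = movieId; columns ordered title, genres, tags),
# and the gensim/pandas imports of the original module are assumed to be in place.
import pandas as pd

toy_movies = pd.DataFrame(
    {
        "title": ["Toy Story", "Jumanji"],
        "genres": [["Animation", "Comedy"], ["Adventure", "Fantasy"]],
        "tags": [["pixar", "fun", "animation"], ["board", "game", "jungle"]],
    },
    index=pd.Index([1, 2], name="movieId"),
)
print(create_movie_profile(toy_movies))  # one row per movie: profile tags and their weights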
Example #8
class Cos():
    def __init__(self):
        self.tfidf = {}
        self.dict = Dictionary()

    def init(self, traindata, dict_path, tfidf_path):
        self.dict = Dictionary(traindata)  # fit dictionary
        corpus = [self.dict.doc2bow(line) for line in traindata]  # convert corpus to BoW format
        self.tfidf = TfidfModel(corpus)  # fit model
        self.dict.save(dict_path)
        self.tfidf.save(tfidf_path)


    def load(self, dict_path, tfidf_path):
        self.dict = Dictionary.load(dict_path)
        self.tfidf = TfidfModel.load(tfidf_path)
Example #9
def getSparseMatrixSimilarity(keyword, texts):

    # 1. Tokenise every text in the collection into a list of words
    texts = [jieba.lcut(text) for text in texts]

    # 2. Build the dictionary from the texts and get the number of dictionary features
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)

    # 3.1 Using the dictionary, convert the tokenised texts into sparse vectors (the corpus)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # 3.2 Likewise, convert the search keyword into a sparse vector with the dictionary
    kw_vector = dictionary.doc2bow(jieba.lcut(keyword))

    # 4. Create the TF-IDF model, training it on the corpus
    tfidf = TfidfModel(corpus)
    # 5. Apply the trained TF-IDF model to the indexed texts and to the search keyword
    tf_texts = tfidf[corpus]  # here the corpus itself serves as the set of indexed texts
    tf_kw = tfidf[kw_vector]
    # 6. Similarity computation
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    for e, s in enumerate(similarities, 1):
        print('similarity between kw and text%d: %.2f' % (e, s))

    print(sparse_matrix)
    print(similarities)
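
# A minimal usage sketch for the keyword search above; the sample sentences and the
# query are invented, and jieba plus the gensim classes used in the function are
# assumed to be imported as in the original module.
sample_texts = [
    '人工智能是计算机科学的一个分支',
    '机器学习是人工智能的重要方法',
    '今天的天气非常好',
]
# Prints the TF-IDF cosine similarity between the keyword and every text.
getSparseMatrixSimilarity('人工智能与机器学习', sample_texts)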
Example #10
    def _build_vocab(self, max_vocab_cnt):
        all_words = []
        for data in self.valid + self.non_valid:
            all_words.append(data["title"] + data["content"])
        vocab = Dictionary(all_words)
        raw_vocab_size = len(vocab)

        vocab.filter_extremes(no_below=5)
        vocab.filter_extremes(keep_n=max_vocab_cnt)
        len_1_words = list(
            filter(
                lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w) and w
                not in ["a", "i"] and True or False, vocab.values()))
        vocab.filter_tokens(list(map(vocab.token2id.get, len_1_words)))
        if self.config.use_dict == "seq" and self.config.enable_pad:
            vocab.token2id[PAD] = len(vocab)
            vocab.compactify()
            self.pad_wid = vocab.token2id.get(PAD)
        self.vocab_seq = vocab  # seq dictionary
        # build bow dictionary
        self.vocab_bow = copy.deepcopy(vocab)
        self.vocab_bow.filter_tokens(
            map(self.vocab_bow.token2id.get, STOPWORDS))  # filter stop words
        self.vocab_bow.compactify()
        if self.config.tfidf:
            tfidf_corpus = [self.vocab_bow.doc2bow(line) for line in all_words]
            self.tfidf_model = TfidfModel(tfidf_corpus)
        print("Load corpus with non_valid size %d, valid size %d, "
              "raw vocab size %d seq vocab size %d, bow vocab size %d" %
              (len(self.non_valid), len(self.valid), raw_vocab_size,
               len(self.vocab_seq), len(self.vocab_bow)))
Example #11
def cluster_data(state):
    # Bigram model
    data_words_bigrams = make_bigrams(state)
    INPUT = data_words_bigrams
    # Create Dictionary
    id2word = corpora.Dictionary(INPUT)
    # Create Corpus
    texts = INPUT
    # Filter out words that occur less than and greater than
    id2word.filter_extremes(no_below=state.no_below, no_above=state.no_above)
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    TOPICS_LIST = range(1, state.lda_topics + 1)
    lda_models = []
    coherence_scores = []
    for TOPICS in TOPICS_LIST:
        lda_model = run_LDA_model(corpus, id2word, TOPICS)
        lda_models.append(lda_model)

        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=data_words_bigrams,
                                             dictionary=id2word,
                                             coherence='c_v')
        score = coherence_model_lda.get_coherence()
        coherence_scores.append(score)

    return coherence_scores, lda_models, corpus
Example #12
def get_matrix_pinyin(pos_path="data/samples/positive.txt",
                      neg_path="data/samples/negative.txt"):
    from xpinyin import Pinyin
    dataset = []
    pin = Pinyin()
    with open(pos_path, encoding='utf8') as f:
        dataset += [
            pin.get_pinyin(line, '').split() for line in f if line != '\n'
        ]
        pos_len = len(dataset)
        print("positive matrix length", pos_len)
    with open(neg_path, encoding='utf8') as f:
        dataset += [
            pin.get_pinyin(line, '').split() for line in f if line != '\n'
        ]
        neg_len = len(dataset) - pos_len
        print("negative matrix length", neg_len)
    dct = Dictionary(dataset)
    print("dictionary length", len(dct))
    corpus = [dct.doc2bow(line) for line in dataset]
    model = TfidfModel(corpus)
    pos_matrix = np.zeros((pos_len, len(dct)))
    neg_matrix = np.zeros((neg_len, len(dct)))
    for i, line in enumerate(model[corpus][:pos_len]):
        for j, n in line:
            pos_matrix[i, j] = n
    for i, line in enumerate(model[corpus][pos_len:]):
        for j, n in line:
            neg_matrix[i, j] = n
    print("get matrix completed")
    return pos_matrix, neg_matrix
Example #13
    def compute_sim_matrix(self):
        '''    
        if(self.model_type.lower() == "fasttext"):
            model = FastText(self.questions) 
        else:
            model = Word2Vec(self.questions)
        '''
        self.dictionary = Dictionary(self.questions)
        self.tfidf = TfidfModel(dictionary=self.dictionary)
        word2vec_model = Word2Vec(self.questions,
                                  workers=cpu_count(),
                                  min_count=5,
                                  size=300,
                                  seed=12345)

        sim_index = WordEmbeddingSimilarityIndex(word2vec_model.wv)
        sim_matrix = SparseTermSimilarityMatrix(sim_index,
                                                self.dictionary,
                                                self.tfidf,
                                                nonzero_limit=100)
        bow_corpus = [
            self.dictionary.doc2bow(document) for document in self.questions
        ]

        tfidf_corpus = [self.tfidf[bow] for bow in bow_corpus]

        self.docsim_index = SoftCosineSimilarity(tfidf_corpus,
                                                 sim_matrix,
                                                 num_best=10)
Example #14
def sim_calculator(DF, column_name):

    print("Number of {}: {}".format(column_name,len(DF[column_name])))

    #Preprocessing
    print('\nCreating Dictionary...')
    processed_docs = DF[column_name].map(preprocess)

    #Generating dictionary
    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=100,no_above=0.9, keep_n=100000)
    print('Dictionary created')
    print("Size of vocabularly: ",len(dictionary))

    #Bag of words
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    #Tfidf vectorization
    print('\nRunning TFIDF vectorization...')
    model2 = TfidfModel(bow_corpus) 
    abs_tfidf=model2[bow_corpus]
    print('TFIDF complete')

    #Calculating similarities
    print('\nCalculating Similarity Matrix...')
    index = similarities.MatrixSimilarity(abs_tfidf)
    sims = index[abs_tfidf]
    print("size of similarity matrix: ", sims.shape)

    return sims
Example #15
def text_matching_tfidf(text, candidate_texts, top_n=1):
    """
    文本匹配:基于TF-IDF
    :param text:
    :param candidate_texts:
    :param top_n:
    :return:
    """
    text_cut = jieba.lcut(text)
    candidate_texts_cut = [jieba.lcut(item) for item in candidate_texts]
    # todo: 可以选择去一下停用词

    dct = Dictionary(candidate_texts_cut)
    dct_size = len(dct.token2id.keys())
    corpus_bow = [dct.doc2bow(item) for item in candidate_texts_cut]

    tfidf_model = TfidfModel(corpus_bow, dictionary=dct)
    corpus_tfidf = tfidf_model[corpus_bow]

    similarity = SparseMatrixSimilarity(corpus_tfidf, num_features=dct_size)

    text_bow = dct.doc2bow(text_cut)
    text_tfidf = tfidf_model[text_bow]
    cosine_similarities = similarity[text_tfidf]
    sims_argsort = (-cosine_similarities).argsort()[:top_n]

    return [(candidate_texts[idx], cosine_similarities[idx])
            for idx in sims_argsort]
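
# Hypothetical usage of text_matching_tfidf above; the candidate sentences and the query
# are invented, and jieba/Dictionary/TfidfModel/SparseMatrixSimilarity are assumed imported.
candidates = ['我想查询一下订单的物流状态', '怎样修改收货地址', '今天天气怎么样']
matches = text_matching_tfidf('订单物流查询', candidates, top_n=2)
print(matches)  # [(most similar candidate, score), (second best, score)]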
Example #16
def main():
    """
    Executes all the scripts defined above
    """
    nlp = spacy.load("en_core_web_sm")
    sop_df = pd.read_csv('data/interim/sop_types_valid.csv',
                         converters={
                             'juri': eval,
                             'filename': eval
                         })
    type_list = sop_df['type']

    try:
        calltaker_all = pd.read_csv('data/interim/calltaker_all.csv',
                                    converters={'sop': eval})
    except:
        calltaker_all = load_event_types_for_role(sop_df, type_list,
                                                  'call taker')

    save_df(calltaker_all, 'calltaker_all.csv')
    doc_term_bow, corpus, dictionary = get_dct_dtmatrix(
        nlp, calltaker_all['sop'])
    tfidf_type = TfidfModel(doc_term_bow)
    tfidf_mtx = bow2tfidf(doc_term_bow, tfidf_type)

    km_alltype = KMeans(n_clusters=87, random_state=911).fit(tfidf_mtx)
    type_topics_kmeans_tfidf = calltaker_all.copy()
    type_topics_kmeans_tfidf['cluster'] = km_alltype.labels_
    type_topics_kmeans_tfidf = type_topics_kmeans_tfidf.sort_values(
        by=['cluster', 'type', 'juri'], ignore_index=True)
    type_topics_kmeans_tfidf.to_csv(
        'data/interim/type_topics_kmeans_tfidf.csv', index=False)
Example #17
def build_tfidf_or_lsi(corpus, method='tfidf'):
    '''

    Builds a model for ranking documents.
    Input: a corpus of texts and a method ("tfidf" or "lsi").
    Output: a tuple of (the dictionary of terms in the corpus,
    the fitted model, and the similarity matrix).

    '''

    dictionary = Dictionary(corpus)
    corpus_bow = [dictionary.doc2bow(doc) for doc in corpus]
    model_tfidf = TfidfModel(corpus_bow)
    corpus_tfidf = [model_tfidf[doc] for doc in corpus_bow]
    simil_tfidf = MatrixSimilarity(corpus_tfidf)
    if method == 'tfidf':

        return dictionary, model_tfidf, simil_tfidf

    elif method == 'lsi':

        model_lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50)
        corpus_lsi = [model_lsi[doc] for doc in corpus_tfidf]  # project the TF-IDF-weighted docs, matching how the LSI model was trained
        simil_lsi = MatrixSimilarity(corpus_lsi)

        return dictionary, model_lsi, simil_lsi
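
# A rough usage sketch, assuming Dictionary, TfidfModel, LsiModel and MatrixSimilarity
# are imported as in the snippet above; the toy corpus and the query are invented.
toy_corpus = [
    ['human', 'computer', 'interaction'],
    ['survey', 'of', 'graph', 'minors'],
    ['computer', 'graph', 'trees'],
]
dictionary, model, index = build_tfidf_or_lsi(toy_corpus, method='tfidf')
query = model[dictionary.doc2bow(['computer', 'survey'])]
print(list(index[query]))  # cosine similarity of the query to each document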
Example #18
    def train(self):
        self.process_dataset(self.training_path, True)
        self.process_dataset(self.training_path, False)

        self.training_sources_length = len(self.sources)
        self.logger.debug(
            f'After train set processing: sources len {len(self.sources)}, labels len {len(self.labels)}'
        )

        self.process_dataset(self.test_path, True)
        self.process_dataset(self.test_path, False)

        self.logger.debug(
            f'After full processing: sources len {len(self.sources)}, labels len {len(self.labels)}'
        )

        corpus = Texts(self.sources).to_vector()
        dictionary = corpora.Dictionary(corpus)
        corpus = [dictionary.doc2bow(text) for text in corpus]
        model = TfidfModel(corpus)
        corpus = [text for text in model[corpus]]
        self.training_text_matrix = corpus2dense(corpus,
                                                 num_terms=len(
                                                     dictionary.token2id)).T

        if self.pca:
            self.training_text_matrix = self.pca.fit_transform(
                self.training_text_matrix)

        self.classifier.fit(
            self.training_text_matrix[:self.training_sources_length],
            self.labels[:self.training_sources_length])

        self.is_trained = True
Example #19
def main(JDK, url, title, query):
    dictionary = corpora.Dictionary.load(
        './TFIDF_Word2Vec/data/tfidf-w2v_dictionary.dict')
    tfidf = TfidfModel.load('./TFIDF_Word2Vec/data/tfidf.model')
    word2vec = gensim.models.keyedvectors.Word2VecKeyedVectors.load(
        './TFIDF_Word2Vec/data/word2vec.model')
    tfidf_w2v_model = models.keyedvectors.Word2VecKeyedVectors.load(
        './TFIDF_Word2Vec/data/tfidf-w2v.model')

    query_vec = get_tfidf_w2v_vec(query, dictionary, tfidf, word2vec)
    full_entity_score_vec = tfidf_w2v_model.similar_by_vector(query_vec,
                                                              topn=False)
    sort_sims = sorted(enumerate(full_entity_score_vec),
                       key=lambda item: -item[1])

    result = []
    for i in range(10):
        dic = {
            'url': url[sort_sims[i][0]].strip('\n'),
            'JDK': JDK[sort_sims[i][0]].strip('\n'),
            'title': title[sort_sims[i][0]].strip('\n'),
            'score': sort_sims[i][1]
        }
        result.append(dic)

    return result
def create_data(corpus_path):  # build the data: represent the texts first with doc2bow, then with a tfidf model
    sentences = []
    sentence_dict={}
    count=0
    for line in open(corpus_path):
       # print line
        line = line.strip().split('\t')
        # print(line)
        if len(line) == 2:
            sentence_dict[count]=line[1]
            count+=1
            sentences.append(line[1].split(' '))
        else:
            break
    # print(sentence_dict)
    print(sentences)
    # process the texts to obtain the vocabulary of the collection
    dictionary = corpora.Dictionary(sentences)
    # print(dictionary)
    # use the vocabulary to build the bag-of-words (doc2bow) representation of each text
    corpus = [dictionary.doc2bow(text) for text in sentences]
    print(corpus)
    # from the bag-of-words representation, compute the tfidf representation
    tfidf=TfidfModel(corpus)
    corpus_tfidf=tfidf[corpus]
    # print(corpus_tfidf)
    return sentence_dict,dictionary,corpus,corpus_tfidf
Example #21
 def fit(self, X, y=None):
     """
     Fit the model according to the given training data.
     """
     self.gensim_model = TfidfModel(corpus=X, id2word=self.id2word, dictionary=self.dictionary,
         wlocal=self.wlocal, wglobal=self.wglobal, normalize=self.normalize)
     return self
Example #22
 def setUp(self):
     self.cls = similarities.SoftCosineSimilarity
     self.tfidf = TfidfModel(dictionary=dictionary)
     similarity_matrix = scipy.sparse.identity(12, format="lil")
     similarity_matrix[dictionary.token2id["user"], dictionary.token2id["human"]] = 0.5
     similarity_matrix[dictionary.token2id["human"], dictionary.token2id["user"]] = 0.5
     self.similarity_matrix = similarity_matrix.tocsc()
Example #23
    def __init__(self, kp_archives_by_paperid, kp_archives_by_userid):

        self.dictionary = corpora.Dictionary()

        # self.bow_by_userid = defaultdict(Counter)
        # self.bow_by_paperid = defaultdict(Counter)

        self.all_documents = []

        self.kp_archives_by_paperid = kp_archives_by_paperid
        self.kp_archives_by_userid = kp_archives_by_userid

        for archive in self.kp_archives_by_paperid.values():
            for token_list in archive:
                self.dictionary.add_documents([token_list])
                self.all_documents += [token_list]

        for archive in self.kp_archives_by_userid.values():
            for token_list in archive:
                self.dictionary.add_documents([token_list])
                self.all_documents += [token_list]

        self.corpus_bows = [
            self.dictionary.doc2bow(doc) for doc in self.all_documents
        ]
        self.tfidf = TfidfModel(self.corpus_bows)
Example #24
def create_doc_term_matrix(docs,
                           id2word,
                           tfidf=False,
                           logentropy=False,
                           random_projections=False):
    doc_term_matrix = [id2word.doc2bow(doc) for doc in docs]
    _save_model2(doc_term_matrix, 'doc_term_matrix')

    if random_projections:
        rp_model = RpModel(corpus=doc_term_matrix,
                           id2word=id2word,
                           num_topics=params['num_topics'])
        doc_term_matrix = rp_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_random_projections')

    if tfidf:
        tfidf_model = TfidfModel(id2word=id2word,
                                 corpus=doc_term_matrix,
                                 normalize=True)
        doc_term_matrix = tfidf_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_tfidf')

    if logentropy:
        log_model = LogEntropyModel(corpus=doc_term_matrix, normalize=True)
        doc_term_matrix = log_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_logentropy')

    return doc_term_matrix
Example #25
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """\

    """
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    predicate = None  # Could be set to something like pred.origin_filter(pred.origin_germany)
    # 1. Create word dict
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load corpus
    mm = MmCorpus(bow_filename)
    # 4. Create TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF model
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
def tfidf_w2v_top5w(all_docs_prepro, id_dict):
    with open('../code/similarity/mappings/map_w2v_tfidf_5w.pkl', 'rb') as fp:
        Classes = pickle.load(fp)
    mapping = Classes['mapping']

    print('Loading Word2vec model')
    model_path = 'embedding/models/word2vec_all.model'
    model_w2v = Word2Vec.load(model_path)

    print('Loading Tfidf model')
    model_path = 'embedding/models/tfidf_all.model'
    model_tfidf = TfidfModel.load(model_path)

    dct = Dictionary(all_docs_prepro)
    corpus = [dct.doc2bow(line) for line in all_docs_prepro]

    mean_ticket_ques = top5_average('ticket_ques',
                                    corpus=corpus,
                                    dct=dct,
                                    model_w2v=model_w2v,
                                    model_tfidf=model_tfidf,
                                    id_dict=id_dict,
                                    all_docs_prepro=all_docs_prepro)

    return (mean_ticket_ques, mapping)
Example #27
 def __init__(self):
     self.host = 'localhost'
     self.port = 3306
     self.user = '******'
     self.password = '******'
     self.db = 'gaojiruangong'
     self.charset = 'utf8'
     db = pymysql.Connect(host=self.host,
                          port=self.port,
                          user=self.user,
                          passwd=self.password,
                          db=self.db,
                          charset=self.charset)
     cursor = db.cursor()
     query_sql = "SELECT id, api FROM apisamplecode"
     cursor.execute(query_sql)
     results = cursor.fetchall()
     all_api_name_set = set()
     for item in results:
         delete_left_brackets_api_name = item[1].split('(')[0]
         all_api_name_set.add(delete_left_brackets_api_name)
         api_name = delete_left_brackets_api_name.split('.')[-1].lower()
         api_id = item[0]
         if api_name in self.api_name_2_id.keys():
             self.api_name_2_id[api_name].append(api_id)
         else:
             self.api_name_2_id[api_name] = []
             self.api_name_2_id[api_name].append(api_id)
     self.all_qualified_api_name = list(all_api_name_set)
     self.dictionary = corpora.Dictionary.load(
         ROOT_DIR + '/output/model/tfidf/tfidf_dictionary.dict')
     self.index = similarities.Similarity.load(
         ROOT_DIR + '/output/model/tfidf/tfidf_index.index')
     self.tfidf = TfidfModel.load(ROOT_DIR +
                                  '/output/model/tfidf/tfidf.model')
Example #28
 def recomended_projects(self, request):
     projects = ProjectRequest.objects.all()
     project_keywords_dict = {}
     projects_dict = {}
     tags_list = []
     for project in projects:
         description = project.description
         description_keywords = get_keywords(description.replace('"', ''))
         tags = project.tags.replace('  ', ',').lower() 
         for keyword in description_keywords:
             tags += ',' + keyword[0].lower()
         tags_list.append(tags)
     df = read_frame(projects, fieldnames=['id', 'tags'], index_col=['id'])
     df['tags'] = tags_list
     keywords = df['tags'].tolist()
     keywords = [word_tokenize(keyword.lower()) for keyword in keywords]
     keywords = [no_commas(kw) for kw in keywords]
     processed_keywords = keywords
     dictionary = Dictionary(processed_keywords)
     corpus = [dictionary.doc2bow(doc) for doc in processed_keywords]
     tfidf = TfidfModel(corpus)
     sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
     top_3 = keywords_recommendation(all_projects=df, keywords=['uvg', 'gasolina', 'potente', 'mcdonald', 'mecanico', 'gg', 'carros'], number_of_hits=3, data=[dictionary, tfidf, sims])
     projects = []
     for id in top_3:
         projects.append(ProjectRequestSerializer(ProjectRequest.objects.get(pk=id)).data)
     return Response(projects)
Example #29
    def __init__(self, sentences_file, stopwords):
        self.dictionary = None
        self.corpus = None
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        documents = list()
        count = 0
        print "Gathering sentences and removing stopwords"
        for line in f_sentences:
            line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

            # remove stop words and tokenize
            document = [
                word for word in TreebankWordTokenizer().tokenize(line.lower())
                if word not in stopwords
            ]
            documents.append(document)
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")

        f_sentences.close()

        self.dictionary = corpora.Dictionary(documents)
        self.corpus = [self.dictionary.doc2bow(text) for text in documents]
        self.tf_idf_model = TfidfModel(self.corpus)

        # print(documents)
        print len(documents), "documents read"
        print len(self.dictionary), " unique tokens", self.dictionary
Example #30
def mergeTags():
    res = {}  # create an empty dict
    for i in range(len(displayArr)):
        texts = default_tags
        keyword = displayArr[i]
        # 1. Tokenise every text in the collection into a list of words
        texts = [lcut(text) for text in texts]
        # 2. Build the dictionary from the texts and get the number of dictionary features
        dictionary = Dictionary(texts)
        num_features = len(dictionary.token2id)
        # 3.1 Using the dictionary, convert the tokenised texts into sparse vectors (the corpus)
        corpus = [dictionary.doc2bow(text) for text in texts]
        # 3.2 Likewise, convert the search keyword into a sparse vector with the dictionary
        kw_vector = dictionary.doc2bow(lcut(keyword))
        # 4. Create the TF-IDF model, training it on the corpus
        tfidf = TfidfModel(corpus)
        # 5. Apply the trained TF-IDF model to the indexed texts and to the search keyword
        tf_texts = tfidf[corpus]  # here the corpus itself serves as the set of indexed texts
        tf_kw = tfidf[kw_vector]
        # 6. Similarity computation
        sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
        similarities = sparse_matrix.get_similarities(tf_kw)
        for e, s in enumerate(similarities, 1):
            if s > 0.5:
                # print(keyword, 'vs', ''.join(texts[e - 1]), 'similarity:', s)
                key = ''.join(texts[e - 1]).strip()
                res[key] = s
        arrSorted = sorted(res.items(), key=lambda item: item[1], reverse=True)
        for ind, (k, v) in enumerate(arrSorted):
            if ind == 0:
                ids = textsOld[i].strip().split('.')[0]
                textsOld[i] = textsOld[i] + '----------' + k
                # textsOld[i] = ids+'.'+k
        res = {}  # reset the dict for the next keyword
    return textsOld
Example #31
    def loadmodel(self, nameprefix):
        """ Load the topic model with the given prefix of the file paths.

        Given the prefix of the file paths, load the corresponding topic model. The files
        include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict),
        and a topic model (.gensimmodel). If weighing is applied, load also the tf-idf model (.gensimtfidf).

        :param nameprefix: prefix of the file paths
        :return: None
        :type nameprefix: str
        """
        # load the JSON file (parameters)
        parameters = json.load(open(nameprefix + '.json', 'rb'))
        self.nb_topics = parameters['nb_topics']
        self.toweigh = parameters['toweigh']
        self.algorithm = parameters['algorithm']
        self.classlabels = parameters['classlabels']

        # load the dictionary
        self.dictionary = Dictionary.load(nameprefix + '.gensimdict')

        # load the topic model
        self.topicmodel = gensim_topic_model_dict[self.algorithm].load(
            nameprefix + '.gensimmodel')

        # load the similarity matrix
        self.matsim = MatrixSimilarity.load(nameprefix + '.gensimmat')

        # load the tf-idf model
        if self.toweigh:
            self.tfidf = TfidfModel.load(nameprefix + '.gensimtfidf')

        # flag
        self.trained = True
Example #32
def processing_data():
    corpus = list(
        chain(*[
            chain([
                preprocess(thread["RelQuestion"]["RelQSubject"]),
                preprocess(thread["RelQuestion"]["RelQBody"])
            ], [
                preprocess(relcomment["RelCText"])
                for relcomment in thread["RelComments"]
            ]) for thread in api.load(
                "semeval-2016-2017-task3-subtaskA-unannotated")
        ]))

    dictionary = Dictionary(corpus)
    tfidf = TfidfModel(dictionary=dictionary)
    w2v_model = Word2Vec(corpus,
                         workers=cpu_count(),
                         min_count=5,
                         size=300,
                         seed=12345)
    similarity_matrix = w2v_model.wv.similarity_matrix(dictionary,
                                                       tfidf,
                                                       nonzero_limit=100)
    pickle.dump(dictionary,
                open(r'C:\Code\201810\Similarity\data\dic_path', 'wb+'))  # dictionary
    pickle.dump(similarity_matrix,
                open(r'C:\Code\201810\Similarity\data\similarity_matrix_path',
                     'wb+'))  # similarity matrix
    pickle.dump(tfidf, open(r'C:\Code\201810\Similarity\data\tfidf_path',
                            'wb+'))  #tfidf
def remove_rare_often_word(texts, low_value, high_value):
    #removing frequent and rare words
    texts_tokenized = [simple_preprocess(doc) for doc in texts]
    dictionary = Dictionary(texts_tokenized)
    corpus = [dictionary.doc2bow(doc) for doc in texts_tokenized]

    tfidf = TfidfModel(corpus, id2word=dictionary)
    corpus_tfidf = tfidf[corpus]

    bad_words = []
    for sent_tfidf in tqdm(corpus_tfidf, desc="selecting bad words"):
        bad_words += [
            id for id, value in sent_tfidf
            if (value < low_value) or (value > high_value)
        ]

    dictionary.filter_tokens(bad_ids=bad_words)

    out_bow = [dictionary.doc2bow(doc) for doc in texts_tokenized]

    out_corpus = []
    for doc in tqdm(out_bow, desc='Creating out corpus'):
        out_corpus.append([dictionary.get(id) for id, value in doc])

    dict_tfidf = {
        dictionary.get(id): value
        for doc in corpus_tfidf for id, value in doc
        if (value >= low_value) and (value <= high_value)
    }

    return {
        'texts': out_corpus,
        'dict_tfidf': dict_tfidf,
        'dictionary': dictionary
    }
Example #34
 def setUp(self):
     self.documents = [
         [u"government", u"denied", u"holiday"],
         [u"government", u"denied", u"holiday", u"slowing", u"hollingworth"]]
     self.dictionary = Dictionary(self.documents)
     self.tfidf = TfidfModel(dictionary=self.dictionary)
     self.index = UniformTermSimilarityIndex(self.dictionary, term_similarity=0.5)
Example #35
    def load_data(self):

        if not self.tf_idf_model:
            if not os.path.exists(self.tf_idf_model_path):
                raise Exception('TF-IDF model file not found')

            self.dictionary = Dictionary.load(self.dictionary_path)
            self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path)
def build_tfidf(corpus_dir,model_filename):
    stemmer = nltk.stem.PorterStemmer()
    corpus = PlaintextCorpusReader(corpus_dir, '.*\.txt$')  # a memory-friendly iterator
    dictionary = corpora.Dictionary()

    bigram_transformer = Phrases(TextCorpus(corpus))

    for myfile in corpus.fileids():
        try:
            chunks = bigram_transformer[[word.lower() for word in corpus.words(myfile)]]
            dictionary.add_documents([[stemmer.stem(chunk) for chunk in chunks]])

        except Exception as e:
            print 'Warning error in file:', myfile

    model = TfidfModel(BowCorpus(corpus,dictionary,bigram_transformer), id2word=dictionary)
    model.save(model_filename)
Example #37
    def fit(self, raw_documents, y=None):
        self.analyzer_func = self.build_analyzer()

        self.model = LsiModel.load(self.model_fn)

        if os.path.exists(self.model_fn + '.tfidf'):
            self.tfidf = TfidfModel.load(self.model_fn + '.tfidf')

        return self
 def __init__(self):
     self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
     self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
     self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
     self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
     self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
     self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
     self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
     self.job_labels = {
         int(k): v
         for k, v in (line.split("=") for line in open(app.config["RCMDR_JOB_LABELS"]).read().strip().split("\n"))
     }
Example #39
def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # load back the id->word mapping directly from file
    fin = path.join(datadir, "reuters21578.dict.txt")
    vocabulary = Dictionary.load_from_text(fin)

    # load the corpus
    fin = path.join(datadir, "reuters21578.mm")
    mm = MmCorpus(fin)

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=vocabulary, normalize=True)

    # save the TfidfModel instance to file
    fout = path.join(datadir, "reuters21578.tfidf.model")
    tfidf.save(fout)

    # save TF-IDF vectors in matrix market format
    fout = path.join(datadir, "reuters21578.tfidf.mm")
    MmCorpus.serialize(fout, tfidf[mm], progress_cnt=10000)
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
def build_tfidf_model(data_directory, corpus_path, wiki_text_output_path, model_output_path, multiwords=True, druid_cutoff_score=0.3):

    stemmer = nltk.stem.PorterStemmer()
    tokenid_dictionary = corpora.Dictionary()

    if not exists(wiki_text_output_path):
        logger.info('Converting ' + str(corpus_path) + ' into plain text file: ' + wiki_text_output_path)
        # Convert Wikipedia XML dump into .txt format
        wikidump2text.convert(corpus_path, wiki_text_output_path)
    else:
        logger.info('Found ' + wiki_text_output_path + ', not converting from the raw bz2 file.')

    # Load Multiword Expressions as Dictionary
    stopwords_path = join(data_directory, 'stopwords_en.txt')
    
    if multiwords:
        druid_path = join(data_directory, 'druid_en.bz2')
        druid_dict = druid.DruidDictionary(druid_path, stopwords_path, cutoff_score=druid_cutoff_score)
        logger.info('Loaded Druid with cutoff' + str(druid_cutoff_score))
    else:
        druid_dict = None

    logger.info("Building tfidf model...")
    start_time = time.time()

    if multiwords:
        logger.info('Using druid_en.bz2 in  ' + data_directory + ' as multiword dictionary.')
        articles = TextCorpus(wiki_text_output_path, druid_dict, multiwords=True)  # a memory-friendly iterator
    else:
        logger.info('Using no multiword dictionary, just single words')
        articles = TextCorpus(wiki_text_output_path, None, multiwords=False)
    
    tokenid_dictionary.add_documents(articles)


    model = TfidfModel(BowCorpus(wiki_text_output_path, druid_dict, tokenid_dictionary, multiwords=multiwords), id2word=tokenid_dictionary)
    model.save(model_output_path)

    logger.info("Finished building tfidf model. Time needed: " + str(time.time() - start_time))
Example #42
def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating speech serialized corpus')
    # Create the speech corpus, it is inside the rawfile as a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)
    # We also need to make sure that the article ids are saved in the correct
    # format so that the gensimple engine can understand it, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    counter = 0
    for key, value in speech_dict.items():
        texts.append([token for token in value['text']])
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
        counter += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')
    # # Now run LSI on TDIDF
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #43
    def construct_model(self, documents):
        logging.basicConfig(
            format='%(asctime)s:%(levelname)s:%(message)s',
            level=logging.INFO
        )

        logging.info("Obtaining word tokens")
        tokens = [self.get_tokens(document) for document in documents]
        # self.tf_idf_model = TfidfModel(tokens)

        logging.info("Constructing dictionary")
        self.dictionary = Dictionary(tokens)
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
        self.dictionary.compactify()
        self.dictionary.save(self.dictionary_path)

        logging.info("Constructing TF-IDF model")
        self.tf_idf_model = TfidfModel(dictionary=self.dictionary)
        self.tf_idf_model.save(self.tf_idf_model_path)
Example #44
    def __init__(self, model_prefix = None, num_best = None):
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        self.similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')

        #logger.info("ESA: Preloading reverse indexes...")
        #self.similarity_index.preload_reverse_index()

        logger.info("ESA: Finished loading model files.")
Example #45
def main(train, model, dic):
    logging.basicConfig(level=logging.INFO)
    corpus = SentenceDocCorpus(train)
    tfidf = TfidfModel(corpus)
    tfidf.save(model)
    corpus.dictionary.save(dic)
Example #46
#print 'Saved dictionary'

print('Creating LogEntropy TF-IDF and regular TF-IDF matrices and models')
BOW_corpus = MmCorpus('../data/wiki_en_vocab200k') #Resurrect BOW corpus

#log_entropy = LogEntropyModel(BOW_corpus)
#log_entropy.save('../models/logEntropy.model') #already provided
log_entropy = LogEntropyModel.load('../models/logEntropy.model')
corpora.mmcorpus.MmCorpus.serialize('../data/log_entropy_matrix',
log_entropy[BOW_corpus])

print('Saved LogEntropy TF-IDF matrix')

#tfidf = TfidfModel(BOW_corpus)
#tfidf.save('../models/tfidf.model') #already provided
tfidf = TfidfModel.load('../models/tfidf.model')
corpora.mmcorpus.MmCorpus.serialize('../data/tfidf_matrix',
tfidf[BOW_corpus])

print('Saved TF-IDF matrix')

print('Creating Similarity Index')
logent_corpus = MmCorpus('../data/log_entropy_matrix')
num_feat = len(wiki.dictionary.keys())
index = Similarity('../data/logEntropyShards/logEntropySimilarity',
logent_corpus, num_features=num_feat)

index.save('../data/logEntropyShards/logEntropySimilarityIndex')
print('Saved Shards and similarity index')

print('Getting list of titles...')
def data_directory():
    return os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')

corpus_dir = os.path.join(data_directory(), 'audio_transcripts')
model_filename = os.path.join(data_directory(), 'conversation.tfidf')

stemmer = nltk.stem.PorterStemmer()
corpus = PlaintextCorpusReader(corpus_dir, '.*\.txt$')  # a memory-friendly iterator
dictionary = corpora.Dictionary()

# Train bigram transformer
class TextCorpus(object):
    def __iter__(self):
        for file in corpus.fileids():
            yield [word.lower() for word in corpus.words(file)]

bigram_transformer = Phrases(TextCorpus())

for file in corpus.fileids():
    chunks = bigram_transformer[[word.lower() for word in corpus.words(file)]]
    dictionary.add_documents([[stemmer.stem(chunk) for chunk in chunks]])


class BowCorpus(object):
    def __iter__(self):
        for file in corpus.fileids():
            chunks = bigram_transformer[[word.lower() for word in corpus.words(file)]]
            yield dictionary.doc2bow([stemmer.stem(chunk) for chunk in chunks])

model = TfidfModel(BowCorpus(), id2word=dictionary)
model.save(model_filename)
Example #48
#			corpus.save(f_bow)
	else: # models will be trained on your own corpus
		if os.path.exists(f_bow):
			corpus = TextCorpus.load(f_bow)
		else:
			corpus = TextCorpus(f_corpus)
#			corpus.save(f_bow)

	# filter dictionary
	corpus.dictionary.filter_extremes(no_below=0, no_above=1, keep_n=voc_size)
	corpus.dictionary.save(f_dict)
	corpus.save(f_bow)

	# tf-idf model
	if os.path.exists(f_tfidf):
		tfidf = TfidfModel.load(f_tfidf)
	else:
		tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
		tfidf.save(f_tfidf)

	# TRAINING

	# lsa model
	if not os.path.exists(f_lsa):
		lsa = LsiModel(tfidf[corpus], id2word=corpus.dictionary, num_topics=lsa_dim)
		lsa.save(f_lsa)

	# word2vec model
	class MyCorpus():
		def __iter__(self):
			for d in corpus.get_texts():
    elif not opts.scaling:
        scaling = None
    else:
        raise ValueError("Only tfidf scaling is supported")

    word_model = opts.word_model

    if word_model:
        logging.info("Building word model")
        corpus = LimitCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), word_limit)
    else:
        corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), order=order, word_limit=word_limit)

    voc = Dictionary(corpus)
    voc.filter_extremes(no_below=cutoff)
    voc.compactify()

    bow_corpus = (voc.doc2bow(art) for art in corpus)

    tfidf = None

    if scaling == 'tfidf':
        tfidf = TfidfModel(bow_corpus)
        bow_corpus = (tfidf[voc.doc2bow(art)] for art in corpus)

    model = LsiModel(corpus=bow_corpus, num_topics=num_topics, id2word=voc)
    model.save(model_fn)

    if tfidf:
        tfidf.save(model_fn + '.tfidf')
Example #50
class TfidfVectorizer():
    """
    Transform text to tf-idf representation
    """

    def __init__(self):

        self.base_path = os.path.dirname(__file__)
        self.dictionary_path = os.path.join(self.base_path, "dictionary")
        self.tf_idf_model_path = os.path.join(self.base_path, "tfidf")

        self.stemmer = NepStemmer()
        self.tf_idf_model = None

    def get_tokens(self, document):
        if not self.stemmer:
            raise Exception("Stemmer not available")

        return self.stemmer.get_stems(document)

    def construct_model(self, documents):
        logging.basicConfig(
            format='%(asctime)s:%(levelname)s:%(message)s',
            level=logging.INFO
        )

        logging.info("Obtaining word tokens")
        tokens = [self.get_tokens(document) for document in documents]
        # self.tf_idf_model = TfidfModel(tokens)

        logging.info("Constructing dictionary")
        self.dictionary = Dictionary(tokens)
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
        self.dictionary.compactify()
        self.dictionary.save(self.dictionary_path)

        logging.info("Constructing TF-IDF model")
        self.tf_idf_model = TfidfModel(dictionary=self.dictionary)
        self.tf_idf_model.save(self.tf_idf_model_path)

    def load_data(self):

        if not self.tf_idf_model:
            if not os.path.exists(self.tf_idf_model_path):
                raise Exception('TF-IDF model file not found')

            self.dictionary = Dictionary.load(self.dictionary_path)
            self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path)

    def doc2vector(self, document):
        """ Returns the sparse tf-idf vector for given document """

        tokens = self.get_tokens(document)
        bag_of_words = self.dictionary.doc2bow(tokens)

        return (self.tf_idf_model[bag_of_words])

    def obtain_feature_vector(self, document):
        """
        Returns a single dense tf-idf vector for a given document
        """

        self.load_data()

        no_of_features = len(self.tf_idf_model.idfs)

        tf_idf_vector = matutils.sparse2full(
            self.doc2vector(document),
            no_of_features
        ).reshape(1, -1)

        return tf_idf_vector

    def obtain_feature_matrix(self, documents):
        """
        Returns the tf-idf dense matrix for the given documents
        """

        self.load_data()

        input_matrix_sparse = [
            self.doc2vector(x)
            for x in documents
        ]

        no_of_features = len(self.tf_idf_model.idfs)

        input_matrix = matutils.corpus2dense(
            input_matrix_sparse,
            no_of_features
        ).transpose()

        return input_matrix
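
# A rough usage sketch for the TfidfVectorizer above. `load_documents` is a hypothetical
# helper that returns a list of raw text strings; NepStemmer and the gensim imports of
# the original module are assumed to be available.
vectorizer = TfidfVectorizer()
documents = load_documents()              # hypothetical loader, many raw documents
vectorizer.construct_model(documents)     # builds and saves the dictionary and the tf-idf model
vector = vectorizer.doc2vector(documents[0])
print(vector)                             # sparse [(term_id, tf-idf weight), ...]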
    # Remove stop words (additional removal of common words used in spoken language)
    stop_ids = []
    with open(stop_words_file, 'r') as infile:
        for line in infile:
            try:
                stop_ids.append(wiki.dictionary.token2id[line.lower().strip()])
            except KeyError:
                continue
    wiki.dictionary.filter_tokens(bad_ids=stop_ids)

    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)

# In[18]:

df.tokens.values[0:3]


# In[22]:

d = Dictionary.from_documents(df.tokens)
d


# In[20]:

tfidf = TfidfModel(dictionary=d)  # the Dictionary goes in via the keyword argument, not as a corpus


# *Hint-Hint:* `gensim` is sprinting this week at PyCon!

# In[24]:

get_ipython().magic(u'pinfo TfidfModel')


# In[26]:

TfidfModel(df.txt)


# In[27]:
Example #53
def scorer(model, dic):
    tfidf = TfidfModel.load(model)
    dictionary = Dictionary.load(dic)
    def score(words):
        return tfidf[dictionary.doc2bow(words)]
    return score
    if len(sys.argv) < 2:
        print(inspect.cleandoc(__doc__) % locals())
        sys.exit(1)
    model_prefix = sys.argv[1]

    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("Loading word dictionary...")
    dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')
    logger.debug(dictionary)

    logger.info("Loading document name map...")
    article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

    logger.info("Loading tf-idf model...")
    tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

    logger.info("Loading similarity index...")
    similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')
    similarity_index.use_reverse_index = True

    logger.info("Finished loading model files.")

    mismatches = 0
    for doc_idx in range(0, len(similarity_index)):
        logger.info("Checking doc: %d %s" % (doc_idx, article_dict[doc_idx]))
        rev_doc = scipy.sparse.dok_matrix((1, len(dictionary)), dtype=np.float64)
        fwd_doc = similarity_index.vector_by_id(doc_idx)
        for feature_id, val in enumerate(fwd_doc.toarray().flatten()):
            if val == 0: continue
            feat_rev_docs = similarity_index.docs_by_feature_id(feature_id).toarray().flatten()

class TextCorpus(object):
    def __init__(self, filename):
        self.corpus = codecs.open(filename, 'r', encoding='utf-8')

    def __iter__(self):
        # One line contains one wiki article.
        for line in self.corpus:
            ngrams = druid_dict.find_ngrams(line.lower().split())
            yield [stemmer.stem(token) for token in ngrams]

articles = TextCorpus(wiki_text_output_path)  # a memory-friendly iterator
dictionary.add_documents(articles)


class BowCorpus(object):
    def __init__(self, filename):
        self.corpus = codecs.open(filename, 'r', encoding='utf-8')

    def __iter__(self):
        for line in self.corpus:
            ngrams = druid_dict.find_ngrams(line.lower().split())
            stemmed_article = [stemmer.stem(token) for token in ngrams]
            yield dictionary.doc2bow(stemmed_article)

model = TfidfModel(BowCorpus(wiki_text_output_path), id2word=dictionary)
model.save(model_output_path)

logger.info("Finished building tfidf model. Time needed: " + str(time.time() - start_time))
# What about the raw, unprocessed unicode tweet text itself?

# In[6]:

import gzip
with gzip.open(os.path.join(DATA_PATH, 'datetimes.csv.gz'), 'rb') as f:
    nums = pd.read_csv(f, engine='python', encoding='utf-8')
with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f:
    corpus = pd.DataFrame.from_csv(f, encoding='utf8')


# Now load previously compiled vocabulary and TFIDF matrix (transformation)

# In[11]:

tfidf = TfidfModel.load(os.path.join(DATA_PATH, 'tfidf'))
tfidf.num_docs


# In[17]:

bows = pd.Series(vocab.doc2bow(toks) for toks in corpus.tokens)
bows


# This would make a nice, compact sparse matrix representation of our entire corpus...  
# Which would mean we could do more in RAM at once.  
# Left as an exercise.  (check out `scipy.sparse.coo_matrix`)  
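
# One possible shape of that exercise, sketched with scipy.sparse.coo_matrix
# (a rough sketch assuming the `bows`, `tfidf` and `vocab` objects from the earlier cells):

import scipy.sparse

rows, cols, vals = [], [], []
for doc_idx, bow in enumerate(bows):
    for term_id, weight in tfidf[bow]:   # TF-IDF-weighted term for this document
        rows.append(doc_idx)
        cols.append(term_id)
        vals.append(weight)

corpus_matrix = scipy.sparse.coo_matrix(
    (vals, (rows, cols)), shape=(len(bows), len(vocab))
).tocsr()
corpus_matrix.shape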

# In[18]:
Example #57
        self.dictionary.filter_extremes(no_below=1, keep_n=5000)  # check API docs for pruning params
        self.dictionary.save_as_text("wiki_en_wordids.txt")

    def __iter__(self):
        for tokens in iter_documents():
            yield self.dictionary.doc2bow(tokens)


corpus = MyCorpus()  # create a dictionary
corpora.MmCorpus.serialize("wiki_en_corpus.mm", corpus)  # store to disk, for later use

# for vector in corpus: # convert each document to a bag-of-word vector
#    print vector

print "Create models"
tfidf_model = TfidfModel(corpus)
tfidf_model.save("wiki_en_tfidf.model")

# lsi_model = LsiModel(corpus)

# topic_id = 0
# for topic in lsi_model.show_topics():
#    topic_id+=1
#    print "TOPIC (LSI) " + str(topic_id) + " : " + topic

# lsi_model.print_topic(20, topn=10)
# corpus_lsi = lsi_model[corpus]

corpus_tfidf = tfidf_model[corpus]

lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300)
Example #58
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program
    wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
    # only keep the most frequent words (out of total ~8.2m unique tokens)
    wiki.dictionary.filter_extremes(no_below=20, keep_n=DEFAULT_DICT_SIZE)
    wiki.dictionary.save_as_text(outp + '_wordids.txt')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt')

    # build tfidf, ~50min
    tfidf = TfidfModel(wiki, normalize=True)
    tfidf.save('tfidf_all_words')



    logger.info("finished running %s" % program)
Example #59
    if len(sys.argv) < 3:
        print(inspect.cleandoc(__doc__) % locals())
        sys.exit(1)
    input_file, output_prefix = sys.argv[1:3]

    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("Loading word dictionary...")
    dictionary = Dictionary.load_from_text(output_prefix + '_wordids.txt.bz2')
    logger.debug(dictionary)

    logger.info("Loading document name map...")
    article_dict = utils.unpickle(output_prefix + '_bow.mm.metadata.cpickle')

    logger.info("Loading tf-idf model...")
    tfidf = TfidfModel.load(output_prefix + '.tfidf_model')

    logger.info("Loading similarity index...")
    similarity_index = Similarity.load(output_prefix + '_similarity.index', mmap='r')
    similarity_index.use_reverse_index = True
    similarity_index.preload_reverse_index()

    logger.info("Finished loading model files.")

    logger.info("Processing input documents...")

    try:
        infile = open(input_file, 'r')
    except IOError:
        print('cannot open %s' % (input_file,))
        sys.exit(1)