Example #1
def get_tfidf_model():
    if os.path.isfile(TFIDF_FILE):
        return TfidfModel.load(TFIDF_FILE)
    else:
        model = TfidfModel(get_corpus(), get_dictionary())
        model.save(TFIDF_FILE)
        return model
class TFIDFmodel(object):
    def __init__(self):
        self.inner_model = None

        # load dictionary and corpus
        vocabulary = "raw"
        corpora_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'corpora'])
        self.dictionary = corpora.Dictionary.load(os.path.join(corpora_folder, "%s.dict" % (vocabulary,)))
        self.corpus = corpora.MmCorpus(os.path.join(corpora_folder, "%s.mm" % (vocabulary,)))

        # parameters
        self.dataset = "CASEREPORT"

        # data file path
        models_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'models'])
        filename = "TFIDF_%s" % (self.dataset, )
        self.filepath = os.path.join(models_folder, filename)
        model_exists = os.path.isfile(self.filepath)

        if model_exists:
            logging.info("found data file %s" % (self.filepath, ))
            self.inner_model = TfidfModel.load(self.filepath)
        else:
            self.inner_model = TfidfModel(corpus=self.corpus)
            self.inner_model.save(self.filepath)

    def __contains__(self, item):
        return item in self.inner_model
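
A minimal usage sketch for this example. It assumes the imports below plus the project-specific pieces the snippet relies on (TFIDF_FILE, get_corpus(), get_dictionary(), and the data/corpora and data/models folders holding the "raw" dictionary and corpus):

import logging
import os

from gensim import corpora
from gensim.models import TfidfModel

logging.basicConfig(level=logging.INFO)

# TFIDFmodel loads the cached model if present, otherwise trains and saves it.
model = TFIDFmodel()
bow = model.dictionary.doc2bow(["patient", "fever", "treatment"])
print(model.inner_model[bow])  # sparse (token_id, tf-idf weight) pairs
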
Example #3
def cal_tfidf(documents, topk=10) -> List:
    """
    Train a tf-idf model over the given documents.
    :param documents: documents to train on, each a whitespace-separated string
    :param topk: number of top-scoring words to keep per document; if topk exceeds the number of words extracted, all words are returned
    :return: for each document, a list of its top-k (word, tf-idf score) pairs
    """
    # split each document into a list of tokens
    docs = [[word for word in document.split(' ')] for document in documents]
    # build the dictionary
    dictionary = corpora.Dictionary(docs)
    # build the bag-of-words representation
    docs_bow = [dictionary.doc2bow(doc) for doc in docs]
    if os.path.isfile(tfidfmodel):
        model = TfidfModel.load(tfidfmodel)
    else:
        model = TfidfModel(docs_bow)
        model.save(tfidfmodel)
    # build the tf-idf vector for each document
    docs_vector = list(model[docs_bow])
    # sort each document vector by score and keep the top-k entries
    docs_sort_vector = [
        sorted(doc, key=lambda x: x[1], reverse=True)[:topk]
        for doc in docs_vector
    ]
    # map token ids back to words; docs_sort_chinese holds (word, tf-idf score) pairs
    docs_sort_chinese = [[(dictionary[vec[0]], vec[1]) for vec in doc]
                         for doc in docs_sort_vector]
    return docs_sort_chinese
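
A hedged usage sketch. The function expects a module-level `tfidfmodel` cache path (hypothetical below) and whitespace-tokenized document strings; the imports are the ones the snippet itself needs:

import os
from typing import List

from gensim import corpora
from gensim.models import TfidfModel

tfidfmodel = "cache.tfidf"  # hypothetical path for the saved model

docs = ["the cat sat on the mat", "the dog chased the cat"]
print(cal_tfidf(docs, topk=3))  # per document, its top-3 (word, tf-idf score) pairs
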
Example #4
def buildTfidfModel(corpus):
    print('get tfidf model...')
    if not os.path.exists(modelpath + 'tfidf.model'):
        # build the tf-idf model
        tfidf = TfidfModel(corpus)
        tfidf.save(modelpath + 'tfidf.model')
    else:
        tfidf = TfidfModel.load(modelpath + 'tfidf.model')
    print('done')
    return tfidf
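
A quick usage sketch; `modelpath` is the module-level directory string the snippet assumes:

import os

modelpath = './models/'  # hypothetical location for tfidf.model
os.makedirs(modelpath, exist_ok=True)
bow_corpus = [[(0, 1), (1, 2)], [(1, 1), (2, 3)]]  # any bag-of-words corpus works
tfidf = buildTfidfModel(bow_corpus)
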
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, dirpath=".", tofull=False):
        """
        Pass in a directory that holds the lexicon in corpus.dict and the
        TFIDF model in tfidf.model (for now).

        Set tofull = True if the next thing is a Scikit-Learn estimator
        otherwise keep False if the next thing is a Gensim model.
        """
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._tfidf_path = os.path.join(dirpath, "tfidf.model")

        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull

        self.load()

    def load(self):

        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel().load(self._tfidf_path)

    def save(self):
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        self.lexicon = Dictionary(documents)
        self.tfidf = TfidfModel(
            [self.lexicon.doc2bow(doc) for doc in documents],
            id2word=self.lexicon)
        self.save()
        return self

    def transform(self, documents):
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    yield sparse2full(vec, len(self.lexicon))  # sparse2full needs the target length
                else:
                    yield vec

        return list(generator())
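
A sketch of dropping this transformer in front of a scikit-learn estimator. With tofull=True the sparse gensim vectors are densified first (see the sparse2full call above); the token lists and working directory are illustrative, and the gensim/sklearn imports the class needs are assumed to be in place:

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

docs = [["good", "film"], ["bad", "film"], ["great", "movie"], ["awful", "plot"]]
labels = [1, 0, 1, 0]

model = Pipeline([
    ("tfidf", GensimTfidfVectorizer(dirpath=".", tofull=True)),
    ("clf", LogisticRegression()),
])
model.fit(docs, labels)
print(model.predict([["good", "movie"]]))
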
Example #6
def main():
    parser = ArgumentParser()
    parser.add_argument('-e', '--encoding')
    parser.add_argument('-o', '--output-file')
    args = parser.parse_args()

    encoding = args.encoding
    output_fn = args.output_file

    if not output_fn:
        sys.exit(-1)

    if encoding:
        sys.stdout = codecs.getwriter(encoding)(sys.stdout)
        sys.stdin = codecs.getreader(encoding)(sys.stdin)

    texts = (line.split() for line in sys.stdin)

    logging.info('Creating vocabulary ...')
    vocab = Dictionary(texts)

    logging.info('Saving vocabulary to %s ...' % (output_fn + '.bz2'))
    vocab.save(output_fn)

    logging.info('Compressing vocabulary ...')

    with open(output_fn, 'rb') as input:
        with bz2.BZ2File(output_fn + '.bz2', 'wb', compresslevel=9) as output:
            copyfileobj(input, output)

    os.remove(output_fn)

    logging.info('Creating IDF model ...')
    tfidf = TfidfModel(dictionary=vocab)

    logging.info('Saving IDF model to %s ...' % (output_fn + '.tfidf.bz2'))
    tfidf.save(output_fn + '.tfidf')

    logging.info('Compressing IDF model ...')

    with open(output_fn + '.tfidf', 'rb') as input:
        with bz2.BZ2File(output_fn + '.tfidf.bz2', 'wb',
                         compresslevel=9) as output:
            copyfileobj(input, output)

    os.remove(output_fn + '.tfidf')
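
To reuse what this script writes out, the .bz2 archives have to be decompressed before gensim can load them. A sketch, with file names assumed to match whatever was passed to -o:

import bz2
from shutil import copyfileobj

from gensim.corpora import Dictionary
from gensim.models import TfidfModel


def load_compressed(path_bz2, path_plain):
    # Decompress next to the archive, then hand the plain file to gensim.
    with bz2.BZ2File(path_bz2, 'rb') as src, open(path_plain, 'wb') as dst:
        copyfileobj(src, dst)


load_compressed('vocab.bz2', 'vocab')              # hypothetical -o value: vocab
load_compressed('vocab.tfidf.bz2', 'vocab.tfidf')
vocab = Dictionary.load('vocab')
tfidf = TfidfModel.load('vocab.tfidf')
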
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, dirpath=".", tofull=False):
        """
        Gensim vectorizer
        """
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._tfidf_path = os.path.join(dirpath, "tfidf.model")

        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull

        self.load()

    def load(self):
        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel().load(self._tfidf_path)

    def save(self):
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        if self.lexicon is None or self.tfidf is None:
            inputDocuments = list(documents)
            self.lexicon = Dictionary(inputDocuments)
            self.tfidf = TfidfModel(
                [self.lexicon.doc2bow(doc) for doc in inputDocuments],
                id2word=self.lexicon)
            self.save()
            return self
        else:
            return self

    def transform(self, documents):
        returnDocs = []
        for document in documents:
            vec = self.tfidf[self.lexicon.doc2bow(document)]
            if self.tofull:
                returnDocs.append(sparse2full(vec, len(self.lexicon)))  # sparse2full needs the target length
            else:
                returnDocs.append(vec)
        return returnDocs
Example #8
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self, dirpath=".", tofull=False):
        """
        Pass in a directory that holds the lexicon in corpus.dict and the
        TFIDF model in tfidf.model (for now).

        Set tofull = True if the next thing is a Scikit-Learn estimator
        otherwise keep False if the next thing is a Gensim model.
        """
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._tfidf_path = os.path.join(dirpath, "tfidf.model")

        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull

        self.load()

    def load(self):

        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel().load(self._tfidf_path)

    def save(self):
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        self.lexicon = Dictionary(documents)
        self.tfidf = TfidfModel([self.lexicon.doc2bow(doc) for doc in documents], id2word=self.lexicon)
        self.save()
        return self

    def transform(self, documents):
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    yield sparse2full(vec, len(self.lexicon))  # sparse2full needs the target length
                else:
                    yield vec
        return list(generator())
def main():
    parser = ArgumentParser()
    parser.add_argument('-e', '--encoding')
    parser.add_argument('-o', '--output-file')
    args = parser.parse_args()

    encoding = args.encoding
    output_fn = args.output_file

    if not output_fn:
        sys.exit(-1)

    if encoding:
        sys.stdout = codecs.getwriter(encoding)(sys.stdout)
        sys.stdin = codecs.getreader(encoding)(sys.stdin)

    texts = (line.split() for line in sys.stdin)

    logging.info('Creating vocabulary ...')
    vocab = Dictionary(texts)

    logging.info('Saving vocabulary to %s ...' % (output_fn + '.bz2'))
    vocab.save(output_fn)

    logging.info('Compressing vocabulary ...')

    with open(output_fn, 'rb') as input:
        with bz2.BZ2File(output_fn + '.bz2', 'wb', compresslevel=9) as output:
            copyfileobj(input, output)

    os.remove(output_fn)

    logging.info('Creating IDF model ...')
    tfidf = TfidfModel(dictionary=vocab)

    logging.info('Saving IDF model to %s ...' % (output_fn + '.tfidf.bz2'))
    tfidf.save(output_fn + '.tfidf')

    logging.info('Compressing IDF model ...')

    with open(output_fn + '.tfidf', 'rb') as input:
        with bz2.BZ2File(output_fn + '.tfidf.bz2', 'wb', compresslevel=9) as output:
            copyfileobj(input, output)

    os.remove(output_fn + '.tfidf')
Example #10
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, dirpath=".", tofull=False):
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._tfidf_path = os.path.join(dirpath, "tfidf.model")
        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull

        self.load()

    def load(self):

        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel().load(self._tfidf_path)

    def save(self):
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        self.lexicon = Dictionary(documents)
        self.tfidf = TfidfModel(
            [self.lexicon.doc2bow(doc) for doc in documents],
            id2word=self.lexicon)
        self.save()
        return self

    def transform(self, documents):
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    yield sparse2full(vec, len(self.lexicon))  # sparse2full needs the target length
                else:
                    yield vec

        return list(generator())
Example #11
class TFIDFmodel(object):
    def __init__(self):
        self.inner_model = None

        # load dictionary and corpus
        vocabulary = "raw"
        corpora_folder = os.path.join(*[
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'data', 'corpora'
        ])
        self.dictionary = corpora.Dictionary.load(
            os.path.join(corpora_folder, "%s.dict" % (vocabulary, )))
        self.corpus = corpora.MmCorpus(
            os.path.join(corpora_folder, "%s.mm" % (vocabulary, )))

        # parameters
        self.dataset = "CASEREPORT"

        # data file path
        models_folder = os.path.join(*[
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'data', 'models'
        ])
        filename = "TFIDF_%s" % (self.dataset, )
        self.filepath = os.path.join(models_folder, filename)
        model_exists = os.path.isfile(self.filepath)

        if model_exists:
            logging.info("found data file %s" % (self.filepath, ))
            self.inner_model = TfidfModel.load(self.filepath)
        else:
            self.inner_model = TfidfModel(corpus=self.corpus)
            self.inner_model.save(self.filepath)

    def __contains__(self, item):
        return item in self.inner_model
Example #12
def getToipc(file_name, toipc_type="lda", topics_num=5, topics_words=5):
    """
    Build a topic model (LDA or LSI) from a text file.
    :param file_name: path to a UTF-8 text file, one document per line
    :param toipc_type: "lda" or "lsi"
    :param topics_num: number of topics to return
    :param topics_words: number of words to show per topic
    :return: the topics from print_topics()
    """
    texts = list()
    f = codecs.open(file_name, 'r', encoding='utf-8')
    for line in f:
        tt_texts = list()
        line = line.strip()
        words = jieba.cut(line, cut_all=False)
        t_texts = list(words)
        for text in t_texts:
            if len(text.strip()) > 1:
                tt_texts.append(text)
        texts.append(tt_texts)
    # print(texts)

    # drop words that appear only once (currently disabled)
    frequency = defaultdict(int)
    """for text in texts:
        for token in text:
        
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]"""

    dictionary = corpora.Dictionary(texts)  # build the dictionary

    # The dictionary provides several useful mappings, e.g.
    # dictionary.token2id holds word -> id pairs
    # dictionary.dfs holds each word's document frequency
    dictionary.save(
        'deerwester.dict')  # store the dictionary, for future reference
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('deerwester.mm',
                               corpus)  # store to disk, for later use
    tfidf = TfidfModel(corpus)
    tfidf_corpus = tfidf[corpus]
    #print(tfidf.idfs)
    tfidf.save('foo.tfidf_model')
    """
    加载模型
    dictionary = corpora.Dictionary.load('mydict.dic')
    corpus = corpora.MmCorpus('lsi_corpus.mm')
    model = LsiModel.load('model.lsi')
    model2 = LdaModel.load('model.lda')
    TfidfModel.load(foo.tfidf_model)"""

    # print('aaaaaaaaaaaaaaaa')

    if toipc_type == "lsi":
        #lsi
        #lsi = LsiModel(corpus = tfidf_corpus,id2word=dictionary,num_topics=2)
        lsi = LsiModel(corpus=tfidf_corpus, id2word=dictionary)
        lsi_corpus = lsi[tfidf_corpus]
        lsi.save('model.lsi')
        corpora.MmCorpus.serialize('lsi_corpus.mm', lsi_corpus)
        #print 'LSI Topics:'
        #print json.dumps(lsi.print_topics(num_topics=topics_num,num_words=topics_words), encoding="UTF-8", ensure_ascii=False)
        return lsi.print_topics(num_topics=topics_num, num_words=topics_words)

    if toipc_type == "lda":
        #lda
        #lda = LdaModel(corpus = tfidf_corpus,id2word=dictionary,num_topics=1)
        lda = LdaModel(corpus=tfidf_corpus, id2word=dictionary)
        lda_corpus = lda[tfidf_corpus]
        lda.save('model.lda')
        corpora.MmCorpus.serialize('lda_corpus.mm', lda_corpus)
        #print 'LDA Topics:'
        #print json.dumps(lda.print_topics(num_topics=topics_num,num_words=topics_words), encoding="UTF-8", ensure_ascii=False)
        return lda.print_topics(num_topics=topics_num, num_words=topics_words)
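
A usage sketch, assuming a UTF-8 input file with one document per line and the jieba/gensim imports the function relies on:

# 'corpus.txt' is a hypothetical input file, one document per line.
topics = getToipc('corpus.txt', toipc_type='lda', topics_num=5, topics_words=5)
for topic_id, topic in topics:
    print(topic_id, topic)
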
class TextProcessor:
    def __init__(self, n_users, n_samples, n_dims):
        self.nUsers, self.nSamples, self.nDims = n_users, n_samples, n_dims
        self.tfIdfModel = self.lsiModel = self.ldaModel = self.w2vModel = self.dictionary = None

        self.dictPath, self.tfIdfPath, self.lsiPath, self.ldaPath, self.w2vPath, self.w2vVecPath =\
            conf.get_filename_via_tpl('model', model_type='tfidf', n_users=n_users, n_samples=n_samples, model_filename='dict'), \
            conf.get_filename_via_tpl('model', model_type='tfidf', n_users=n_users, n_samples=n_samples, model_filename='tfidf'),\
            conf.get_filename_via_tpl('model', model_type='lsi', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='lsi_model'), \
            conf.get_filename_via_tpl('model', model_type='lda', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='lda_model'),\
            conf.get_filename_via_tpl('model', model_type='w2v', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='w2vmodel'), \
            conf.get_filename_via_tpl('model', model_type='w2v', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='vec.txt')

    def load_model(self, model_type):
        model = None
        try:
            if model_type == 'tfidf':
                model = TfidfModel.load(self.tfIdfPath, mmap='r')
                self.tfIdfModel = model
            elif model_type == 'lsi':
                model = LsiModel.load(self.lsiPath, mmap='r')
                self.lsiModel = model
            elif model_type == 'lda':
                model = LdaModel.load(self.ldaPath, mmap='r')
                self.ldaModel = model
            elif model_type == 'w2v':
                model = Word2Vec.load(self.w2vPath, mmap='r')
                self.w2vModel = model
            else:
                logger.error('Model type error. Unexpected %s' % model_type)
                return None

            if self.dictionary is None and os.path.exists(self.dictPath):
                self.dictionary = corpora.Dictionary.load(self.dictPath)

            logger.info('%s model loaded completely.' % model_type)
        except IOError:
            logger.error(
                'The %s model doesn\'t exist. Please train the model before loading it.'
                % model_type)
        finally:
            return model

    def tf_idf_transform(self, doc):
        """
        Perform tf-idf transformation on doc.
        """
        self.dictionary = corpora.Dictionary(doc)
        corpus = [self.dictionary.doc2bow(text) for text in doc]
        self.tfIdfModel = TfidfModel(corpus)

        conf.mk_dir(self.tfIdfPath)

        self.dictionary.save(self.dictPath)
        logger.info('Dictionary has been saved in %s.' % self.dictPath)

        self.tfIdfModel.save(self.tfIdfPath)
        logger.info('TF-IDF model has been saved in %s.' % self.tfIdfPath)

        tfidf_corpus = self.tfIdfModel[corpus]
        tfidf_corpus_path = conf.get_filename_via_tpl('tfidf',
                                                      n_users=self.nUsers,
                                                      postfix='mm',
                                                      n_samples=self.nSamples)
        corpora.MmCorpus.serialize(tfidf_corpus_path, tfidf_corpus)
        logger.info('TF-IDF corpus with a shape of %s has been saved in %s.' %
                    (np.array(tfidf_corpus).shape, tfidf_corpus_path))

        return tfidf_corpus

    def lsi_transform(self, corpus_tf_idf):
        logger.info('Training lsi model with a n_dims of %d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        self.lsiModel = LsiModel(corpus=corpus_tf_idf,
                                 num_topics=self.nDims,
                                 id2word=self.dictionary)
        # print self.lsiModel[corpus]

        conf.mk_dir(self.lsiPath)

        self.lsiModel.save(self.lsiPath)
        logger.info('Lsi model has been saved in %s.' % self.lsiPath)

        lsi_corpus = self.lsiModel[corpus_tf_idf]
        lsi_corpus_path = conf.get_filename_via_tpl('lsi',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims,
                                                    postfix='mm')
        conf.mk_dir(lsi_corpus_path)
        corpora.MmCorpus.serialize(lsi_corpus_path, lsi_corpus)
        logger.info('Lsi corpus with a shape of %s has been saved in %s.' %
                    (np.array(lsi_corpus).shape, lsi_corpus_path))

        return lsi_corpus

    def lda_transform(self,
                      corpus_tf_idf,
                      train_separated=False,
                      is_update=False):
        """
        Initialize an LDA model with n_dims topics, fit it on corpus_tf_idf and transform the corpus.
        :param corpus_tf_idf: corpus that has already been transformed into tf-idf vectors.
        :param train_separated: whether to train on the whole corpus at once or on separate chunks (currently a no-op).
        :param is_update: whether to build a new model or update an existing one.
        :return: lda corpus.
        """
        logger.info('Training lda model with a n_dims of %d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        if is_update:
            # An LDA model was trained before; update it with the new corpus.
            if self.ldaModel is None:
                self.load_model('lda')
            self.ldaModel.update(corpus_tf_idf)
            logger.info('Lda model has been updated successfully.')
            return self.ldaModel[corpus_tf_idf]

        if train_separated:
            # corpus = []
            # spacing = 10000
            # for i in range(int(len(corpus_tf_idf)/spacing)):
            #     corpus.append(corpus_tf_idf[i*spacing: i])
            # self.ldaModel = LdaModel()
            pass

        self.ldaModel = LdaModel(corpus=corpus_tf_idf,
                                 num_topics=self.nDims,
                                 id2word=self.dictionary)

        conf.mk_dir(self.ldaPath)
        self.ldaModel.save(self.ldaPath)
        logger.info('lda model has been saved in %s' % self.ldaPath)

        lda_corpus = self.ldaModel[corpus_tf_idf]
        lda_corpus_path = conf.get_filename_via_tpl('lda',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims,
                                                    postfix='mm')
        conf.mk_dir(lda_corpus_path)
        corpora.MmCorpus.serialize(lda_corpus_path, lda_corpus)
        logger.info('Lda corpus with a shape of %s has been saved in %s.' %
                    (np.array(lda_corpus).shape, lda_corpus_path))

        return lda_corpus

    def w2v_transform(self, sentences):
        """
        Perform word2vec on texts and obtain a w2v model.
        :param sentences: sentences, each given as a list of word tokens.
        :return: W2v model.
        """
        logger.info('Training w2v model with a dim of %d...' % self.nDims)
        # file = open(infile_path, 'r', encoding='utf-8') if infile_path.find('\n') < 0 else StringIO(infile_path)
        # sentences = []
        # for sen in file.readlines():
        #     sentences.append(sen.strip().split(' '))
        # print(sentences)
        self.w2vModel = Word2Vec(sentences, size=self.nDims, min_count=0)

        conf.mk_dir(self.w2vPath)
        self.w2vModel.save(self.w2vPath)
        self.w2vModel.wv.save_word2vec_format(self.w2vVecPath, binary=False)
        # print(model['['])

        # Construct w2v corpus
        w2v_corpus = []
        for sen in sentences:
            vec = [0] * self.nDims
            if len(sen) > 0:
                for word in sen:
                    vec = list(
                        map(lambda m, n: m + n, vec, self.w2vModel[word]))
                    # vec += self.w2vModel[word]
            w2v_corpus.append(vec)

        w2v_corpus_path = conf.get_filename_via_tpl('w2v',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims)
        conf.mk_dir(w2v_corpus_path)

        with open(w2v_corpus_path, 'w') as fp:
            csv_writer = csv.writer(fp)
            for line in w2v_corpus:
                csv_writer.writerow(line)
        logger.info('W2v corpus has been saved in %s. ' % w2v_corpus_path)

        return w2v_corpus

    def load_corpus(self, model_type, dense=False):
        corpus = None
        try:
            if model_type == 'tfidf':
                corpus = corpora.MmCorpus(
                    conf.get_filename_via_tpl('tfidf',
                                              n_users=self.nUsers,
                                              postfix='mm',
                                              n_samples=self.nSamples))
            elif model_type in ['lsi', 'lda']:
                corpus = corpora.MmCorpus(
                    conf.get_filename_via_tpl(model_type,
                                              n_users=self.nUsers,
                                              n_samples=self.nSamples,
                                              n_dims=self.nDims,
                                              postfix='mm'))
            elif model_type == 'w2v':
                corpus = np.loadtxt(conf.get_filename_via_tpl(
                    model_type,
                    n_users=self.nUsers,
                    n_samples=self.nSamples,
                    n_dims=self.nDims),
                                    dtype=np.float,
                                    delimiter=',')

            logger.info('%s corpus with a shape of %s has been loaded. ' %
                        (model_type, np.array(corpus).shape))

            if dense and model_type in ['tfidf', 'lsi', 'lda']:
                corpus = matutils.corpus2dense(corpus,
                                               self.nDims,
                                               self.nSamples * self.nUsers,
                                               dtype=np.float).T
            else:
                corpus = np.array(corpus)
        except Exception as e:
            raise e
        return corpus

    @staticmethod
    def corpus2dense(corpus, n_terms, n_docs=conf.N_SAMPLES, dtype=np.float):
        return matutils.corpus2dense(corpus, n_terms, n_docs, dtype).T

    def load_vec(self, vec_type):
        logger.info('Loading %s vectors...' % vec_type)
        try:
            corpus_vec = self.load_corpus(vec_type, True)
        except Exception as e:
            raise e
        data = []
        for i in range(self.nUsers):
            data.append(corpus_vec[i * self.nSamples:(i + 1) * self.nSamples])
        data = np.array(data, dtype=np.float)
        return data
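
A rough usage sketch for the class above. The `conf` module (get_filename_via_tpl, mk_dir, N_SAMPLES) is project-specific, so the paths it resolves are assumed rather than known; each call below trains a model and writes it plus the transformed corpus to those paths:

tp = TextProcessor(n_users=10, n_samples=100, n_dims=50)

docs = [["hello", "world"], ["topic", "models", "with", "gensim"]]
tfidf_corpus = tp.tf_idf_transform(docs)     # trains and saves dictionary + TF-IDF model
lsi_corpus = tp.lsi_transform(tfidf_corpus)  # trains and saves the LSI model on top of it
lda_corpus = tp.lda_transform(tfidf_corpus)  # likewise for LDA
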
Example #14
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, dirpath=".", type='tfidf', tofull=False, vec_size=100):
        """
        Pass in a directory that holds the lexicon in corpus.dict and the
        TFIDF model in tfidf.model (for now).
        Set tofull = True if the next thing is a Scikit-Learn estimator
        otherwise keep False if the next thing is a Gensim model.
        """
        self._type = type
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._model_path = os.path.join(dirpath, type + ".model")
        self.lexicon = None
        self.model = None
        self.tofull = tofull
        self._nfeat = vec_size

        self.load()

    def load(self):

        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._model_path):
            self.model = TfidfModel().load(self._model_path)

    def save(self):
        self.lexicon.save(self._lexicon_path)
        self.model.save(self._model_path)

    def fit(self, documents, labels=None):
        if self._type == "tfidf":
            self.lexicon = Dictionary(documents)
            self.model = TfidfModel(
                [self.lexicon.doc2bow(doc) for doc in documents],
                id2word=self.lexicon)
        self.save()
        return self

    def transform(self, documents):
        if self._type == "doc2vec":
            taggeddoc = [
                TaggedDocument(words, ['d{}'.format(idx)])
                for idx, words in enumerate(documents)
            ]
            model = Doc2Vec(taggeddoc,
                            vector_size=self._nfeat,
                            window=2,
                            min_count=1,
                            workers=4)
            docvec_mat = model.docvecs.vectors_docs  # use the Doc2Vec model trained just above
        else:
            if self._type == "count":
                docvecs = [
                    self.lexicon.doc2bow(document) for document in documents
                ]
            elif self._type == "ohe":
                docvecs = [[(token[0], 1)
                            for token in self.lexicon.doc2bow(document)]
                           for document in documents]
            else:
                docvecs = [
                    self.model[self.lexicon.doc2bow(document)]
                    for document in documents
                ]
            docvecs = [
                sparse2full(docvec, len(self.lexicon)) for docvec in docvecs
            ]
            docvec_mat = sp.csr_matrix(docvecs, dtype=np.float64)
        return docvec_mat
Example #15
class JsonCorpus(object):
    def __iter__(self):
        data = json.load(open('data/nasa.json'))

        desc = [
            TextBlob(dataset['description'].lower()).tokens
            for dataset in data['dataset']
        ]

        self.dictionary = Dictionary(desc)

        for d in desc:
            yield self.dictionary.doc2bow(d)


def score(text, tfidf, dictionary):
    return tfidf[dictionary.doc2bow(TextBlob(text.lower()).tokens)]


if __name__ == '__main__':
    if os.path.exists('tfidf.pkl') and os.path.exists('nasa_dictionary.pkl'):
        tfidf = TfidfModel.load('tfidf.pkl')
        dictionary = Dictionary.load('nasa_dictionary.pkl')
    else:
        corpus = JsonCorpus()
        bow_corpus = list(corpus)  # iterating once builds corpus.dictionary
        corpus.dictionary.save('nasa_dictionary.pkl')
        dictionary = corpus.dictionary
        tfidf = TfidfModel(bow_corpus, dictionary=corpus.dictionary)
        tfidf.save('tfidf.pkl')

    print(score('project completed', tfidf=tfidf, dictionary=dictionary))
Example #16
 def gerar_modelo(self, modelo):
     '''
     Trains the selected model and saves it. Then builds the similarity matrix for the transformed corpus.
     Parameters:
         modelo (str) --> model name: "tfidf", "tfidf_pivot", "lsi", "lda" or "doc2vec"
     Returns: None
     '''
     # Check whether the requested model has been implemented
     if modelo not in self._modelos:
         print(f'Model "{modelo}" has not been implemented.')
         return
     # Define the file names
     arq_model = os.path.join(
         self.corpus._pastas['modelos'],
         f'{self.corpus._link_nome}.{self._exts[modelo]}')
     arq_index = os.path.join(self.corpus._pastas['indices'],
                              f'{self.corpus._link_nome}_{modelo}.idx')
     # Build the requested model
     if modelo == 'tfidf':
         # Initialize the model
         corpus_train = self.corpus.corpus(tipo='bow')
         num_features = self.corpus.num_tokens
         model = TfidfModel(corpus=corpus_train,
                            id2word=self.corpus.dicionario())
     elif modelo == 'tfidf_pivot':
         # Initialize the model
         corpus_train = self.corpus.corpus(tipo='bow')
         num_features = self.corpus.num_tokens
         model = TfidfModel(corpus=corpus_train,
                            id2word=self.corpus.dicionario(),
                            smartirs='nfu',
                            pivot=self.corpus.num_tokens /
                            self.corpus.num_docs)
     elif modelo == 'lda':
         # Initialize the model
         corpus_train = self.corpus.corpus(tipo='bow')
         num_features = self._modelos[modelo]['num_topics']
         model = LdaModel(corpus=corpus_train,
                          id2word=self.corpus.dicionario(),
                          num_topics=num_features)
     elif modelo == 'lsi':
         # Initialize the model
         corpus_train = self.corpus.corpus(tipo='tfidf')
         num_features = self._modelos[modelo]['num_topics']
         model = LsiModel(corpus=corpus_train,
                          id2word=self.corpus.dicionario(),
                          num_topics=num_features)
     elif modelo == 'doc2vec':
         # Instantiate the Doc2Vec model
         corpus_train = self.corpus.corpus(tipo='tagged')
         num_features = self._modelos[modelo]['vector_size']
         model = Doc2Vec(vector_size=num_features,
                         workers=mp.cpu_count() // 2,
                         alpha=self._modelos[modelo]['alpha'],
                         min_alpha=self._modelos[modelo]['min_alpha'])
         # Build the corpus vocabulary to train the Doc2Vec model
         model.build_vocab(corpus_train)
         # Train the Doc2Vec model
         model.train(corpus_train,
                     total_examples=model.corpus_count,
                     epochs=model.epochs)
     else:
         print(f'Model "{modelo}" has not been implemented.')
         return
     # Save the trained model
     model.save(self._arqs['modelos'][modelo])
     # Choose the corpus for the similarity matrix
     if modelo == 'doc2vec': corpus = Doc2VecCorpus(model)
     else: corpus = model[corpus_train]
     # Build the index from the serialized model
     index = Similarity(output_prefix=self._arqs['indices'][modelo],
                        corpus=corpus,
                        num_features=num_features)
     # Save the index
     index.save(self._arqs['indices'][modelo])
        comments_dictionary = Dictionary(docs)
        comments_dictionary.filter_extremes(no_below=10, no_above=0.3)
        comments_dictionary.compactify()
        comments_dictionary.save(FLAGS.dictFile)
    else:
        print("Loading dictionary...")
        comments_dictionary = Dictionary.load(FLAGS.dictFile)

    print("Converting to BOW vectors...")
    comments_corpus = [comments_dictionary.doc2bow(d) for d in docs]

    model_tfidf = None
    if doTrain:
        print("Creating tfidf model...")
        model_tfidf = TfidfModel(comments_corpus)
        model_tfidf.save(FLAGS.tfidfFile)
    else:
        print("Loading tfidf model...")
        model_tfidf = TfidfModel.load(FLAGS.tfidfFile)

    print("Converting to tfidf vectors...")
    comments_tfidf = model_tfidf[comments_corpus]
    comments_vecs = np.vstack(
        [sparse2full(c, len(comments_dictionary)) for c in comments_tfidf])

    chi2_features = None
    if doTrain:
        # Find the most discriminative words for any of the labels
        print("Finding discriminative features...")
        labels = np.array(data['any'])
        model_fpr = SelectFpr(chi2, alpha=0.025)
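
The example is cut off at this point; a hedged sketch of how the chi-squared selector set up above is typically applied to the dense tf-idf matrix and mapped back to words:

# Sketch: keep only the words whose chi2 p-value against the labels is below alpha.
chi2_features = model_fpr.fit_transform(comments_vecs, labels)
selected_ids = model_fpr.get_support(indices=True)
discriminative_words = [comments_dictionary[i] for i in selected_ids]
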
Example #18
import json
import os

from textblob import TextBlob

from gensim.corpora import Dictionary
from gensim.models import TfidfModel


class JsonCorpus(object):
    def __iter__(self):
        data = json.load(open('data/nasa.json'))

        desc = [TextBlob(dataset['description'].lower()).tokens for dataset in data['dataset']]

        self.dictionary = Dictionary(desc)

        for d in desc:
            yield self.dictionary.doc2bow(d)


def score(text, tfidf, dictionary):
    return tfidf[dictionary.doc2bow(TextBlob(text.lower()).tokens)]


if __name__ == '__main__':
    if os.path.exists('tfidf.pkl') and os.path.exists('nasa_dictionary.pkl'):
        tfidf = TfidfModel.load('tfidf.pkl')
        dictionary = Dictionary.load('nasa_dictionary.pkl')
    else:
        corpus = JsonCorpus()
        bow_corpus = list(corpus)  # iterating once builds corpus.dictionary
        corpus.dictionary.save('nasa_dictionary.pkl')
        dictionary = corpus.dictionary
        tfidf = TfidfModel(bow_corpus, dictionary=corpus.dictionary)
        tfidf.save('tfidf.pkl')

    print(score('project completed', tfidf=tfidf, dictionary=dictionary))
Example #19
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    working_corpus = path.join(base_path, p['corpus_path'], p['corpus_name'])
    human_data_file = path.join(base_path, p['human_data_file'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(base_path,
                                           p['corpus_path'],
                                           p['dict_name']))
    Dictionary.save(dictionary, path.join(output_dir, p['dict_name']))
    logger.info(dictionary)

    logger.info('loading corpus')
    corpus_bow = MmCorpus(working_corpus)

    logger.info("create preprocessing model and save it to disk")
    if p['pre_model'] == 'tfidf':
        pre_model = TfidfModel(corpus_bow, id2word=dictionary, normalize=True)
    elif p['pre_model'] == 'log_ent':
        pre_model = LogEntropyModel(corpus_bow,
                                    id2word=dictionary, normalize=True)
    else:
        raise ValueError('model parameter %s not known' % p['pre_model'])
    pre_model.save(os.path.join(output_dir, p['pre_model_extension']))

    logger.info('initialize LSI model')
    lsi = models.LsiModel(pre_model[corpus_bow],
                          id2word=dictionary, num_topics=p['num_topics'])
    lsi.save(os.path.join(output_dir, p['lsi_extension']))
    logger.info('finished --> lsi model saved to: %s' %
                os.path.join(output_dir, p['lsi_extension']))

    # check for correlation with lee human data
    logger.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text,
                                        allow_update=False,
                                        return_missing=False)
                    for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (LSI)')
    corpus_lsi = lsi[pre_model[bow_lee_texts]]

    # # compute pairwise similarity matrix of transformed corpus
    sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            sim_matrix[i, j] = matutils.cossim(par1, par2)
    sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(human_data_file)
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    # compute correlations
    cor = np.corrcoef(sim_vector, human_sim_vector)
    logger.info("correlation with lee human data: %f" % cor[0, 1])

    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
def prepare_text_for_fitting(full_texts, sentences, nlp, **kwargs):
    #Grab and parse the chapters/sentences from the input corpus
    chapters = full_texts.split('\n\n\n\n\n\n')
    p_chapters = [
        tokenize(nlp(chapter_return(chapter))) for chapter in chapters
    ]
    p_sentences = [tokenize(nlp(sentence)) for sentence in sentences]
    #Create gensim dictionaries and carefully filter the high/low occurring words.
    text_dict = Dictionary(p_chapters)
    sentence_dict = Dictionary(p_sentences)
    text_dict.filter_extremes(no_below=4, no_above=0.22)
    print(len(text_dict))
    text_dict.compactify()
    text_dict[text_dict.keys()[0]]
    #Get the bag of word representation for every word in each chapter
    chap_corpus = [text_dict.doc2bow(c) for c in p_chapters]
    #sent_corpus = [text_dict.doc2bow(s) for s in p_sentences]
    #The GloVe vector representation of each word in all of the chapters
    tf_idf_glove = np.vstack(
        [nlp(text_dict[i]).vector for i in range(len(text_dict))])
    #Create a normed set of the vectors for easy similarity scoring
    normed_vecs = copy.deepcopy(tf_idf_glove)
    for i, nv in enumerate(normed_vecs):
        normed_vecs[i] = nv / np.linalg.norm(nv)
    #Get the bag of word rep. for each applicable sentence.
    #If a word is not in the dictionary, we grab and weight the most similar available word.
    sent_corpus = [
        get_sent_bow(s, text_dict, nlp, preload=normed_vecs)
        for s in p_sentences
    ]
    #pickle.dump(sent_corpus,open('raw_count_mat.pckl','wb'))
    #Could use atn or ntn as well as ltn
    if os.path.isfile('tf_idf_sent_mat_samp4.pckl'):
        sent_vecs = pickle.load(open('tf_idf_sent_mat_samp4.pckl', 'rb'))
    else:
        #Create a TF-IDF model for the text as a whole
        model_tfidf = TfidfModel(chap_corpus,
                                 id2word=text_dict,
                                 smartirs='ltn')
        model_tfidf.save('tfidf_model_samp4')
        #Apply the model to each word in the applicable sentences
        sent_tfidf = model_tfidf[sent_corpus]
        #Unpack each TF-IDF vector
        sent_vecs = np.vstack(
            [sparse2full(c, len(text_dict)) for c in sent_tfidf])
        pickle.dump(sent_vecs, open('tf_idf_sent_mat_samp4.pckl', 'wb'))

    if os.path.isfile('glove_sent_mat_samp4.pckl'):
        sent_glove_mat = pickle.load(open('glove_sent_mat_samp4.pckl', 'rb'))
    else:
        #Weight the glove vector representation by the appropriate TF-IDF values
        sent_glove_mat = np.dot(sent_vecs, tf_idf_glove)
        pickle.dump(sent_glove_mat, open('glove_sent_mat_samp4.pckl', 'wb'))
    if os.path.isfile('sent_w2v_mat_samp4.pckl'):
        sent_w2v_mat = pickle.load(open('sent_w2v_mat_samp4.pckl', 'rb'))
    else:
        #Create a 250 element Word2Vec modeller
        model_w2v = Word2Vec(p_chapters, size=250, window=7)
        #Train it over 10 epochs
        model_w2v.train(p_chapters,
                        total_examples=model_w2v.corpus_count,
                        epochs=10)
        model_w2v.init_sims()
        model_w2v.save('word2vec_model_samp4')

        #Fix non-included ones
        ids = []
        #Collect the dict. ID's for the intersection of the w2v and text vocabs.
        for k in model_w2v.wv.vocab:
            try:
                ids.append(text_dict.token2id[k])
            except KeyError:
                pass
        #[text_dict.token2id[k] for k in model_w2v.wv.vocab]
        #Create the new, smaller subset dictionary
        filt_dict = {new_id: text_dict[new_id] for new_id in ids}
        #Deal with the id numbers being off.
        blah = zip(list(np.sort(ids)), range(len(model_w2v.wv.vocab)))
        renum_dict = dict(blah)
        #Subset corpus
        filt_sent_corp = []
        for i in range(len(p_sentences)):
            corp_ = []
            for p in sent_corpus[i]:
                if p[0] in ids:
                    corp_.append((renum_dict[p[0]], p[1]))
            filt_sent_corp.append(corp_)
        #New, smaller TF-IDF model over the filtered corpus
        tdidf_w2v = TfidfModel(filt_sent_corp,
                               id2word=filt_dict,
                               smartirs='ltn')
        sent_w2v_tdidf = tdidf_w2v[filt_sent_corp]
        #Appropriate TF-IDF vectors
        w2v_tfidf_vecs = np.vstack(
            [sparse2full(c, len(filt_dict)) for c in sent_w2v_tdidf])

        #Collect all of the appropriate Word2Vectors
        w2v_vecs = [
            model_w2v.wv[filt_dict[list(filt_dict.keys())[i]]]
            for i in range(len(filt_dict))
        ]
        w2v_vecs = np.array(w2v_vecs)
        w2v_vecs.shape = (len(filt_dict), 250)

        sent_w2v_mat = np.dot(w2v_tfidf_vecs, w2v_vecs)
        pickle.dump(sent_w2v_mat, open('sent_w2v_mat_samp4.pckl', 'wb'))  # filename matches the cache check above

    return sent_vecs, sent_glove_mat, sent_w2v_mat
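
A hedged usage sketch. tokenize, chapter_return and get_sent_bow are helpers from the same project (not shown), and nlp is assumed to be a spaCy model with word vectors:

import spacy

nlp = spacy.load("en_core_web_md")    # any spaCy model with vectors (assumed installed)
full_texts = open("book.txt").read()  # hypothetical corpus; chapters separated as the split above expects
sentences = ["Call me Ishmael.", "It was the best of times."]

sent_vecs, sent_glove_mat, sent_w2v_mat = prepare_text_for_fitting(full_texts, sentences, nlp)
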
Example #21
    def construct_tfidf_model(self, model_path):
        model = TfidfModel(self.corpus)
        model.save(model_path)

        return model
Example #22
    def __iter__(self):
        data = json.load(open('../data/nasa.json'))

        desc = [
            TextBlob(dataset['description'].lower()).tokens
            for dataset in data['dataset']
        ]

        self.dictionary = Dictionary(desc)

        for d in desc:
            yield self.dictionary.doc2bow(d)


def score(text, tfidf, dictionary):
    return tfidf[dictionary.doc2bow(TextBlob(text.lower()).tokens)]


if __name__ == '__main__':
    if os.path.exists('../data/tfidf.pkl') and os.path.exists(
            '../data/nasa_dictionary.pkl'):
        tfidf = TfidfModel.load('../data/tfidf.pkl')
        dictionary = Dictionary.load('../data/nasa_dictionary.pkl')
    else:
        corpus = JsonCorpus()
        bow_corpus = list(corpus)  # iterating once builds corpus.dictionary
        corpus.dictionary.save('../data/nasa_dictionary.pkl')
        dictionary = corpus.dictionary
        tfidf = TfidfModel(bow_corpus, dictionary=corpus.dictionary)
        tfidf.save('../data/tfidf.pkl')

    print(score('project completed', tfidf=tfidf, dictionary=dictionary))
dictionary = corpora.Dictionary(
    processed_docs)  # create a dictionary of words from our keywords
dictionary.save(path + 'dim_items_terms.dict')  #saved

# Creating and saving corpus
corpus = [
    dictionary.doc2bow(doc) for doc in processed_docs
]  #create corpus where the corpus is a bag of words for each document
corpora.MmCorpus.serialize(path + 'dim_items_terms.mm', corpus)  #saved

#---Creating TFIDF MATRIX

tfidf = TfidfModel(
    corpus
)  # # step 1 -- initialize a model i.e. create tfidf model of the corpus (train the transformation model)
tfidfmodelsave = tfidf.save(path + 'dim_items_terms.tfidf')  #saved
tfidf_corpus = tfidf[corpus]

#checking allocations of weights from tfidf matrix------
sorted_tfidf_weights = sorted(tfidf[corpus[0]],
                              key=lambda w: w[1],
                              reverse=True)
for term_id, weight in sorted_tfidf_weights[:5]:
    print(dictionary.get(term_id), weight)

#finding cosine similarity from tfidf matrix------
sims = Similarity('path1', tfidf[corpus], num_features=len(dictionary))
sims.save(path + '_saved_sims.similarity')

app = Flask(__name__)
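
A hedged sketch of querying the saved similarity index with a new document; the Flask app created above presumably wraps something like this:

# Score a new (hypothetical) item description against every indexed document.
query = "stainless steel water bottle"
query_bow = dictionary.doc2bow(query.lower().split())
print(sims[tfidf[query_bow]])  # cosine similarity to each document in the corpus
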
Example #24
    def construct_tfidf_model(self, model_path):
        model = TfidfModel(self.corpus)
        model.save(model_path)

        return model