Example #1
import numpy as np
from gensim import corpora
from gensim.models import TfidfModel


class TfIdfGloveTransformer:
    '''
    Embeds each document as the TF-IDF-weighted sum of its word vectors.

    word_embedder is a pretrained gensim KeyedVectors model.
    dim is the dimensionality of word_embedder's vectors.
    '''
    def __init__(self, word_embedder, dim=300):
        self.word_embedder = word_embedder
        self.dim = dim
        self.word_dict = None
        self.bows = None
        self.tfidf = None
        self.token2id = None

    def fit(self, X, y=None):
        '''Fits on a corpus of tokenized documents.'''
        self.word_dict = corpora.Dictionary(X, prune_at=None)
        self.bows = [self.word_dict.doc2bow(doc) for doc in X]
        self.tfidf = TfidfModel(self.bows, normalize=True)
        self.token2id = self.word_dict.token2id
        return self

    def transform(self, X):
        '''Returns the embedding representation of the documents in X.'''
        new_bows = [self.word_dict.doc2bow(doc) for doc in X]
        result = np.zeros((len(X), self.dim))
        # perhaps this can be implemented better in a vectorial way
        for i, (doc, bow) in enumerate(zip(X, new_bows)):
            # eps=-1 keeps every term, even those with near-zero TF-IDF weight
            score_hash = dict(self.tfidf.__getitem__(bow, eps=-1))
            # weight each word vector by its TF-IDF score; words missing from
            # the embedder or from the fitted dictionary contribute zeros
            weighted_embeddings = np.array([
                self.word_embedder[word] * score_hash[self.token2id[word]]
                if word in self.word_embedder
                and self.token2id.get(word) in score_hash
                else np.zeros(self.dim)
                for word in doc
            ])
            result[i] = np.sum(weighted_embeddings, axis=0)
        return result
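A minimal usage sketch for Example #1, assuming the pretrained glove-wiki-gigaword-300 vectors from gensim's downloader API (any KeyedVectors whose dimensionality matches dim would do; the toy corpus is illustrative):

import gensim.downloader as api

# pretrained 300-dimensional GloVe vectors, loaded as a KeyedVectors model
glove = api.load('glove-wiki-gigaword-300')

corpus = [
    ['the', 'cat', 'sat', 'on', 'the', 'mat'],
    ['dogs', 'chase', 'cats'],
]

transformer = TfIdfGloveTransformer(glove, dim=300)
embeddings = transformer.fit(corpus).transform(corpus)
print(embeddings.shape)  # (2, 300): one weighted embedding per document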
Example #2
from gensim.matutils import corpus2csc
from gensim.models import TfidfModel


# DocTerm is assumed to be defined elsewhere in the same project
class TFIDF(DocTerm):
    def __init__(self,
                 preprocessor='stopwords~links+hashtags+mentions+lowercase',
                 vocab_size=10000,
                 max_df=0.5,
                 min_df=5,
                 **kwargs):
        super().__init__(preprocessor=preprocessor,
                         vocab_size=vocab_size,
                         max_df=max_df,
                         min_df=min_df)

        # TODO: make tfidf args explicit
        self.__dict__.update(kwargs)

        self.tfidf_model = None

    def fit(self, corpus):
        super().fit(corpus)

        self.tfidf_model = TfidfModel(self.corpus_as_bow, id2word=self.vocab)
        self.corpus_as_tfidf = self.transform(corpus)
        return self

    def is_fit(self):
        return super().is_fit() and self.tfidf_model is not None

    def transform(self, corpus):
        if not self.is_fit():
            self.fit(corpus)

        tokens = self.preprocessor.transform(corpus)
        bow = [self.vocab.doc2bow(doc) for doc in tokens]
        docs = [self.tfidf_model[doc] for doc in bow]

        return corpus2csc(docs, num_terms=len(self.vocab)).T

    def _make_main_table_elements(self):
        data, title = super()._make_main_table_elements()
        data['Weighting'] = ['TF-IDF']

        return data, title
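Example #2's DocTerm base class and its preprocessor are not shown, but the heart of its transform step can be reproduced with plain gensim. A minimal sketch under that assumption (the toy corpus and variable names are illustrative):

from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc
from gensim.models import TfidfModel

tokens = [['the', 'cat', 'sat'], ['dogs', 'chase', 'cats']]
vocab = Dictionary(tokens)
bow = [vocab.doc2bow(doc) for doc in tokens]
tfidf_model = TfidfModel(bow, id2word=vocab)

# documents as rows of a sparse TF-IDF matrix, as in TFIDF.transform
docs = [tfidf_model[doc] for doc in bow]
matrix = corpus2csc(docs, num_terms=len(vocab)).T
print(matrix.shape)  # (2, vocabulary size)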
Example #3
from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc
from gensim.models import TfidfModel


# LMDL_Corpus and export_to_txt are assumed to be defined elsewhere in the project
class LMDL_VSM:
    def __init__(self):
        self.corpus = LMDL_Corpus()
        self.mapping = Dictionary(self.corpus.get_processed_documents())
        self.bow_list = self._corpus_bow()
        self.inverted_index = self._inverted_index_matrix()
        # id2word expects an id -> token mapping, so pass the Dictionary itself
        self.tfidf_model = TfidfModel(self.bow_list, id2word=self.mapping)
        # element-wise log(1 + tf) over the sparse term-document matrix
        self.log_inverted_index = self.inverted_index.tocsc().log1p()

    def _corpus_bow(self):
        sorted_filename = self.corpus.get_filenames()
        ordered_bow = []
        for filename in sorted_filename:
            dt = self.corpus.document_terms(filename)
            ordered_bow.append(self.mapping.doc2bow(dt))
        return ordered_bow

    def _inverted_index_matrix(self):
        # term-document sparse matrix with TF weight
        sparse_matrix = corpus2csc(corpus=self.bow_list,
                                   num_terms=self.corpus.vocabulary_size(),
                                   num_docs=self.corpus.number_of_documents())
        return sparse_matrix.tocsr()

    def verbose_inverted_sparse_index(self):
        inverted_index = {}
        filenames = self.corpus.get_filenames()
        for term in self.corpus.get_vocabulary():
            if term in self.mapping.token2id:
                token_id = self.mapping.token2id[term]
                token_row = self.inverted_index.getrow(
                    token_id).toarray().tolist()
                inverted_index[term] = list(zip(filenames, token_row[0]))
        return inverted_index

    def verbose_inverted_index(self):
        inverted_index = {}
        filenames = self.corpus.get_filenames()
        for term in self.corpus.get_vocabulary():
            if term in self.mapping.token2id:
                token_id = self.mapping.token2id[term]
                token_row = self.inverted_index.getrow(
                    token_id).toarray().tolist()
                sparse_zip = list(zip(filenames, token_row[0]))
                no_zero = [(ubication, weight)
                           for (ubication, weight) in sparse_zip if weight > 0]
                inverted_index[term] = no_zero
        return inverted_index

    def documents_tfidf(self):
        filenames = self.corpus.get_filenames()
        for col, file in enumerate(filenames):
            token_col = self.inverted_index.getcol(col).toarray().tolist()
            sparse_bow = [(ubication, weight)
                          for ubication, [weight] in enumerate(token_col)]
            # eps=-1 keeps every term, even those with zero weight
            vector_list = self.tfidf_model.__getitem__(sparse_bow, eps=-1)
            vector_str = [str(weight) for (ubication, weight) in vector_list]
            export_to_txt('tfidf\\' + file.replace('html', 'txt'),
                          ' '.join(vector_str))

    def documents_log1p(self):
        filenames = self.corpus.get_filenames()
        for col, file in enumerate(filenames):
            vector_list = self.log_inverted_index.getcol(col).toarray().tolist()
            vector_str = [str(element) for [element] in vector_list]
            export_to_txt('locallog\\' + file.replace('html', 'txt'),
                          ' '.join(vector_str))
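The matrix-building and log-weighting steps of Example #3 can be checked in isolation, without the project-specific LMDL_Corpus. A minimal sketch with a toy corpus (names are illustrative):

from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc

docs = [['web', 'page', 'web'], ['page', 'rank']]
mapping = Dictionary(docs)
bow_list = [mapping.doc2bow(doc) for doc in docs]

# term-document sparse matrix with raw TF weights, as in _inverted_index_matrix
tf_matrix = corpus2csc(bow_list, num_terms=len(mapping), num_docs=len(docs)).tocsr()

# element-wise log(1 + tf), as stored in log_inverted_index
log_matrix = tf_matrix.tocsc().log1p()
print(log_matrix.toarray())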