Esempio n. 1
0
 def __init__(self, embedding, **kwargs):
     """Build a TfidfVectorizer whose vocabulary is the embedding's word list.

     Parameters
     ----------
     embedding : gensim KeyedVectors-like object
         Must expose ``index2word`` and ``vectors`` — i.e. pass the word
         vectors (``model.wv``), not the enclosing model.
     **kwargs
         Forwarded unchanged to ``TfidfVectorizer.__init__``.
     """
     # Fail fast when handed a full model instead of its word vectors.
     # NOTE: the original read `embedding.syn0`, which was deprecated and
     # removed in gensim 4.x; `vectors` is the supported attribute (this
     # matches the corrected examples later in this file).
     if not hasattr(embedding, 'index2word'):
         raise ValueError("No `index2word` attribute found."
                          " Supply the word vectors (`.wv`) instead.")
     if not hasattr(embedding, 'vectors'):
         raise ValueError("No `vectors` attribute found."
                          " Supply the word vectors (`.wv`) instead.")
     # list of words in the embedding
     vocabulary = embedding.index2word
     self.embedding = embedding
     print("Embedding shape:", embedding.vectors.shape)
     TfidfVectorizer.__init__(self, vocabulary=vocabulary, **kwargs)
 def __init__(self, corpus, model, n, epoch, token_pattern, ngram_range):
     """Vectorizer that carries its training corpus, model and run settings."""
     # Let the sklearn base install its defaults first.
     TfidfVectorizer.__init__(self)
     # These two attribute names shadow the base-class parameters of the
     # same name, overriding the defaults set just above.
     self.token_pattern = token_pattern
     self.ngram_range = ngram_range
     # Private training context.
     self._corpus, self._model = corpus, model
     self._n, self._epoch = n, epoch
    def __init__(
        self,
        data_frame: pd.DataFrame,
        description_col: str,
        name_col: str,
        **kwarg,
    ) -> None:
        """Item-based recommender backed by a TF-IDF model.

        Args:
            data_frame: Source items, one row per item.
            description_col: Column holding the item descriptions.
            name_col: Column holding the item names.
            **kwarg: Extra options forwarded to ``TfidfVectorizer``.
        """
        # Remember which columns feed the vectorizer and label the results.
        self.description_col = description_col
        self.name_col = name_col

        # Initialize both bases explicitly — they take different arguments,
        # so cooperative super() is not used here.
        TfidfVectorizer.__init__(self, **kwarg)
        ItemBasedRS.__init__(self, base_data_frame=data_frame)
Esempio n. 4
0
 def __init__(self, embedding, **kwargs):
     """TF-IDF vectorizer restricted to the vocabulary of *embedding*."""
     # Guard clauses: the caller must supply the word vectors themselves
     # (`model.wv`), which expose `index2word` and `vectors`.
     if not hasattr(embedding, 'index2word'):
         raise ValueError("No `index2word` attribute found."
                          " Supply the word vectors (`.wv`) instead.")
     if not hasattr(embedding, 'vectors'):
         raise ValueError("No `vectors` attribute found."
                          " Supply the word vectors (`.wv`) instead.")
     self.embedding = embedding
     print("Embedding shape:", embedding.vectors.shape)
     # The embedding's word list becomes the fixed vocabulary.
     TfidfVectorizer.__init__(self, vocabulary=embedding.index2word,
                              **kwargs)
Esempio n. 5
0
 def __init__(self, embedding, **kwargs):
     """Create a TfidfVectorizer constrained to an embedding's vocabulary."""
     # Validate the argument up front — a full model (rather than its
     # `.wv` word vectors) lacks these attributes.
     if not hasattr(embedding, 'index2word'):
         raise ValueError("No `index2word` attribute found."
                          " Supply the word vectors (`.wv`) instead.")
     if not hasattr(embedding, 'vectors'):
         raise ValueError("No `vectors` attribute found."
                          " Supply the word vectors (`.wv`) instead.")
     # Words in the embedding, in index order.
     words = embedding.index2word
     self.embedding = embedding
     print("Embedding shape:", embedding.vectors.shape)
     TfidfVectorizer.__init__(self, vocabulary=words, **kwargs)
Esempio n. 6
0
    def __init__(self,
                 input="content",
                 encoding="utf-8",
                 decode_error="strict",
                 strip_accents=None,
                 lowercase=True,
                 preprocessor=None,
                 tokenizer=None,
                 analyzer="word",
                 stop_words=None,
                 token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1),
                 max_df=1.0,
                 min_df=1,
                 max_features=None,
                 vocabulary=None,
                 binary=False,
                 dtype=numpy.int64,
                 norm="l2",
                 use_idf=True,
                 smooth_idf=True,
                 sublinear_tf=False,
                 progress_bar_resolution_seconds=.333,
                 progress_bar_clear_when_done=False):
        r"""TF-IDF vectorizer that also reports progress while working.

        Every parameter up to ``sublinear_tf`` mirrors
        ``sklearn.feature_extraction.text.TfidfVectorizer`` and is forwarded
        to it unchanged; the two ``progress_bar_*`` parameters configure the
        ``ProgressBarVectorizer`` base.

        BUG FIX: ``token_pattern`` is now a raw string.  The previous
        non-raw literal turned each ``\b`` into a backspace character
        (``\x08``), yielding a default regex that matches no real token.
        """
        TfidfVectorizer.__init__(self,
                                 input=input,
                                 encoding=encoding,
                                 decode_error=decode_error,
                                 strip_accents=strip_accents,
                                 lowercase=lowercase,
                                 preprocessor=preprocessor,
                                 tokenizer=tokenizer,
                                 stop_words=stop_words,
                                 token_pattern=token_pattern,
                                 ngram_range=ngram_range,
                                 analyzer=analyzer,
                                 max_df=max_df,
                                 min_df=min_df,
                                 max_features=max_features,
                                 vocabulary=vocabulary,
                                 binary=binary,
                                 dtype=dtype,
                                 norm=norm,
                                 use_idf=use_idf,
                                 smooth_idf=smooth_idf,
                                 sublinear_tf=sublinear_tf)

        ProgressBarVectorizer.__init__(self, progress_bar_resolution_seconds,
                                       progress_bar_clear_when_done)
 def __init__(self, fit_col=None, col_name=None, max_df=1.0, min_df=1):
     """Column-aware TF-IDF vectorizer.

     Parameters
     ----------
     fit_col : optional
         Presumably the dataframe column to fit on — confirm with callers.
     col_name : str, optional
         Name/label associated with this vectorizer's column.
     max_df, min_df : float or int
         Document-frequency cutoffs, forwarded to ``TfidfVectorizer``
         (defaults match sklearn's own defaults).
     """
     # BUG FIX: max_df/min_df were accepted but silently discarded;
     # forward them so they actually take effect.
     TfidfVectorizer.__init__(self, max_df=max_df, min_df=min_df)
     self.fit_col = fit_col
     self.col_name = col_name
Esempio n. 8
0
 def __init__(self):
     """TF-IDF vectorizer with its transformer swapped for TfidfTransformer_2."""
     # Base configuration: idf weighting on, smoothing off.
     TfidfVectorizer.__init__(self, smooth_idf=False, use_idf=True)
     # Replace the internal transformer the base class just built.
     self._tfidf = TfidfTransformer_2()
Esempio n. 9
0
 def __init__(self, analyzer='word', use_idf=True):
     """TF-IDF vectorizer with L2-normalised rows (cosine-ready output)."""
     # norm='l2' is pinned so dot products of rows equal cosine similarity.
     TfidfVectorizer.__init__(self, norm='l2', analyzer=analyzer,
                              use_idf=use_idf)
     # Fitted document-term matrix; populated later.
     self._fit_X = None
Esempio n. 10
0
import Get_Data as twitterdata
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import spatial

# corpus is a list of document strings,
# e.g. ["all is well", "best of luck", "best of all"]
corpus = twitterdata.get_data()
n_of_articles = len(corpus)

# BUG FIX: the original built TfidfVectorizer(min_df=1) and then called
# vectorizer.__init__(...) again, which re-runs the constructor and resets
# every parameter not re-specified. Pass all options in one construction.
vectorizer = TfidfVectorizer(min_df=1, norm=u'l1', smooth_idf=False)

X = vectorizer.fit_transform(corpus)

feature_names = vectorizer.get_feature_names()
scores_relative_to_comparing = []
ss = []
doc = 1
feature_index = X[doc, :].nonzero()[1]

# Dump each term's inverse document frequency next to the term itself;
# `with` guarantees the file is closed even if a write fails.
with open('idf.txt', 'w', encoding="utf8") as writer:
    for idf_value, feature_name in zip(vectorizer.idf_, feature_names):
        writer.write(str(idf_value))
        writer.write(" ")
        writer.write(str(feature_name))
        writer.write("\n")

c = twitterdata.files_in_dir
cc = 0
Esempio n. 11
0
 def __init__(self):
     """Vectorizer that scores documents with BM25 instead of plain TF-IDF."""
     # Raw (unsmoothed) idf; the BM25 transformer handles the weighting.
     TfidfVectorizer.__init__(self, smooth_idf=False, use_idf=True)
     self._tfidf = BM25Transformer()
Esempio n. 12
0
 def __init__(self):
     """BM25-backed vectorizer over stop-word-filtered bigrams."""
     TfidfVectorizer.__init__(self,
                              stop_words=stopwords,
                              ngram_range=(2, 2),
                              smooth_idf=False,
                              use_idf=True)
     # Swap the default transformer for the BM25 variant.
     self._tfidf = BM25Transformer()
Esempio n. 13
0
 def __init__(self, analyzer='word', use_idf=True):
     """TF-IDF vectorizer producing unit-length (L2-normalised) vectors."""
     # Gather the base-class options in one place before delegating.
     opts = dict(analyzer=analyzer, use_idf=use_idf, norm='l2')
     TfidfVectorizer.__init__(self, **opts)
     # Will hold the fitted document-term matrix.
     self._fit_X = None
 def __init__(self):
     """Vectorizer for pre-tokenized input.

     `identity` presumably returns its argument unchanged (TODO confirm),
     so documents are expected to arrive as token lists already; lowercasing
     is disabled to keep the tokens intact.
     """
     TfidfVectorizer.__init__(self,
                              lowercase=False,
                              preprocessor=None,
                              tokenizer=identity)