def __init__(self, embedding, **kwargs):
    """Build a TfidfVectorizer restricted to the embedding's vocabulary."""
    # list of words in the embedding (pre-3.4 gensim API: index2word / syn0)
    vocabulary = embedding.index2word
    self.embedding = embedding
    print("Embedding shape:", embedding.syn0.shape)
    TfidfVectorizer.__init__(self, vocabulary=vocabulary, **kwargs)
def __init__(self, corpus, model, n, epoch, token_pattern, ngram_range):
    TfidfVectorizer.__init__(self)
    self.token_pattern = token_pattern
    self.ngram_range = ngram_range
    self._corpus = corpus
    self._model = model
    self._n = n
    self._epoch = epoch
def __init__(
    self,
    data_frame: pd.DataFrame,
    description_col: str,
    name_col: str,
    **kwargs,
) -> None:
    self.name_col = name_col
    self.description_col = description_col
    TfidfVectorizer.__init__(self, **kwargs)
    ItemBasedRS.__init__(self, base_data_frame=data_frame)
def __init__(self, embedding, **kwargs):
    """Build a TfidfVectorizer restricted to the embedding's vocabulary."""
    # Guard against being handed a full gensim model instead of its word
    # vectors: both checks point the caller at the `.wv` attribute.
    if not hasattr(embedding, 'index2word'):
        raise ValueError("No `index2word` attribute found."
                         " Supply the word vectors (`.wv`) instead.")
    if not hasattr(embedding, 'vectors'):
        raise ValueError("No `vectors` attribute found."
                         " Supply the word vectors (`.wv`) instead.")
    # list of words in the embedding
    vocabulary = embedding.index2word
    self.embedding = embedding
    print("Embedding shape:", embedding.vectors.shape)
    TfidfVectorizer.__init__(self, vocabulary=vocabulary, **kwargs)
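# A minimal usage sketch, assuming this __init__ belongs to a hypothetical
# EmbeddingTfidfVectorizer(TfidfVectorizer) subclass and gensim 3.x, whose
# KeyedVectors expose the `index2word` and `vectors` attributes checked above
# (gensim 4 renamed `index2word` to `index_to_key`):
from gensim.models import Word2Vec

model = Word2Vec([["hello", "world"], ["hello", "there"]], size=10, min_count=1)
vectorizer = EmbeddingTfidfVectorizer(model.wv)  # pass the KeyedVectors, not the model
X = vectorizer.fit_transform(["hello world", "hello there"])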
def __init__(self, input="content", encoding="utf-8", decode_error="strict", strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer="word", stop_words=None, token_pattern="(?u)\b\w\w+\b", ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=numpy.int64, norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=False, progress_bar_resolution_seconds=.333, progress_bar_clear_when_done=False): TfidfVectorizer.__init__(self, input=input, encoding=encoding, decode_error=decode_error, strip_accents=strip_accents, lowercase=lowercase, preprocessor=preprocessor, tokenizer=tokenizer, stop_words=stop_words, token_pattern=token_pattern, ngram_range=ngram_range, analyzer=analyzer, max_df=max_df, min_df=min_df, max_features=max_features, vocabulary=vocabulary, binary=binary, dtype=dtype, norm=norm, use_idf=use_idf, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf) ProgressBarVectorizer.__init__(self, progress_bar_resolution_seconds, progress_bar_clear_when_done)
def __init__(self, fit_col=None, col_name=None, max_df=1.0, min_df=1):
    # Forward the document-frequency cutoffs; the original accepted them
    # but never passed them on, silently dropping the caller's values.
    TfidfVectorizer.__init__(self, max_df=max_df, min_df=min_df)
    self.fit_col = fit_col
    self.col_name = col_name
def __init__(self):
    TfidfVectorizer.__init__(self, use_idf=True, smooth_idf=False)
    # Replace the internal weighting transformer with a custom variant.
    self._tfidf = TfidfTransformer_2()
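# Why replacing `self._tfidf` works: in older scikit-learn releases (0.x),
# TfidfVectorizer creates a private TfidfTransformer in __init__ and delegates
# all term weighting to it during fit/transform, so swapping that attribute
# swaps the weighting scheme. (Newer releases rebuild `_tfidf` inside fit, so
# the override may be discarded there.) Neither TfidfTransformer_2 nor
# BM25Transformer is shown in these snippets; the class below is an
# illustrative drop-in with the same fit/transform contract:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import scipy.sparse as sp

class SublinearTransformer(BaseEstimator, TransformerMixin):
    """Log-scales raw term counts and skips idf weighting entirely."""

    def fit(self, X, y=None):
        return self

    def transform(self, X, copy=True):
        X = sp.csr_matrix(X, dtype=np.float64, copy=copy)
        X.data = 1.0 + np.log(X.data)  # sublinear tf scaling of the stored counts
        return X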
def __init__(self, analyzer='word', use_idf=True):
    TfidfVectorizer.__init__(self, analyzer=analyzer, use_idf=use_idf, norm='l2')
    self._fit_X = None
import Get_Data as twitterdata
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import spatial

# corpus = ["all is well", "best of luck", "best of luck", "best of all"]
corpus = twitterdata.get_data()
n_of_articles = len(corpus)

# Configure the vectorizer once at construction; the original called
# vectorizer.__init__() a second time, which just re-runs the constructor
# on the same object and is an anti-pattern.
vectorizer = TfidfVectorizer(min_df=1, norm='l1', smooth_idf=False)
X = vectorizer.fit_transform(corpus)

feature_names = vectorizer.get_feature_names()
scores_relative_to_comparing = []
ss = []
doc = 1
feature_index = X[doc, :].nonzero()[1]  # column indices of terms present in document `doc`

# Write each term's idf weight next to the term itself, one pair per line.
with open('idf.txt', 'w', encoding="utf8") as writer:
    for idf, name in zip(vectorizer.idf_, feature_names):
        writer.write("{} {}\n".format(idf, name))

c = twitterdata.files_in_dir
cc = 0
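# The script imports scipy.spatial but never uses it, and the
# `scores_relative_to_comparing` list stays empty; a plausible missing step,
# sketched here as an assumption, is scoring every article against document
# `doc` by cosine similarity:
dense = X.toarray()
for i in range(n_of_articles):
    # cosine similarity = 1 - cosine distance
    scores_relative_to_comparing.append(1 - spatial.distance.cosine(dense[doc], dense[i]))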
def __init__(self):
    TfidfVectorizer.__init__(self, use_idf=True, smooth_idf=False)  # , stop_words=stopwords
    self._tfidf = BM25Transformer()
def __init__(self):
    TfidfVectorizer.__init__(self, use_idf=True, smooth_idf=False,
                             ngram_range=(2, 2), stop_words=stopwords)
    self._tfidf = BM25Transformer()
def __init__(self):
    # `identity` returns its argument unchanged: the input documents are
    # expected to be pre-tokenized, so no splitting or lowercasing is wanted.
    TfidfVectorizer.__init__(self, tokenizer=identity, preprocessor=None,
                             lowercase=False)
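# The identity tokenizer lets the vectorizer consume documents that are
# already lists of tokens, bypassing its own splitting and lowercasing. The
# helper below is an assumption (the original `identity` is not shown), and
# the vectorizer is built directly rather than through the subclass above:
from sklearn.feature_extraction.text import TfidfVectorizer

def identity(tokens):
    return tokens

docs = [["Hello", "world"], ["Hello", "again"]]
vec = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
X = vec.fit_transform(docs)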