def recommender(query):
    # load vocab, idfs and data
    # df = pd.read_csv("song_data.csv")
    vocabulary = pickle.load(open("vocabulary.pkl", "rb"))
    idfs = pickle.load(open("idf.pkl", "rb"))
    tfidf_matrix = pickle.load(open("matrix.pkl", "rb"))
    # tfidf_matrix = np.load("tfidf_matrix.npy")

    # reform the vectorizer
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0,
                         stop_words='english')
    tf.vocabulary_ = vocabulary
    tf.idf_ = idfs

    # query vector
    vector = tf.transform([query])
    print(vector.shape)
    print(tfidf_matrix.shape)

    # similarity
    cos_sim = linear_kernel(tfidf_matrix, vector)
    res = cos_sim[:, 0].argsort()[:-6:-1]

    # prediction list
    # pred = [df['songname'][i] for i in res]
    print(res)
    return res
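For context, a minimal sketch of how the pickled components consumed above could be produced; the corpus file song_data.csv and the 'lyrics' text column are assumptions for illustration, not taken from the source:

import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_csv("song_data.csv")                # hypothetical corpus file
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0,
                     stop_words='english')
tfidf_matrix = tf.fit_transform(df['lyrics'])    # assumed text column

# Persist the three pieces the recommender reloads later
pickle.dump(tf.vocabulary_, open("vocabulary.pkl", "wb"))
pickle.dump(tf.idf_, open("idf.pkl", "wb"))
pickle.dump(tfidf_matrix, open("matrix.pkl", "wb"))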
def from_pretrained(cls, normal_set="default", abb_expander="default"):
    model_dir = DEFAULT_DNORM_PATH
    if not model_dir.parent.is_dir():
        model_dir.parent.mkdir()
    if not model_dir.is_dir():
        model_dir.mkdir()
    if not (model_dir / "normal_set.txt").is_file():
        download_fileobj(BASE_URL + "/normal_set.txt", model_dir / "normal_set.txt")
    if not (model_dir / "W_EHR_all.npz").is_file():
        download_fileobj(BASE_URL + "/W_EHR_all.npz", model_dir / "W_EHR_all.npz")
    if not (model_dir / "EHR_idf3.pkl").is_file():
        download_fileobj(BASE_URL + "/EHR_idf3.pkl", model_dir / "EHR_idf3.pkl")
    if not (model_dir / "abb_dic.csv").is_file():
        download_fileobj(BASE_URL + "/abb_dic.csv", model_dir / "abb_dic.csv")

    mecab = MeCab.Tagger('-Owakati')
    tokenizer = Tokenizer(mecab.parse, lambda s: s[:-1])

    if abb_expander == "default":
        converter = Converter(str(model_dir / "abb_dic.csv")).convert
    elif isinstance(abb_expander, str):
        converter = Converter(abb_expander).convert
    elif callable(abb_expander):
        converter = abb_expander
    else:
        converter = None

    if normal_set == "default":
        normal_set = str(model_dir / "normal_set.txt")
    normal_set = load_normal_set(normal_set)

    # Rebuild the tf-idf vectorizer from its stored params, vocabulary and idf weights
    tfidf = TfidfVectorizer(analyzer=lambda s: s.split(' '))
    with open(str(model_dir / "EHR_idf3.pkl"), 'rb') as f:
        params = pickle.load(f)
    tfidf.set_params(**params['params'])
    tfidf.vocabulary_ = params['voc']
    tfidf.idf_ = params['idf']
    """
    with open(str(model_dir / "EHR_idf.pkl"), 'rb') as f:
        tfidf = pickle.load(f)
    """

    model = cls(tfidf, normal_set, tokenizer.tokenizer, converter)
    model.load(str(model_dir / "W_EHR_all.npz"))
    return model
def deserialize(self):
    idfs = np.asarray(self.obj['idf'])
    vectorizer = TfidfVectorizer(**self.obj['params'])
    # Monkey patch in order to indirectly fit a tfidf vectorizer.
    vectorizer._tfidf._idf_diag = sp.spdiags(idfs, diags=0, m=len(idfs), n=len(idfs))
    vectorizer.vocabulary_ = self.obj['vocabulary']
    return vectorizer
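The matching serialize step is not shown in the source; a minimal sketch, assuming self.obj is a plain dict built from an already fitted TfidfVectorizer:

def serialize(self, vectorizer):
    # Hypothetical counterpart to deserialize(): capture the constructor
    # params, learned idf weights, and vocabulary of a fitted vectorizer.
    self.obj = {
        'params': vectorizer.get_params(),
        'idf': vectorizer.idf_.tolist(),
        'vocabulary': vectorizer.vocabulary_,
    }
    return self.obj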
def main():
    np.random.seed(args.seed)

    print("Reading params.yaml...")
    params = yaml.safe_load(open("params.yaml"))["train"][args.model]

    print("Reading training set...")
    with open(args.sentences_file, "r") as f:
        corpus = f.readlines()

    out_dir = Path(args.output_dir)
    os.makedirs(out_dir, exist_ok=True)

    if args.model == "tf_idf":
        model = TfidfVectorizer(**params["init_kwargs"])
        print("Training model...")
        model.fit(corpus)
        # hack: https://github.com/scikit-learn/scikit-learn/issues/18669
        model.vocabulary_ = OrderedDict(
            sorted(model.vocabulary_.items(), key=lambda kv: kv[1]))
        model._stop_words_id = 0
        print("Saving model to disk...")
        with (out_dir / "model.pkl").open("wb") as f:
            pickle.dump(model, f)
    elif args.model == "count":
        model = CountVectorizer(**params["init_kwargs"])
        print("Training model...")
        model.fit(corpus)
        # hack: https://github.com/scikit-learn/scikit-learn/issues/18669
        model.vocabulary_ = OrderedDict(
            sorted(model.vocabulary_.items(), key=lambda kv: kv[1]))
        model._stop_words_id = 0
        print("Saving model to disk...")
        with (out_dir / "model.pkl").open("wb") as f:
            pickle.dump(model, f)
    else:
        raise ValueError(f"Training not available for model {args.model}!")

    print("Training completed!")
def load_tfidf(vocab_path, idf_weights_path):
    """Loads tfidf vectorizer from its components.

    :param str vocab_path: path to the vectorizer vocabulary JSON.
    :param str idf_weights_path: path to idf weights JSON.
    :rtype: sklearn.feature_extraction.text.TfidfVectorizer
    """
    tfidf = TfidfVectorizer(analyzer=lambda x: x,
                            vocabulary=json.load(open(vocab_path)))
    idf_vector = np.array(json.load(open(idf_weights_path)))
    tfidf._tfidf._idf_diag = scipy.sparse.diags([idf_vector], [0])
    tfidf.vocabulary_ = tfidf.vocabulary
    return tfidf
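The save-side counterpart is not part of the source; a small sketch, assuming the vectorizer has already been fitted, that writes the two JSON files load_tfidf expects:

import json

def save_tfidf(tfidf, vocab_path, idf_weights_path):
    # Hypothetical helper: persist vocabulary and idf weights as JSON.
    with open(vocab_path, "w") as f:
        json.dump({term: int(idx) for term, idx in tfidf.vocabulary_.items()}, f)
    with open(idf_weights_path, "w") as f:
        json.dump(tfidf.idf_.tolist(), f)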
def load_model(path=FILES_LOCATION):
    r = [0] * 6
    m = [0] * 6
    for i in range(6):
        r[i] = np.mat(np.load(path + "r" + str(i) + ".npy"))
        m[i] = joblib.load(path + "m" + str(i) + ".sav")

    # TF-IDF vectorizer
    vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize, min_df=3,
                          max_df=0.9, strip_accents='unicode', use_idf=1,
                          smooth_idf=1, sublinear_tf=1)
    vec._tfidf._idf_diag = load_obj("idf_diag")  # sp.spdiags(idfs, diags=0, m=len(idfs), n=len(idfs))
    vec.vocabulary_ = load_obj("vocabulary")
    return vec, m, r
def load_model_info(model_info):
    """Return a longform model from a model info JSON object.

    Parameters
    ----------
    model_info : dict
        The JSON object containing the attributes of a model.

    Returns
    -------
    longform_model : py:class:`adeft.classify.AdeftClassifier`
        The classifier that was loaded from the given JSON object.
    """
    shortforms = model_info['shortforms']
    pos_labels = model_info['pos_labels']
    longform_model = AdeftClassifier(shortforms=shortforms,
                                     pos_labels=pos_labels)
    ngram_range = model_info['tfidf']['ngram_range']
    tfidf = TfidfVectorizer(ngram_range=ngram_range, stop_words='english')
    logit = LogisticRegression(multi_class='auto')

    tfidf.vocabulary_ = model_info['tfidf']['vocabulary_']
    tfidf.idf_ = model_info['tfidf']['idf_']
    logit.classes_ = np.array(model_info['logit']['classes_'], dtype='<U64')
    logit.intercept_ = np.array(model_info['logit']['intercept_'])
    logit.coef_ = np.array(model_info['logit']['coef_'])

    estimator = Pipeline([('tfidf', tfidf), ('logit', logit)])
    longform_model.estimator = estimator
    # These attributes do not exist in older adeft models.
    # For backwards compatibility we check if they are present.
    if 'stats' in model_info:
        longform_model.stats = model_info['stats']
    if 'std' in model_info:
        longform_model._std = np.array(model_info['std'])
    if 'timestamp' in model_info:
        longform_model.timestamp = model_info['timestamp']
    if 'training_set_digest' in model_info:
        longform_model.training_set_digest = model_info['training_set_digest']
    if 'params' in model_info:
        longform_model.params = model_info['params']
    if 'version' in model_info:
        longform_model.version = model_info['version']
    if 'confusion_info' in model_info:
        longform_model.confusion_info = model_info['confusion_info']
    if 'other_metadata' in model_info:
        longform_model.other_metadata = model_info['other_metadata']
    return longform_model
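For orientation, the inverse operation is not shown in the source; a sketch that gathers the same fields from a fitted pipeline, where the attribute names on the classifier object are assumptions inferred from the loader above:

def dump_model_info(longform_model):
    # Hypothetical counterpart to load_model_info(): collect JSON-friendly
    # fields from the tfidf/logit pipeline built above.
    tfidf = longform_model.estimator.named_steps['tfidf']
    logit = longform_model.estimator.named_steps['logit']
    return {
        'shortforms': longform_model.shortforms,   # assumed attribute
        'pos_labels': longform_model.pos_labels,   # assumed attribute
        'tfidf': {'ngram_range': list(tfidf.ngram_range),
                  'vocabulary_': tfidf.vocabulary_,
                  'idf_': list(tfidf.idf_)},
        'logit': {'classes_': logit.classes_.tolist(),
                  'intercept_': logit.intercept_.tolist(),
                  'coef_': logit.coef_.tolist()},
    }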
def from_path(cls, path, **shared):
    import numpy as np
    import scipy.sparse as sp
    from sklearn.feature_extraction.text import (
        TfidfTransformer, TfidfVectorizer as SklearnTfidfVectorizer)

    path = Path(path)
    model_path = path / "vectorizer.json"
    if not model_path.exists():
        raise LoadingError("Missing vectorizer model file: %s"
                           % model_path.name)
    with model_path.open("r", encoding="utf-8") as f:
        vectorizer_dict = json.load(f)

    vectorizer = cls(vectorizer_dict["config"], **shared)
    vectorizer._language = vectorizer_dict["language_code"]

    builtin_entity_scope = vectorizer_dict["builtin_entity_scope"]
    if builtin_entity_scope is not None:
        builtin_entity_scope = set(builtin_entity_scope)
    vectorizer.builtin_entity_scope = builtin_entity_scope

    vectorizer_ = vectorizer_dict["vectorizer"]
    if vectorizer_:
        vocab = vectorizer_["vocab"]
        # Rebuild the sparse idf diagonal from the stored per-term idf values
        idf_diag_data = vectorizer_["idf_diag"]
        idf_diag_data = np.array(idf_diag_data)

        idf_diag_shape = (len(idf_diag_data), len(idf_diag_data))
        row = list(range(idf_diag_shape[0]))
        col = list(range(idf_diag_shape[0]))
        idf_diag = sp.csr_matrix(
            (idf_diag_data, (row, col)), shape=idf_diag_shape)

        tfidf_transformer = TfidfTransformer()
        tfidf_transformer._idf_diag = idf_diag

        vectorizer_ = SklearnTfidfVectorizer(
            tokenizer=lambda x: tokenize_light(x, vectorizer._language))
        vectorizer_.vocabulary_ = vocab
        vectorizer_._tfidf = tfidf_transformer

    vectorizer._tfidf_vectorizer = vectorizer_
    return vectorizer
def create():
    # SamplesDatabase.set_file('samples_database.pk')
    db = SamplesDatabase.get()
    packages = db.filter(('lang', '==', 'en'))  # [:20000]

    print("creating model")
    min_df_pct = 0.002
    max_df_pct = 0.4
    min_df = int(len(packages) * min_df_pct)
    max_df = int(len(packages) * max_df_pct)

    UnStemmer.enabled = True

    tfidf_model = TfidfVectorizer(
        tokenizer=meta_data_description_tokenize,
        min_df=min_df,
        max_df=max_df,
        ngram_range=(1, 3),
        lowercase=False,  # done in tokenize function
        stop_words=get_stopwords_list())
    tfidf_model.fit(packages)

    u = UnStemmer.get()

    print("transforming data")
    tfidf_data = tfidf_model.transform(packages)

    print("saving model: ", config.TFIDFModels.description_model_2)
    # Replace each vocabulary term with its UnStemmer-resolved form before saving
    tfidf_model.vocabulary_ = {
        " ".join(map(lambda x: u.resolve(x), k.split(" "))): v
        for k, v in tfidf_model.vocabulary_.items()
    }
    pickle.dump(tfidf_model, open(config.TFIDFModels.description_model_2, "wb"))
    print("saved")

    print("saving data: ", config.TFIDFModels.description_data_2)
    save_data = {'ids': packages, 'data': tfidf_data}
    pickle.dump(save_data, open(config.TFIDFModels.description_data_2, "wb"))
    print("saved")
def get_tf_idf_testing(train_vocabulary, testing_set, vocabulary=None):
    # tokenize
    vectorizer = TfidfVectorizer(
        stop_words='english',
        analyzer='word',
        smooth_idf=False,
        # max_df=3000,
        # min_df=100,
        ngram_range=(1, 2),
        vocabulary=train_vocabulary,
        # tokenizer=LemmaAndStemTokenizer()
    )
    vectorizer.vocabulary_ = train_vocabulary
    tfidf = vectorizer.fit_transform(testing_set)
    return tfidf

# vectorizer = CountVectorizer(max_features=max_features, binary=binary)

# def get_tf_idf_training(training_set, vocabulary=None):
#     tfidf = vectorizer.fit_transform(training_set)
#     return tfidf, vectorizer.vocabulary_
#
#
# def get_tf_idf_testing(train_vocabulary, testing_set, vocabulary=None):
#     # tokenize
#     vectorizer = TfidfVectorizer(
#         stop_words='english',
#         analyzer='word',
#         vocabulary=train_vocabulary,
#         # tokenizer=LemmaAndStemTokenizer()
#     )
#     vectorizer.vocabulary_ = train_vocabulary
#     tfidf = vectorizer.fit_transform(testing_set)
#     return tfidf
def home():
    jobs_data = pandas.read_csv(
        'https://raw.githubusercontent.com/Nexus-404/object_detection_demo/master/data-6.csv'
    )
    query = request.form['query']
    job = list()
    link = list()
    descript = list()

    # Fit the vectorizer on the job descriptions and persist its components
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0,
                         stop_words='english')
    tfidf_matrix = tf.fit_transform(jobs_data['Description'])
    pickle.dump(tfidf_matrix, open("matrix.npy", "wb"))
    pickle.dump(tf.vocabulary_, open("vocabulary.pkl", "wb"))
    pickle.dump(tf.idf_, open("idf.pkl", "wb"))

    # Rebuild the vectorizer from the persisted vocabulary and idf weights
    vocabulary = pickle.load(open("vocabulary.pkl", "rb"))
    idfs = pickle.load(open("idf.pkl", "rb"))
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0,
                         stop_words='english')
    tf.vocabulary_ = vocabulary
    tf.idf_ = idfs
    tfidf_matrix = np.load("matrix.npy", allow_pickle=True)

    # Rank descriptions by cosine similarity to the query
    vector = tf.transform([query])
    cos_sim = linear_kernel(tfidf_matrix, vector)
    res = cos_sim[:, 0].argsort()[:-11:-1]
    for i in res:
        job.append(jobs_data['Jobs'][i])
        link.append(jobs_data['Job Url'][i])
        descript.append(jobs_data['Description'][i])

    return render_template("/readpdf.html", jobs=job, links=link,
                           job_description=descript, query=query)
def load_model(serialization_dir):
    with open(os.path.join(args.model, "best_hyperparameters.json"), 'r') as f:
        hyperparameters = json.load(f)
    if hyperparameters.pop('stopwords') == 1:
        stop_words = 'english'
    else:
        stop_words = None
    weight = hyperparameters.pop('weight')
    if weight == 'binary':
        binary = True
    else:
        binary = False
    ngram_range = hyperparameters.pop('ngram_range')
    ngram_range = sorted([int(x) for x in ngram_range.split()])
    if weight == 'tf-idf':
        vect = TfidfVectorizer(stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    else:
        vect = CountVectorizer(binary=binary,
                               stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    with open(os.path.join(args.model, "vocab.json"), 'r') as f:
        vocab = json.load(f)
    vect.vocabulary_ = vocab
    hyperparameters['C'] = float(hyperparameters['C'])
    hyperparameters['tol'] = float(hyperparameters['tol'])
    classifier = LogisticRegression(**hyperparameters)
    if os.path.exists(os.path.join(serialization_dir, "archive", "idf.npy")):
        vect.idf_ = np.load(
            os.path.join(serialization_dir, "archive", "idf.npy"))
    classifier.coef_ = np.load(
        os.path.join(serialization_dir, "archive", "coef.npy"))
    classifier.intercept_ = np.load(
        os.path.join(serialization_dir, "archive", "intercept.npy"))
    classifier.classes_ = np.load(
        os.path.join(serialization_dir, "archive", "classes.npy"))
    return classifier, vect
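A sketch of the save side that would produce the archive files read above; the file names and layout mirror the loader, but the helper itself is hypothetical, and for simplicity it writes vocab.json under serialization_dir rather than the args.model directory the loader reads from:

import json
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def save_model(serialization_dir, vect, classifier):
    archive = os.path.join(serialization_dir, "archive")
    os.makedirs(archive, exist_ok=True)
    # Vocabulary as JSON (indices cast to int so they serialize cleanly)
    with open(os.path.join(serialization_dir, "vocab.json"), "w") as f:
        json.dump({term: int(idx) for term, idx in vect.vocabulary_.items()}, f)
    # idf weights only exist for the tf-idf variant
    if isinstance(vect, TfidfVectorizer):
        np.save(os.path.join(archive, "idf.npy"), vect.idf_)
    np.save(os.path.join(archive, "coef.npy"), classifier.coef_)
    np.save(os.path.join(archive, "intercept.npy"), classifier.intercept_)
    np.save(os.path.join(archive, "classes.npy"), classifier.classes_)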
def load_model_info(model_info):
    """Return a longform model from a model info JSON object.

    Parameters
    ----------
    model_info : dict
        The JSON object containing the attributes of a model.

    Returns
    -------
    longform_model : py:class:`adeft.classify.AdeftClassifier`
        The classifier that was loaded from the given JSON object.
    """
    shortforms = model_info['shortforms']
    pos_labels = model_info['pos_labels']
    longform_model = AdeftClassifier(shortforms=shortforms,
                                     pos_labels=pos_labels)
    ngram_range = model_info['tfidf']['ngram_range']
    tfidf = TfidfVectorizer(ngram_range=ngram_range, stop_words='english')
    logit = LogisticRegression(multi_class='auto')

    tfidf.vocabulary_ = model_info['tfidf']['vocabulary_']
    tfidf.idf_ = model_info['tfidf']['idf_']
    logit.classes_ = np.array(model_info['logit']['classes_'], dtype='<U64')
    logit.intercept_ = np.array(model_info['logit']['intercept_'])
    logit.coef_ = np.array(model_info['logit']['coef_'])

    estimator = Pipeline([('tfidf', tfidf), ('logit', logit)])
    longform_model.estimator = estimator
    # Load model statistics if they are available
    if 'stats' in model_info:
        longform_model.stats = model_info['stats']
    # Load standard deviations for calculating feature importances
    # if they are available
    if 'std' in model_info:
        longform_model._std = np.array(model_info['std'])
    return longform_model
def train_tfidf(self, tokenizer='custom', corpus='news'):
    if tokenizer == 'custom':
        tokenizer = self.tokenize
    nltk_corpus = []
    if corpus == 'all':
        nltk_corpus += [nltk.corpus.gutenberg.raw(f_id)
                        for f_id in nltk.corpus.gutenberg.fileids()]
        nltk_corpus += [nltk.corpus.webtext.raw(f_id)
                        for f_id in nltk.corpus.webtext.fileids()]
        nltk_corpus += [nltk.corpus.brown.raw(f_id)
                        for f_id in nltk.corpus.brown.fileids()]
        nltk_corpus += [nltk.corpus.reuters.raw(f_id)
                        for f_id in nltk.corpus.reuters.fileids()]
    elif corpus == 'news':
        nltk_corpus += self.get_bbc_news_corpus()
    if self.verbose:
        print "LENGTH of nltk corpus: {}".format(sum([len(d) for d in nltk_corpus]))

    vectorizer = TfidfVectorizer(max_df=1.0, min_df=2, encoding='utf-8',
                                 decode_error='strict', max_features=None,
                                 stop_words='english', ngram_range=(1, 3),
                                 norm='l2', tokenizer=tokenizer,
                                 use_idf=True, sublinear_tf=False)
    # vectorizer.fit_transform(nltk_corpus)
    vectorizer.fit(nltk_corpus)

    # Avoid having to pickle instance methods; we will set this method on load
    vectorizer.tokenizer = None

    keys = np.array(vectorizer.vocabulary_.keys(), dtype=str)
    values = np.array(vectorizer.vocabulary_.values(), dtype=int)
    stop_words = np.array(list(vectorizer.stop_words_), dtype=str)

    with tables.openFile(self.data_path + 'tfidf_keys.hdf', 'w') as f:
        atom = tables.Atom.from_dtype(keys.dtype)
        ds = f.createCArray(f.root, 'keys', atom, keys.shape)
        ds[:] = keys
    with tables.openFile(self.data_path + 'tfidf_values.hdf', 'w') as f:
        atom = tables.Atom.from_dtype(values.dtype)
        ds = f.createCArray(f.root, 'values', atom, values.shape)
        ds[:] = values
    with tables.openFile(self.data_path + 'tfidf_stop_words.hdf', 'w') as f:
        atom = tables.Atom.from_dtype(stop_words.dtype)
        ds = f.createCArray(f.root, 'stop_words', atom, stop_words.shape)
        ds[:] = stop_words

    # Null out the learned attributes (stored separately above) so the pickled
    # vectorizer stays small, then restore them after dumping.
    vectorizer.vocabulary_ = None
    vectorizer.stop_words_ = None
    with open(self.data_path + 'tfidf.pkl', 'wb') as fin:
        cPickle.dump(vectorizer, fin)
    vectorizer.vocabulary_ = dict(zip(keys, values))
    vectorizer.stop_words_ = stop_words
    return vectorizer
        'ngram_range': (1, 1),
        'norm': 'l2',
        'preprocessor': None,
        'smooth_idf': True,
        'stop_words': 'english',
        'strip_accents': None,
        'sublinear_tf': False,
        'token_pattern': '(?u)\\b\\w\\w+\\b',
        'tokenizer': None,
        'use_idf': True,
        'vocabulary': None
    })
vectorizer.idf_ = np.fromfile('idf.npy')
with open('vocabulary.json') as f:
    vectorizer.vocabulary_ = json.load(f)
n_features = len(vectorizer.idf_)

clf = LogisticRegression(
    **{
        'C': 1.0,
        'class_weight': None,
        'dual': False,
        'fit_intercept': True,
        'intercept_scaling': 1,
        'max_iter': 100,
        'multi_class': 'warn',
        'n_jobs': None,
        'penalty': 'l2',
        'random_state': 0,