def recommender(query):
    # load vocab and idfs and data
    # df = pd.read_csv("song_data.csv")
    vocabulary = pickle.load(open("vocabulary.pkl", "rb"))
    idfs = pickle.load(open("idf.pkl", "rb"))
    tfidf_matrix = pickle.load(open("matrix.pkl", "rb"))
    # tfidf_matrix = np.load("tfidf_matrix.npy")

    # reform the vectorizer
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0,
                         stop_words='english')
    tf.vocabulary_ = vocabulary
    tf.idf_ = idfs

    # query vector
    vector = tf.transform([query])
    print(vector.shape)
    print(tfidf_matrix.shape)

    # similarity
    cos_sim = linear_kernel(tfidf_matrix, vector)
    res = cos_sim[:, 0].argsort()[:-6:-1]

    # prediction list
    # pred = [df['songname'][i] for i in res]
    print(res)
    return res
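# Illustrative call (not from the original source): maps the returned indices
# back to song names via the same song_data.csv the commented-out lines above
# refer to; the query string is made up for demonstration.
df = pd.read_csv("song_data.csv")
top_indices = recommender("acoustic love ballad")
print([df['songname'][i] for i in top_indices])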
def fdt_setup(request):
    global __model, __generator
    # create a temp model
    __generator = Faker()
    vocab = list(set(__generator.words(500)))
    vectorizer = TfidfVectorizer(vocabulary=vocab)
    vectorizer.idf_ = [randint(1, 15) for i in range(len(vocab))]
    model = {
        "vocab": vocab,
        "stop_words": list(set(__generator.words(10))),
        "intercept": [1],
        "idf_": vectorizer.idf_,
        "lr": [random() for i in range(len(vocab))],
        "vectorizer": [vectorizer]
    }
    np.savez('/tmp/model', **model)
    __model = Loader('/tmp').load_model('model')

    def teardown():
        # declare the globals here as well, otherwise these assignments
        # would only create locals inside teardown
        global __model, __generator
        __model = None
        __generator = None
        # delete model file
        os.remove("/tmp/model.npz")

    request.addfinalizer(teardown)
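# fdt_setup takes pytest's `request` object, so it is presumably registered as a
# fixture in the original test suite; a hypothetical sketch of doing so (the
# fixture name `fdt` is illustrative, not taken from the source):
@pytest.fixture
def fdt(request):
    fdt_setup(request)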
def from_pretrained(cls, normal_set="default", abb_expander="default"):
    model_dir = DEFAULT_DNORM_PATH
    if not model_dir.parent.is_dir():
        model_dir.parent.mkdir()
    if not model_dir.is_dir():
        model_dir.mkdir()
    if not (model_dir / "normal_set.txt").is_file():
        download_fileobj(BASE_URL + "/normal_set.txt",
                         model_dir / "normal_set.txt")
    if not (model_dir / "W_EHR_all.npz").is_file():
        download_fileobj(BASE_URL + "/W_EHR_all.npz",
                         model_dir / "W_EHR_all.npz")
    if not (model_dir / "EHR_idf3.pkl").is_file():
        download_fileobj(BASE_URL + "/EHR_idf3.pkl",
                         model_dir / "EHR_idf3.pkl")
    if not (model_dir / "abb_dic.csv").is_file():
        download_fileobj(BASE_URL + "/abb_dic.csv",
                         model_dir / "abb_dic.csv")

    mecab = MeCab.Tagger('-Owakati')
    tokenizer = Tokenizer(mecab.parse, lambda s: s[:-1])

    if abb_expander == "default":
        converter = Converter(str(model_dir / "abb_dic.csv")).convert
    elif isinstance(abb_expander, str):
        converter = Converter(abb_expander).convert
    elif callable(abb_expander):
        converter = abb_expander
    else:
        converter = None

    if normal_set == "default":
        normal_set = str(model_dir / "normal_set.txt")
    normal_set = load_normal_set(normal_set)

    tfidf = TfidfVectorizer(analyzer=lambda s: s.split(' '))
    with open(str(model_dir / "EHR_idf3.pkl"), 'rb') as f:
        params = pickle.load(f)
    tfidf.set_params(**params['params'])
    tfidf.vocabulary_ = params['voc']
    tfidf.idf_ = params['idf']
    """
    with open(str(model_dir / "EHR_idf.pkl"), 'rb') as f:
        tfidf = pickle.load(f)
    """
    model = cls(tfidf, normal_set, tokenizer.tokenizer, converter)
    model.load(str(model_dir / "W_EHR_all.npz"))
    return model
def test_tfidf_vectorizer_setter():
    orig = TfidfVectorizer(use_idf=True)
    orig.fit(JUNK_FOOD_DOCS)
    copy = TfidfVectorizer(vocabulary=orig.vocabulary_, use_idf=True)
    copy.idf_ = orig.idf_
    assert_array_equal(
        copy.transform(JUNK_FOOD_DOCS).toarray(),
        orig.transform(JUNK_FOOD_DOCS).toarray())
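# Note (not in the original test): in the scikit-learn versions that expose an
# idf_ setter, the assignment above only succeeds because `copy` was constructed
# with orig.vocabulary_; the setter raises a ValueError when len(idf_) does not
# match the vocabulary size.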
def get_fitted_tv():
    with open(project_path / "output/news/tv_vocabulary_.txt") as f:
        vocabulary_ = {
            line.split()[0]: int(line.split()[1])
            for line in f.readlines()
        }
    with open(project_path / "output/news/tv_stop_words_.txt") as f:
        # strip the trailing newlines so the stop words match tokens
        stop_words_ = [line.strip() for line in f.readlines()]
    idf_ = np.loadtxt(project_path / "output/news/tv_idf_.txt")
    tv = TfidfVectorizer(stop_words=stop_words_, vocabulary=vocabulary_)
    tv.idf_ = idf_
    return tv
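# A minimal usage sketch (assumes the output/news/ artifact files above exist;
# the sample text is illustrative only):
tv = get_fitted_tv()
doc_vectors = tv.transform(["example news article text"])
print(doc_vectors.shape)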
def load_model_info(model_info):
    """Return a longform model from a model info JSON object.

    Parameters
    ----------
    model_info : dict
        The JSON object containing the attributes of a model.

    Returns
    -------
    longform_model : py:class:`adeft.classify.AdeftClassifier`
        The classifier that was loaded from the given JSON object.
    """
    shortforms = model_info['shortforms']
    pos_labels = model_info['pos_labels']
    longform_model = AdeftClassifier(shortforms=shortforms,
                                     pos_labels=pos_labels)
    ngram_range = model_info['tfidf']['ngram_range']
    tfidf = TfidfVectorizer(ngram_range=ngram_range, stop_words='english')
    logit = LogisticRegression(multi_class='auto')

    tfidf.vocabulary_ = model_info['tfidf']['vocabulary_']
    tfidf.idf_ = model_info['tfidf']['idf_']
    logit.classes_ = np.array(model_info['logit']['classes_'], dtype='<U64')
    logit.intercept_ = np.array(model_info['logit']['intercept_'])
    logit.coef_ = np.array(model_info['logit']['coef_'])

    estimator = Pipeline([('tfidf', tfidf), ('logit', logit)])
    longform_model.estimator = estimator

    # These attributes do not exist in older adeft models.
    # For backwards compatibility we check if they are present
    if 'stats' in model_info:
        longform_model.stats = model_info['stats']
    if 'std' in model_info:
        longform_model._std = np.array(model_info['std'])
    if 'timestamp' in model_info:
        longform_model.timestamp = model_info['timestamp']
    if 'training_set_digest' in model_info:
        longform_model.training_set_digest = model_info['training_set_digest']
    if 'params' in model_info:
        longform_model.params = model_info['params']
    if 'version' in model_info:
        # was `==` in the original, which is a no-op comparison
        longform_model.version = model_info['version']
    if 'confusion_info' in model_info:
        longform_model.confusion_info = model_info['confusion_info']
    if 'other_metadata' in model_info:
        longform_model.other_metadata = model_info['other_metadata']
    return longform_model
def home():
    jobs_data = pandas.read_csv(
        'https://raw.githubusercontent.com/Nexus-404/object_detection_demo/master/data-6.csv'
    )
    query = request.form['query']
    job = list()
    link = list()
    descript = list()

    # fit the vectorizer on the job descriptions and persist its state
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0,
                         stop_words='english')
    tfidf_matrix = tf.fit_transform(jobs_data['Description'])
    pickle.dump(tfidf_matrix, open("matrix.npy", "wb"))
    pickle.dump(tf.vocabulary_, open("vocabulary.pkl", "wb"))
    pickle.dump(tf.idf_, open("idf.pkl", "wb"))

    # rebuild a vectorizer from the persisted vocabulary and idf values
    vocabulary = pickle.load(open("vocabulary.pkl", "rb"))
    idfs = pickle.load(open("idf.pkl", "rb"))
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0,
                         stop_words='english')
    tf.vocabulary_ = vocabulary
    tf.idf_ = idfs
    # matrix.npy actually holds a pickle, so np.load falls back to pickle.load
    tfidf_matrix = np.load("matrix.npy", allow_pickle=True)

    # rank the ten most similar job descriptions for the query
    vector = tf.transform([query])
    cos_sim = linear_kernel(tfidf_matrix, vector)
    res = cos_sim[:, 0].argsort()[:-11:-1]
    for i in res:
        job.append(jobs_data['Jobs'][i])
        link.append(jobs_data['Job Url'][i])
        descript.append(jobs_data['Description'][i])
    return render_template("/readpdf.html", jobs=job, links=link,
                           job_description=descript, query=query)
def load_model(serialization_dir):
    with open(os.path.join(args.model, "best_hyperparameters.json"), 'r') as f:
        hyperparameters = json.load(f)
    if hyperparameters.pop('stopwords') == 1:
        stop_words = 'english'
    else:
        stop_words = None
    weight = hyperparameters.pop('weight')
    if weight == 'binary':
        binary = True
    else:
        binary = False
    ngram_range = hyperparameters.pop('ngram_range')
    ngram_range = sorted([int(x) for x in ngram_range.split()])
    if weight == 'tf-idf':
        vect = TfidfVectorizer(stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    else:
        vect = CountVectorizer(binary=binary,
                               stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    with open(os.path.join(args.model, "vocab.json"), 'r') as f:
        vocab = json.load(f)
    vect.vocabulary_ = vocab
    hyperparameters['C'] = float(hyperparameters['C'])
    hyperparameters['tol'] = float(hyperparameters['tol'])
    classifier = LogisticRegression(**hyperparameters)
    if os.path.exists(os.path.join(serialization_dir, "archive", "idf.npy")):
        vect.idf_ = np.load(
            os.path.join(serialization_dir, "archive", "idf.npy"))
    classifier.coef_ = np.load(
        os.path.join(serialization_dir, "archive", "coef.npy"))
    classifier.intercept_ = np.load(
        os.path.join(serialization_dir, "archive", "intercept.npy"))
    classifier.classes_ = np.load(
        os.path.join(serialization_dir, "archive", "classes.npy"))
    return classifier, vect
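# A minimal usage sketch (assumes `args.model` and `serialization_dir` point at
# the same trained-model directory; the input text is illustrative only):
classifier, vect = load_model(args.model)
print(classifier.predict(vect.transform(["an example document to classify"])))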
def load_model_info(model_info):
    """Return a longform model from a model info JSON object.

    Parameters
    ----------
    model_info : dict
        The JSON object containing the attributes of a model.

    Returns
    -------
    longform_model : py:class:`adeft.classify.AdeftClassifier`
        The classifier that was loaded from the given JSON object.
    """
    shortforms = model_info['shortforms']
    pos_labels = model_info['pos_labels']
    longform_model = AdeftClassifier(shortforms=shortforms,
                                     pos_labels=pos_labels)
    ngram_range = model_info['tfidf']['ngram_range']
    tfidf = TfidfVectorizer(ngram_range=ngram_range, stop_words='english')
    logit = LogisticRegression(multi_class='auto')

    tfidf.vocabulary_ = model_info['tfidf']['vocabulary_']
    tfidf.idf_ = model_info['tfidf']['idf_']
    logit.classes_ = np.array(model_info['logit']['classes_'], dtype='<U64')
    logit.intercept_ = np.array(model_info['logit']['intercept_'])
    logit.coef_ = np.array(model_info['logit']['coef_'])

    estimator = Pipeline([('tfidf', tfidf), ('logit', logit)])
    longform_model.estimator = estimator

    # Load model statistics if they are available
    if 'stats' in model_info:
        longform_model.stats = model_info['stats']
    # Load standard deviations for calculating feature importances
    # if they are available
    if 'std' in model_info:
        longform_model._std = np.array(model_info['std'])
    return longform_model
def predict_label(signatures, prob3_path, vocabulary_path, idf_path,
                  cluster_centers_path, clasnum2labels_path):
    func_names = []
    func_types = []
    func_comments = []
    for name, types, comments in (
            x for x in map(lambda l: l.strip().split("|"), signatures)):
        func_types.append(types.split(","))
        func_comments.append(comments.split(","))
        func_names.append(name)

    def tokenize_funcs(funcs):
        oneword = re.compile(r"^[a-z][a-z0-9]+|[A-Z][A-Z0-9]$")
        difCase = re.compile(
            r".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)")
        under_scores_split = re.compile(r"_")
        tokenized_funcs = []
        for f in funcs:
            if oneword.fullmatch(f):
                tokenized_funcs.append([f])
            elif "_" in f:
                tokenized_funcs.append(
                    [w for w in under_scores_split.split(f) if w])
            else:
                tokenized_funcs.append(
                    [w.group(0) for w in difCase.finditer(f) if w.group(0)])
        return tokenized_funcs

    tokenized_func_names = tokenize_funcs(func_names)
    tokenized_func_names = [
        tok_name + tok_comm
        for tok_name, tok_comm in zip(tokenized_func_names, func_comments)
    ]

    def drop_wrong_symbols(tokenized_func_names):
        # first approach to drop all digits, second only if > 50%
        wrong_char = re.compile(r"[\d]")
        tokenized_func_names_ = []
        for tokenized_name in tokenized_func_names:
            processed_tokens = [
                wrong_char.sub("", token).lower() for token in tokenized_name
                if wrong_char.sub("", token)
            ]
            tokenized_func_names_.append(processed_tokens)
        return tokenized_func_names_

    tokenized_func_names = drop_wrong_symbols(tokenized_func_names)

    with open(prob3_path, "rb") as f:
        prob3 = pickle.load(f)

    def split(word, start=1, end=20):
        return ((word[:i], word[i:])
                for i in range(start, min(len(word) + 1, end)))

    @functools.lru_cache(maxsize=10000)
    def segment(word, maxlen=500):
        if not word:
            return []
        if len(word) > maxlen:
            return segment(word[:maxlen]) + segment(word[maxlen:])
        candidates = ([first] + segment(remaining)
                      for first, remaining in split(word))
        return max(
            candidates,
            key=lambda x: functools.reduce(operator.__mul__, map(prob3, x), 1))

    def segmentize_corpus(tokenized_func_names, segmenter):
        tokenized_func_names = [
            list(it.chain(*(segmenter(token) for token in tokens)))
            for tokens in tokenized_func_names
        ]
        return tokenized_func_names

    tokenized_func_names = segmentize_corpus(tokenized_func_names, segment)

    def lemmatize_corpus(tokenized_func_names):
        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
        lengths = np.cumsum([0] + list(map(len, tokenized_func_names)))
        flat_tokens = list(it.chain(*tokenized_func_names))
        doc = spacy.tokens.Doc(nlp.vocab, words=flat_tokens)
        tokenized_func_names = [[
            token.lemma_ for token in doc[lengths[i - 1]:lengths[i]]
        ] for i in range(1, len(tokenized_func_names) + 1)]
        return tokenized_func_names

    tokenized_func_names = lemmatize_corpus(tokenized_func_names)

    with open(vocabulary_path, "rb") as f:
        vocab = pickle.load(f)

    def prune_names(tokenized_func_names, vocab):
        tokenized_func_names_ = []
        for tokenized_name in tokenized_func_names:
            processed_tokens = [
                token for token in tokenized_name if token in vocab
            ]
            tokenized_func_names_.append(processed_tokens)
        return tokenized_func_names_

    tokenized_func_names = prune_names(tokenized_func_names,
                                       set(vocab.keys()))

    def tokenize_types(func_types):
        type_set = {
            "int", "unsigned int", "char", "unsigned char", "enum", "struct",
            "void", "long", "unsigned long", "float", "double", "short",
            "unsigned short", "bool", "union", "long long",
            "unsigned long long"
        }
        type_dict = {re.compile(t): t for t in type_set}
        re_drop = re.compile(r"\*|restrict|const")
        struct_type = re.compile("struct")
        tokenized_types = [[0 for _ in range(len(f_types))]
                           for f_types in func_types]
        for i, f_types in enumerate(func_types):
            for j, type in enumerate(f_types):
                cleaned_type = re_drop.sub("", type)
                for re_t, t in type_dict.items():
                    if re.search(re_t, cleaned_type):
                        tokenized_types[i][j] = t
                        break
                else:
                    tokenized_types[i][j] = type_dict[struct_type]
        return tokenized_types

    tokenized_func_types = tokenize_types(func_types)
    tokenized_features = [
        tok_name + tok_types for tok_name, tok_types in zip(
            tokenized_func_names, tokenized_func_types)
    ]

    idf = np.load(idf_path)
    tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x,
                                       lowercase=False,
                                       sublinear_tf=True,
                                       vocabulary=vocab)
    tfidf_vectorizer.idf_ = idf
    tfidf_matrix = tfidf_vectorizer.transform(tokenized_features)

    centers = np.load(cluster_centers_path)
    model = KMeans(centers.shape[0])
    model._n_threads = 1
    model.cluster_centers_ = centers
    cluster_nums = model.predict(tfidf_matrix)

    with open(clasnum2labels_path, "rb") as f:
        clasnum2labels = pickle.load(f)
    labels = [clasnum2labels[c] for c in cluster_nums]
    return {n: l for n, l in zip(func_names, labels) if l != "unknown"}
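# Illustrative call (not from the original source): each signature string follows
# the "name|comma-separated types|comma-separated comment tokens" layout parsed
# above, and the artifact paths are placeholders for files written at training time.
signatures = ["parseHttpHeader|char *,int|parse,http,header"]
labels = predict_label(signatures, "prob3.pkl", "vocabulary.pkl", "idf.npy",
                       "cluster_centers.npy", "clasnum2labels.pkl")
print(labels)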
        'max_features': None,
        'min_df': 1,
        'ngram_range': (1, 1),
        'norm': 'l2',
        'preprocessor': None,
        'smooth_idf': True,
        'stop_words': 'english',
        'strip_accents': None,
        'sublinear_tf': False,
        'token_pattern': '(?u)\\b\\w\\w+\\b',
        'tokenizer': None,
        'use_idf': True,
        'vocabulary': None
    })
vectorizer.idf_ = np.fromfile('idf.npy')
with open('vocabulary.json') as f:
    vectorizer.vocabulary_ = json.load(f)
n_features = len(vectorizer.idf_)

clf = LogisticRegression(
    **{
        'C': 1.0,
        'class_weight': None,
        'dual': False,
        'fit_intercept': True,
        'intercept_scaling': 1,
        'max_iter': 100,
        'multi_class': 'warn',
        'n_jobs': None,