def recommender(query):

    # Load the saved vocabulary, idf weights and tf-idf matrix
    # df = pd.read_csv("song_data.csv")
    vocabulary = pickle.load(open("vocabulary.pkl", "rb"))
    idfs = pickle.load(open("idf.pkl", "rb"))
    tfidf_matrix = pickle.load(open("matrix.pkl", "rb"))
    # tfidf_matrix = np.load("tfidf_matrix.npy")

    # Rebuild the vectorizer from its saved components
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
    tf.vocabulary_ = vocabulary
    tf.idf_ = idfs

    # Vectorize the query
    vector = tf.transform([query])

    print(vector.shape)
    print(tfidf_matrix.shape)

    # Cosine similarity between the query and every document
    cos_sim = linear_kernel(tfidf_matrix, vector)
    res = cos_sim[:, 0].argsort()[:-6:-1]

    # Top-5 prediction indices
    # pred = [df['songname'][i] for i in res]
    print(res)

    return res
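For reference, a minimal sketch of how the three pickled artifacts loaded above could have been produced; the docs corpus here is a hypothetical list of song-lyric strings, and the vectorizer settings mirror recommender() (min_df=1 is used, which is equivalent to min_df=0 for integer thresholds):

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["lyrics of the first song", "lyrics of the second song"]  # placeholder corpus

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(docs)

# Persist exactly the pieces recommender() reloads
pickle.dump(tf.vocabulary_, open("vocabulary.pkl", "wb"))
pickle.dump(tf.idf_, open("idf.pkl", "wb"))
pickle.dump(tfidf_matrix, open("matrix.pkl", "wb"))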
Example #2
    def from_pretrained(cls, normal_set="default", abb_expander="default"):

        model_dir = DEFAULT_DNORM_PATH

        if not model_dir.parent.is_dir():
            model_dir.parent.mkdir()

        if not model_dir.is_dir():
            model_dir.mkdir()

        if not (model_dir / "normal_set.txt").is_file():
            download_fileobj(BASE_URL + "/normal_set.txt",
                             model_dir / "normal_set.txt")

        if not (model_dir / "W_EHR_all.npz").is_file():
            download_fileobj(BASE_URL + "/W_EHR_all.npz",
                             model_dir / "W_EHR_all.npz")

        if not (model_dir / "EHR_idf3.pkl").is_file():
            download_fileobj(BASE_URL + "/EHR_idf3.pkl",
                             model_dir / "EHR_idf3.pkl")

        if not (model_dir / "abb_dic.csv").is_file():
            download_fileobj(BASE_URL + "/abb_dic.csv",
                             model_dir / "abb_dic.csv")

        mecab = MeCab.Tagger('-Owakati')
        tokenizer = Tokenizer(mecab.parse, lambda s: s[:-1])

        if abb_expander == "default":
            converter = Converter(str(model_dir / "abb_dic.csv")).convert
        elif isinstance(abb_expander, str):
            converter = Converter(abb_expander).convert
        elif callable(abb_expander):
            converter = abb_expander
        else:
            converter = None

        if normal_set == "default":
            normal_set = str(model_dir / "normal_set.txt")

        normal_set = load_normal_set(normal_set)

        tfidf = TfidfVectorizer(analyzer=lambda s: s.split(' '))

        with open(str(model_dir / "EHR_idf3.pkl"), 'rb') as f:
            params = pickle.load(f)
        tfidf.set_params(**params['params'])
        tfidf.vocabulary_ = params['voc']
        tfidf.idf_ = params['idf']
        """
        with open(str(model_dir / "EHR_idf.pkl"), 'rb') as f:
            tfidf = pickle.load(f)
        """

        model = cls(tfidf, normal_set, tokenizer.tokenizer, converter)
        model.load(str(model_dir / "W_EHR_all.npz"))

        return model
Example #3
    def deserialize(self):
        idfs = np.asarray(self.obj['idf'])
        vectorizer = TfidfVectorizer(**self.obj['params'])
        # Monkey patch in order to indirectly fit a tfidf vectorizer.
        vectorizer._tfidf._idf_diag = sp.spdiags(idfs,
                                                 diags=0,
                                                 m=len(idfs),
                                                 n=len(idfs))
        vectorizer.vocabulary_ = self.obj['vocabulary']
        return vectorizer
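The self.obj dictionary consumed above is straightforward to build from a fitted vectorizer. A hypothetical counterpart, not part of the original class, might look like this (note that get_params() can contain non-JSON values such as callables, so the dictionary is intended for pickling rather than JSON):

    def serialize(self, vectorizer):
        # Capture the three pieces deserialize() expects:
        # per-term idf weights, constructor parameters and the fitted vocabulary.
        self.obj = {
            'idf': vectorizer.idf_.tolist(),
            'params': vectorizer.get_params(),
            'vocabulary': vectorizer.vocabulary_,
        }
        return self.obj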
Example #4
def main():
    np.random.seed(args.seed)

    print("Reading params.yaml...")
    params = yaml.safe_load(open("params.yaml"))["train"][args.model]

    print("Reading training set...")
    with open(args.sentences_file, "r") as f:
        corpus = f.readlines()

    out_dir = Path(args.output_dir)
    os.makedirs(out_dir, exist_ok=True)

    if args.model == "tf_idf":
        model = TfidfVectorizer(**params["init_kwargs"])
        print("Training model...")
        model.fit(corpus)
        # hack: https://github.com/scikit-learn/scikit-learn/issues/18669
        model.vocabulary_ = OrderedDict(
            sorted(model.vocabulary_.items(), key=lambda kv: kv[1]))
        model._stop_words_id = 0
        print("Saving model to disk...")
        with (out_dir / "model.pkl").open("wb") as f:
            pickle.dump(model, f)
    elif args.model == "count":
        model = CountVectorizer(**params["init_kwargs"])
        print("Training model...")
        model.fit(corpus)
        # hack: https://github.com/scikit-learn/scikit-learn/issues/18669
        model.vocabulary_ = OrderedDict(
            sorted(model.vocabulary_.items(), key=lambda kv: kv[1]))
        model._stop_words_id = 0
        print("Saving model to disk...")
        with (out_dir / "model.pkl").open("wb") as f:
            pickle.dump(model, f)
    else:
        raise ValueError(f"Training not available for model {args.model}!")

    print("Training completed!")
Example #5
def load_tfidf(vocab_path, idf_weights_path):
    """Load a tf-idf vectorizer from its components.

    :param str vocab_path: path to the vectorizer vocabulary JSON.
    :param str idf_weights_path: path to the idf weights JSON.
    :rtype: sklearn.feature_extraction.text.TfidfVectorizer
    """
    tfidf = TfidfVectorizer(analyzer=lambda x: x,
                            vocabulary=json.load(open(vocab_path)))
    idf_vector = np.array(json.load(open(idf_weights_path)))
    tfidf._tfidf._idf_diag = scipy.sparse.diags([idf_vector], [0])
    tfidf.vocabulary_ = tfidf.vocabulary
    return tfidf
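Because the vectorizer is built with analyzer=lambda x: x, it expects pre-tokenized documents. A hypothetical usage (file names illustrative, and assuming the older scikit-learn these snippets target, where _tfidf exists before fitting):

tfidf = load_tfidf("vocab.json", "idf_weights.json")
X = tfidf.transform([["some", "pre", "tokenized", "document"]])
print(X.shape)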
Example #6
def load_model(path=FILES_LOCATION):
    r = [0] * 6
    m = [0] * 6
    for i in range(6):
        r[i] = np.mat(np.load(path + "r" + str(i) + ".npy"))
        m[i] = joblib.load(path + "m" + str(i) + ".sav")

    # TF-IDF vectorizer
    vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize,
                          min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
                          smooth_idf=1, sublinear_tf=1)
    vec._tfidf._idf_diag = load_obj("idf_diag")  # sp.spdiags(idfs, diags = 0, m = len(idfs), n = len(idfs))
    vec.vocabulary_ = load_obj("vocabulary")
    return vec, m, r
Example #7
def load_model_info(model_info):
    """Return a longform model from a model info JSON object.

    Parameters
    ----------
    model_info : dict
        The JSON object containing the attributes of a model.

    Returns
    -------
    longform_model : py:class:`adeft.classify.AdeftClassifier`
        The classifier that was loaded from the given JSON object.
    """
    shortforms = model_info['shortforms']
    pos_labels = model_info['pos_labels']
    longform_model = AdeftClassifier(shortforms=shortforms,
                                     pos_labels=pos_labels)
    ngram_range = model_info['tfidf']['ngram_range']
    tfidf = TfidfVectorizer(ngram_range=ngram_range, stop_words='english')
    logit = LogisticRegression(multi_class='auto')

    tfidf.vocabulary_ = model_info['tfidf']['vocabulary_']
    tfidf.idf_ = model_info['tfidf']['idf_']
    logit.classes_ = np.array(model_info['logit']['classes_'], dtype='<U64')
    logit.intercept_ = np.array(model_info['logit']['intercept_'])
    logit.coef_ = np.array(model_info['logit']['coef_'])

    estimator = Pipeline([('tfidf', tfidf), ('logit', logit)])
    longform_model.estimator = estimator
    # These attributes do not exist in older adeft models.
    # For backwards compatibility we check if they are present
    if 'stats' in model_info:
        longform_model.stats = model_info['stats']
    if 'std' in model_info:
        longform_model._std = np.array(model_info['std'])
    if 'timestamp' in model_info:
        longform_model.timestamp = model_info['timestamp']
    if 'training_set_digest' in model_info:
        longform_model.training_set_digest = model_info['training_set_digest']
    if 'params' in model_info:
        longform_model.params = model_info['params']
    if 'version' in model_info:
        longform_model.version = model_info['version']
    if 'confusion_info' in model_info:
        longform_model.confusion_info = model_info['confusion_info']
    if 'other_metadata' in model_info:
        longform_model.other_metadata = model_info['other_metadata']
    return longform_model
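Reading off the keys accessed above, a model_info object has roughly the following shape; the values shown are illustrative placeholders, and only the keys outside the trailing if-blocks are required:

model_info = {
    'shortforms': ['ER'],
    'pos_labels': ['HGNC:3467'],
    'tfidf': {
        'ngram_range': [1, 2],
        'vocabulary_': {'estrogen': 0, 'receptor': 1},
        'idf_': [1.0, 1.0],
    },
    'logit': {
        'classes_': ['HGNC:3467', 'ungrounded'],
        'intercept_': [0.0],
        'coef_': [[0.0, 0.0]],
    },
    # Optional keys: 'stats', 'std', 'timestamp', 'training_set_digest',
    # 'params', 'version', 'confusion_info', 'other_metadata'
}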
Example #8
    def from_path(cls, path, **shared):
        import numpy as np
        import scipy.sparse as sp
        from sklearn.feature_extraction.text import (TfidfTransformer,
                                                     TfidfVectorizer as
                                                     SklearnTfidfVectorizer)

        path = Path(path)

        model_path = path / "vectorizer.json"
        if not model_path.exists():
            raise LoadingError("Missing vectorizer model file: %s" %
                               model_path.name)
        with model_path.open("r", encoding="utf-8") as f:
            vectorizer_dict = json.load(f)

        vectorizer = cls(vectorizer_dict["config"], **shared)
        vectorizer._language = vectorizer_dict["language_code"]

        builtin_entity_scope = vectorizer_dict["builtin_entity_scope"]
        if builtin_entity_scope is not None:
            builtin_entity_scope = set(builtin_entity_scope)
        vectorizer.builtin_entity_scope = builtin_entity_scope

        vectorizer_ = vectorizer_dict["vectorizer"]
        if vectorizer_:
            vocab = vectorizer_["vocab"]
            idf_diag_data = vectorizer_["idf_diag"]
            idf_diag_data = np.array(idf_diag_data)

            idf_diag_shape = (len(idf_diag_data), len(idf_diag_data))
            row = list(range(idf_diag_shape[0]))
            col = list(range(idf_diag_shape[0]))
            idf_diag = sp.csr_matrix((idf_diag_data, (row, col)),
                                     shape=idf_diag_shape)

            tfidf_transformer = TfidfTransformer()
            tfidf_transformer._idf_diag = idf_diag

            vectorizer_ = SklearnTfidfVectorizer(
                tokenizer=lambda x: tokenize_light(x, vectorizer._language))
            vectorizer_.vocabulary_ = vocab

            vectorizer_._tfidf = tfidf_transformer

        vectorizer._tfidf_vectorizer = vectorizer_
        return vectorizer
Example #9
def create():
    # SamplesDatabase.set_file('samples_database.pk')

    db = SamplesDatabase.get()
    packages = db.filter(('lang', '==', 'en'))  # [:20000]

    print("creating model")

    min_df_pct = 0.002
    max_df_pct = 0.4

    min_df = int(len(packages) * min_df_pct)
    max_df = int(len(packages) * max_df_pct)

    UnStemmer.enabled = True

    tfidf_model = TfidfVectorizer(
        tokenizer=meta_data_description_tokenize,
        min_df=min_df,
        max_df=max_df,
        ngram_range=(1, 3),
        lowercase=False,  # done in tokenize function
        stop_words=get_stopwords_list())

    tfidf_model.fit(packages)

    u = UnStemmer.get()

    print("transforming data")

    tfidf_data = tfidf_model.transform(packages)

    print("saving model: ", config.TFIDFModels.description_model_2)
    tfidf_model.vocabulary_ = {
        " ".join(map(lambda x: u.resolve(x), k.split(" "))): v
        for k, v in tfidf_model.vocabulary_.items()
    }
    pickle.dump(tfidf_model, open(config.TFIDFModels.description_model_2,
                                  "wb"))
    print("saved")

    print("saving data: ", config.TFIDFModels.description_data_2)
    save_data = {'ids': packages, 'data': tfidf_data}
    pickle.dump(save_data, open(config.TFIDFModels.description_data_2, "wb"))
    print("saved")
Example #10
def get_tf_idf_testing(train_vocabulary, testing_set, vocabulary=None):
    # tokenize

    vectorizer = TfidfVectorizer(
        stop_words='english',
        analyzer='word',
        smooth_idf=False,
        # max_df=3000,
        # min_df=100,
        ngram_range=(1, 2),
        vocabulary=train_vocabulary,
        # tokenizer=LemmaAndStemTokenizer()
    )

    vectorizer.vocabulary_ = train_vocabulary

    tfidf = vectorizer.fit_transform(testing_set)

    return tfidf


# vectorizer = CountVectorizer(max_features=max_features, binary=binary)
# def get_tf_idf_training(training_set, vocabulary=None):
#
#     tfidf = vectorizer.fit_transform(training_set)
#
#     return tfidf, vectorizer.vocabulary_
#
#
# def get_tf_idf_testing(train_vocabulary, testing_set, vocabulary=None):
#     # tokenize
#
#     vectorizer = TfidfVectorizer(
#         stop_words='english',
#         analyzer='word',
#         vocabulary=train_vocabulary,
#         # tokenizer=LemmaAndStemTokenizer()
#     )
#
#     vectorizer.vocabulary_ = train_vocabulary
#
#     tfidf = vectorizer.fit_transform(testing_set)
#
#     return tfidf
Example #11
def home():

    jobs_data = pandas.read_csv(
        'https://raw.githubusercontent.com/Nexus-404/object_detection_demo/master/data-6.csv'
    )

    query = request.form['query']

    job = list()
    link = list()
    descript = list()

    tf = TfidfVectorizer(analyzer='word',
                         ngram_range=(1, 2),
                         min_df=0,
                         stop_words='english')
    tfidf_matrix = tf.fit_transform(jobs_data['Description'])
    pickle.dump(tfidf_matrix, open("matrix.npy", "wb"))
    pickle.dump(tf.vocabulary_, open("vocabulary.pkl", "wb"))
    pickle.dump(tf.idf_, open("idf.pkl", "wb"))
    vocabulary = pickle.load(open("vocabulary.pkl", "rb"))
    idfs = pickle.load(open("idf.pkl", "rb"))
    tf = TfidfVectorizer(analyzer='word',
                         ngram_range=(1, 2),
                         min_df=0,
                         stop_words='english')
    tf.vocabulary_ = vocabulary
    tf.idf_ = idfs
    tfidf_matrix = np.load("matrix.npy", allow_pickle=True)
    vector = tf.transform([query])
    cos_sim = linear_kernel(tfidf_matrix, vector)
    res = cos_sim[:, 0].argsort()[:-11:-1]

    for i in res:
        job.append(jobs_data['Jobs'][i])
        link.append(jobs_data['Job Url'][i])
        descript.append(jobs_data['Description'][i])

    return render_template("/readpdf.html",
                           jobs=job,
                           links=link,
                           job_description=descript,
                           query=query)
Example #12
def load_model(serialization_dir):
    with open(os.path.join(args.model, "best_hyperparameters.json"), 'r') as f:
        hyperparameters = json.load(f)
    if hyperparameters.pop('stopwords') == 1:
        stop_words = 'english'
    else:
        stop_words = None
    weight = hyperparameters.pop('weight')
    if weight == 'binary':
        binary = True
    else:
        binary = False
    ngram_range = hyperparameters.pop('ngram_range')
    ngram_range = sorted([int(x) for x in ngram_range.split()])
    if weight == 'tf-idf':
        vect = TfidfVectorizer(stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    else:
        vect = CountVectorizer(binary=binary,
                               stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    with open(os.path.join(args.model, "vocab.json"), 'r') as f:
        vocab = json.load(f)
    vect.vocabulary_ = vocab
    hyperparameters['C'] = float(hyperparameters['C'])
    hyperparameters['tol'] = float(hyperparameters['tol'])
    classifier = LogisticRegression(**hyperparameters)
    if os.path.exists(os.path.join(serialization_dir, "archive", "idf.npy")):
        vect.idf_ = np.load(
            os.path.join(serialization_dir, "archive", "idf.npy"))
    classifier.coef_ = np.load(
        os.path.join(serialization_dir, "archive", "coef.npy"))
    classifier.intercept_ = np.load(
        os.path.join(serialization_dir, "archive", "intercept.npy"))
    classifier.classes_ = np.load(
        os.path.join(serialization_dir, "archive", "classes.npy"))
    return classifier, vect
Example #13
def load_model_info(model_info):
    """Return a longform model from a model info JSON object.

    Parameters
    ----------
    model_info : dict
        The JSON object containing the attributes of a model.

    Returns
    -------
    longform_model : py:class:`adeft.classify.AdeftClassifier`
        The classifier that was loaded from the given JSON object.
    """
    shortforms = model_info['shortforms']
    pos_labels = model_info['pos_labels']
    longform_model = AdeftClassifier(shortforms=shortforms,
                                     pos_labels=pos_labels)
    ngram_range = model_info['tfidf']['ngram_range']
    tfidf = TfidfVectorizer(ngram_range=ngram_range, stop_words='english')
    logit = LogisticRegression(multi_class='auto')

    tfidf.vocabulary_ = model_info['tfidf']['vocabulary_']
    tfidf.idf_ = model_info['tfidf']['idf_']
    logit.classes_ = np.array(model_info['logit']['classes_'], dtype='<U64')
    logit.intercept_ = np.array(model_info['logit']['intercept_'])
    logit.coef_ = np.array(model_info['logit']['coef_'])

    estimator = Pipeline([('tfidf', tfidf), ('logit', logit)])
    longform_model.estimator = estimator
    # Load model statistics if they are available
    if 'stats' in model_info:
        longform_model.stats = model_info['stats']
    # Load standard deviations for calculating feature importances
    # if they are available
    if 'std' in model_info:
        longform_model._std = np.array(model_info['std'])
    return longform_model
Example #14
    def train_tfidf(self, tokenizer='custom', corpus='news'):

        if tokenizer == 'custom':
            tokenizer = self.tokenize

        nltk_corpus = []
        if corpus == 'all':
            nltk_corpus += [
                nltk.corpus.gutenberg.raw(f_id)
                for f_id in nltk.corpus.gutenberg.fileids()
            ]
            nltk_corpus += [
                nltk.corpus.webtext.raw(f_id)
                for f_id in nltk.corpus.webtext.fileids()
            ]
            nltk_corpus += [
                nltk.corpus.brown.raw(f_id)
                for f_id in nltk.corpus.brown.fileids()
            ]
            nltk_corpus += [
                nltk.corpus.reuters.raw(f_id)
                for f_id in nltk.corpus.reuters.fileids()
            ]
        elif corpus == 'news':
            nltk_corpus += self.get_bbc_news_corpus()

        if self.verbose:
            print("LENGTH nltk corpus: {}".format(
                sum(len(d) for d in nltk_corpus)))

        vectorizer = TfidfVectorizer(max_df=1.0,
                                     min_df=2,
                                     encoding='utf-8',
                                     decode_error='strict',
                                     max_features=None,
                                     stop_words='english',
                                     ngram_range=(1, 3),
                                     norm='l2',
                                     tokenizer=tokenizer,
                                     use_idf=True,
                                     sublinear_tf=False)

        #vectorizer.fit_transform(nltk_corpus)
        vectorizer.fit(nltk_corpus)
        # Avoid having to pickle instance methods; the tokenizer is restored on load
        vectorizer.tokenizer = None
        keys = np.array(list(vectorizer.vocabulary_.keys()), dtype=str)
        values = np.array(list(vectorizer.vocabulary_.values()), dtype=int)
        stop_words = np.array(list(vectorizer.stop_words_), dtype=str)

        with tables.open_file(self.data_path + 'tfidf_keys.hdf', 'w') as f:
            atom = tables.Atom.from_dtype(keys.dtype)
            ds = f.create_carray(f.root, 'keys', atom, keys.shape)
            ds[:] = keys

        with tables.open_file(self.data_path + 'tfidf_values.hdf', 'w') as f:
            atom = tables.Atom.from_dtype(values.dtype)
            ds = f.create_carray(f.root, 'values', atom, values.shape)
            ds[:] = values

        with tables.open_file(self.data_path + 'tfidf_stop_words.hdf',
                              'w') as f:
            atom = tables.Atom.from_dtype(stop_words.dtype)
            ds = f.create_carray(f.root, 'stop_words', atom, stop_words.shape)
            ds[:] = stop_words

        vectorizer.vocabulary_ = None
        vectorizer.stop_words_ = None

        with open(self.data_path + 'tfidf.pkl', 'wb') as fout:
            pickle.dump(vectorizer, fout)

        vectorizer.vocabulary_ = dict(zip(keys, values))
        vectorizer.stop_words_ = stop_words

        return vectorizer
Example #15
        'ngram_range': (1, 1),
        'norm': 'l2',
        'preprocessor': None,
        'smooth_idf': True,
        'stop_words': 'english',
        'strip_accents': None,
        'sublinear_tf': False,
        'token_pattern': '(?u)\\b\\w\\w+\\b',
        'tokenizer': None,
        'use_idf': True,
        'vocabulary': None
    })

vectorizer.idf_ = np.fromfile('idf.npy')
with open('vocabulary.json') as f:
    vectorizer.vocabulary_ = json.load(f)

n_features = len(vectorizer.idf_)

clf = LogisticRegression(
    **{
        'C': 1.0,
        'class_weight': None,
        'dual': False,
        'fit_intercept': True,
        'intercept_scaling': 1,
        'max_iter': 100,
        'multi_class': 'warn',
        'n_jobs': None,
        'penalty': 'l2',
        'random_state': 0,