Example 1
def recommender(query):

    # load the saved vocabulary, idf vector and precomputed TF-IDF matrix
    #df = pd.read_csv("song_data.csv")
    vocabulary = pickle.load(open("vocabulary.pkl", "rb"))
    idfs = pickle.load(open("idf.pkl", "rb"))
    tfidf_matrix = pickle.load(open("matrix.pkl", "rb"))
    #tfidf_matrix = np.load("tfidf_matrix.npy")

    # rebuild the vectorizer from the saved attributes instead of refitting
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
    tf.vocabulary_ = vocabulary
    tf.idf_ = idfs

    # vectorize the query with the restored vectorizer
    vector = tf.transform([query])

    print(vector.shape)
    print(tfidf_matrix.shape)

    # cosine similarity of the query against every document; top 5 indices
    cos_sim = linear_kernel(tfidf_matrix, vector)
    res = cos_sim[:, 0].argsort()[:-6:-1]

    # map indices back to song names for the final prediction list
    #pred = [df['songname'][i] for i in res]
    print(res)

    return res
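Example 1 covers only the query side: it assumes an earlier run fitted the vectorizer on the song corpus and pickled vocabulary_, idf_ and the document matrix under the file names loaded above. A minimal sketch of that save step, assuming the song_data.csv referenced in the commented-out line has a text column (the column name 'lyrics' is an assumption):

import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit once on the corpus, then persist only what the query side needs.
df = pd.read_csv("song_data.csv")                # corpus file named in the snippet above
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), stop_words='english')
tfidf_matrix = tf.fit_transform(df['lyrics'])    # 'lyrics' column is an assumption

with open("vocabulary.pkl", "wb") as f:
    pickle.dump(tf.vocabulary_, f)
with open("idf.pkl", "wb") as f:
    pickle.dump(tf.idf_, f)
with open("matrix.pkl", "wb") as f:
    pickle.dump(tfidf_matrix, f)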
Example 2
def fdt_setup(request):
    global __model, __generator
    # create a temp model
    __generator = Faker()
    vocab = list(set(__generator.words(500)))
    vectorizer = TfidfVectorizer(vocabulary=vocab)
    vectorizer.idf_ = [randint(1, 15) for i in range(len(vocab))]
    model = {
        "vocab": vocab,
        "stop_words": list(set(__generator.words(10))),
        "intercept": [1],
        "idf_": vectorizer.idf_,
        "lr": [random() for i in range(len(vocab))],
        "vectorizer": [vectorizer]
    }
    np.savez('/tmp/model', **model)

    __model = Loader('/tmp').load_model('model')

    def teardown():
        # reset the module-level globals (without this the assignments
        # would only create locals inside teardown)
        global __model, __generator
        __model = None
        __generator = None
        # delete model file
        os.remove("/tmp/model.npz")
    request.addfinalizer(teardown)
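Loader is project-specific and not shown here. Purely as an assumption about how the archive might be read back (not the project's implementation), a minimal stand-in would reload the .npz into a plain dict; the object-dtype entries, such as the stored vectorizer, require allow_pickle=True:

import numpy as np

class Loader:
    # Minimal stand-in for the project's Loader class (an assumption, not the real code).
    def __init__(self, directory):
        self.directory = directory

    def load_model(self, name):
        # np.savez stored lists and the vectorizer instance as object arrays,
        # so reading them back needs allow_pickle=True.
        with np.load(f"{self.directory}/{name}.npz", allow_pickle=True) as archive:
            return {key: archive[key] for key in archive.files}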
Example 3
    @classmethod
    def from_pretrained(cls, normal_set="default", abb_expander="default"):

        model_dir = DEFAULT_DNORM_PATH

        if not model_dir.parent.is_dir():
            model_dir.parent.mkdir()

        if not model_dir.is_dir():
            model_dir.mkdir()

        if not (model_dir / "normal_set.txt").is_file():
            download_fileobj(BASE_URL + "/normal_set.txt",
                             model_dir / "normal_set.txt")

        if not (model_dir / "W_EHR_all.npz").is_file():
            download_fileobj(BASE_URL + "/W_EHR_all.npz",
                             model_dir / "W_EHR_all.npz")

        if not (model_dir / "EHR_idf3.pkl").is_file():
            download_fileobj(BASE_URL + "/EHR_idf3.pkl",
                             model_dir / "EHR_idf3.pkl")

        if not (model_dir / "abb_dic.csv").is_file():
            download_fileobj(BASE_URL + "/abb_dic.csv",
                             model_dir / "abb_dic.csv")

        mecab = MeCab.Tagger('-Owakati')
        tokenizer = Tokenizer(mecab.parse, lambda s: s[:-1])

        if abb_expander == "default":
            converter = Converter(str(model_dir / "abb_dic.csv")).convert
        elif isinstance(abb_expander, str):
            converter = Converter(abb_expander).convert
        elif callable(abb_expander):
            converter = abb_expander
        else:
            converter = None

        if normal_set == "default":
            normal_set = str(model_dir / "normal_set.txt")

        normal_set = load_normal_set(normal_set)

        tfidf = TfidfVectorizer(analyzer=lambda s: s.split(' '))

        with open(str(model_dir / "EHR_idf3.pkl"), 'rb') as f:
            params = pickle.load(f)
        tfidf.set_params(**params['params'])
        tfidf.vocabulary_ = params['voc']
        tfidf.idf_ = params['idf']
        """
        with open(str(model_dir / "EHR_idf.pkl"), 'rb') as f:
            tfidf = pickle.load(f)
        """

        model = cls(tfidf, normal_set, tokenizer.tokenizer, converter)
        model.load(str(model_dir / "W_EHR_all.npz"))

        return model
Example 4
def test_tfidf_vectorizer_setter():
    orig = TfidfVectorizer(use_idf=True)
    orig.fit(JUNK_FOOD_DOCS)
    copy = TfidfVectorizer(vocabulary=orig.vocabulary_, use_idf=True)
    copy.idf_ = orig.idf_
    assert_array_equal(
        copy.transform(JUNK_FOOD_DOCS).toarray(),
        orig.transform(JUNK_FOOD_DOCS).toarray())
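The test above shows the pattern in its smallest form: pass the fitted vocabulary_ to the constructor and assign idf_ afterwards. A self-contained sketch of the same round trip through JSON, with a toy corpus standing in for JUNK_FOOD_DOCS (the corpus and file name are illustrative, and it assumes a scikit-learn version that exposes the idf_ setter, as the test itself does):

import json

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the pizza burger beer", "the the pizza beer beer", "the burger beer"]  # toy corpus

orig = TfidfVectorizer(use_idf=True)
orig.fit(docs)

# Persist only the two fitted attributes, e.g. as JSON.
state = {
    "vocabulary": {term: int(index) for term, index in orig.vocabulary_.items()},
    "idf": orig.idf_.tolist(),
}
with open("tfidf_state.json", "w") as f:
    json.dump(state, f)

# Rebuild an equivalent vectorizer without refitting or pickling the whole object.
with open("tfidf_state.json") as f:
    state = json.load(f)
restored = TfidfVectorizer(vocabulary=state["vocabulary"], use_idf=True)
restored.idf_ = np.array(state["idf"])

assert np.allclose(orig.transform(docs).toarray(),
                   restored.transform(docs).toarray())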
Example 5
File: q53.py Project: simaki/nlp100
def get_fitted_tv():
    with open(project_path / "output/news/tv_vocabulary_.txt") as f:
        vocabulary_ = {
            line.split()[0]: int(line.split()[1])
            for line in f.readlines()
        }
    with open(project_path / "output/news/tv_stop_words_.txt") as f:
        # strip trailing newlines so the stop words actually match tokens
        stop_words_ = [line.strip() for line in f]
    idf_ = np.loadtxt(project_path / "output/news/tv_idf_.txt")

    tv = TfidfVectorizer(stop_words=stop_words_, vocabulary=vocabulary_)
    tv.idf_ = idf_

    return tv
Example 6
def load_model_info(model_info):
    """Return a longform model from a model info JSON object.

    Parameters
    ----------
    model_info : dict
        The JSON object containing the attributes of a model.

    Returns
    -------
    longform_model : py:class:`adeft.classify.AdeftClassifier`
        The classifier that was loaded from the given JSON object.
    """
    shortforms = model_info['shortforms']
    pos_labels = model_info['pos_labels']
    longform_model = AdeftClassifier(shortforms=shortforms,
                                     pos_labels=pos_labels)
    ngram_range = model_info['tfidf']['ngram_range']
    tfidf = TfidfVectorizer(ngram_range=ngram_range, stop_words='english')
    logit = LogisticRegression(multi_class='auto')

    tfidf.vocabulary_ = model_info['tfidf']['vocabulary_']
    tfidf.idf_ = model_info['tfidf']['idf_']
    logit.classes_ = np.array(model_info['logit']['classes_'], dtype='<U64')
    logit.intercept_ = np.array(model_info['logit']['intercept_'])
    logit.coef_ = np.array(model_info['logit']['coef_'])

    estimator = Pipeline([('tfidf', tfidf), ('logit', logit)])
    longform_model.estimator = estimator
    # These attributes do not exist in older adeft models.
    # For backwards compatibility we check if they are present
    if 'stats' in model_info:
        longform_model.stats = model_info['stats']
    if 'std' in model_info:
        longform_model._std = np.array(model_info['std'])
    if 'timestamp' in model_info:
        longform_model.timestamp = model_info['timestamp']
    if 'training_set_digest' in model_info:
        longform_model.training_set_digest = model_info['training_set_digest']
    if 'params' in model_info:
        longform_model.params = model_info['params']
    if 'version' in model_info:
        longform_model.version = model_info['version']
    if 'confusion_info' in model_info:
        longform_model.confusion_info = model_info['confusion_info']
    if 'other_metadata' in model_info:
        longform_model.other_metadata = model_info['other_metadata']
    return longform_model
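load_model_info only covers deserialization. A hedged sketch of the inverse step, gathering exactly the keys the function reads back (the helper name dump_model_info and its signature are made up here; this is not adeft's serialization API):

def dump_model_info(estimator, shortforms, pos_labels):
    # estimator is the Pipeline([('tfidf', ...), ('logit', ...)]) built above.
    tfidf = estimator.named_steps['tfidf']
    logit = estimator.named_steps['logit']
    return {
        'shortforms': shortforms,
        'pos_labels': pos_labels,
        'tfidf': {
            'ngram_range': list(tfidf.ngram_range),
            'vocabulary_': {term: int(index)
                            for term, index in tfidf.vocabulary_.items()},
            'idf_': tfidf.idf_.tolist(),
        },
        'logit': {
            'classes_': logit.classes_.tolist(),
            'intercept_': logit.intercept_.tolist(),
            'coef_': logit.coef_.tolist(),
        },
    }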
Example 7
def home():

    jobs_data = pandas.read_csv(
        'https://raw.githubusercontent.com/Nexus-404/object_detection_demo/master/data-6.csv'
    )

    query = request.form['query']

    job = list()
    link = list()
    descript = list()

    tf = TfidfVectorizer(analyzer='word',
                         ngram_range=(1, 2),
                         min_df=0,
                         stop_words='english')
    tfidf_matrix = tf.fit_transform(jobs_data['Description'])
    pickle.dump(tfidf_matrix, open("matrix.npy", "wb"))
    pickle.dump(tf.vocabulary_, open("vocabulary.pkl", "wb"))
    pickle.dump(tf.idf_, open("idf.pkl", "wb"))
    vocabulary = pickle.load(open("vocabulary.pkl", "rb"))
    idfs = pickle.load(open("idf.pkl", "rb"))
    tf = TfidfVectorizer(analyzer='word',
                         ngram_range=(1, 2),
                         min_df=0,
                         stop_words='english')
    tf.vocabulary_ = vocabulary
    tf.idf_ = idfs
    tfidf_matrix = np.load("matrix.npy", allow_pickle=True)
    vector = tf.transform([query])
    cos_sim = linear_kernel(tfidf_matrix, vector)
    res = cos_sim[:, 0].argsort()[:-11:-1]

    for i in res:
        job.append(jobs_data['Jobs'][i])
        link.append(jobs_data['Job Url'][i])
        descript.append(jobs_data['Description'][i])

    return render_template("/readpdf.html",
                           jobs=job,
                           links=link,
                           job_description=descript,
                           query=query)
Example 8
def load_model(serialization_dir):
    with open(os.path.join(args.model, "best_hyperparameters.json"), 'r') as f:
        hyperparameters = json.load(f)
    if hyperparameters.pop('stopwords') == 1:
        stop_words = 'english'
    else:
        stop_words = None
    weight = hyperparameters.pop('weight')
    if weight == 'binary':
        binary = True
    else:
        binary = False
    ngram_range = hyperparameters.pop('ngram_range')
    ngram_range = tuple(sorted(int(x) for x in ngram_range.split()))
    if weight == 'tf-idf':
        vect = TfidfVectorizer(stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    else:
        vect = CountVectorizer(binary=binary,
                               stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    with open(os.path.join(args.model, "vocab.json"), 'r') as f:
        vocab = json.load(f)
    vect.vocabulary_ = vocab
    hyperparameters['C'] = float(hyperparameters['C'])
    hyperparameters['tol'] = float(hyperparameters['tol'])
    classifier = LogisticRegression(**hyperparameters)
    if os.path.exists(os.path.join(serialization_dir, "archive", "idf.npy")):
        vect.idf_ = np.load(
            os.path.join(serialization_dir, "archive", "idf.npy"))
    classifier.coef_ = np.load(
        os.path.join(serialization_dir, "archive", "coef.npy"))
    classifier.intercept_ = np.load(
        os.path.join(serialization_dir, "archive", "intercept.npy"))
    classifier.classes_ = np.load(
        os.path.join(serialization_dir, "archive", "classes.npy"))
    return classifier, vect
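As with the other loaders on this page, the save side is not shown. A hedged sketch of a matching save step that writes the file names and layout load_model reads (the helper save_model and its signature are assumptions, not code from the original project; it also assumes args.model and serialization_dir point at the same directory, since the loader mixes the two):

import json
import os

import numpy as np

def save_model(serialization_dir, vect, classifier, hyperparameters):
    # Mirror the layout read back by load_model above.
    archive = os.path.join(serialization_dir, "archive")
    os.makedirs(archive, exist_ok=True)
    with open(os.path.join(serialization_dir, "best_hyperparameters.json"), "w") as f:
        json.dump(hyperparameters, f)
    with open(os.path.join(serialization_dir, "vocab.json"), "w") as f:
        json.dump({term: int(index) for term, index in vect.vocabulary_.items()}, f)
    if hasattr(vect, "idf_"):  # only present for a fitted TfidfVectorizer
        np.save(os.path.join(archive, "idf.npy"), vect.idf_)
    np.save(os.path.join(archive, "coef.npy"), classifier.coef_)
    np.save(os.path.join(archive, "intercept.npy"), classifier.intercept_)
    np.save(os.path.join(archive, "classes.npy"), classifier.classes_)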
Example 9
def load_model_info(model_info):
    """Return a longform model from a model info JSON object.

    Parameters
    ----------
    model_info : dict
        The JSON object containing the attributes of a model.

    Returns
    -------
    longform_model : py:class:`adeft.classify.AdeftClassifier`
        The classifier that was loaded from the given JSON object.
    """
    shortforms = model_info['shortforms']
    pos_labels = model_info['pos_labels']
    longform_model = AdeftClassifier(shortforms=shortforms,
                                     pos_labels=pos_labels)
    ngram_range = model_info['tfidf']['ngram_range']
    tfidf = TfidfVectorizer(ngram_range=ngram_range, stop_words='english')
    logit = LogisticRegression(multi_class='auto')

    tfidf.vocabulary_ = model_info['tfidf']['vocabulary_']
    tfidf.idf_ = model_info['tfidf']['idf_']
    logit.classes_ = np.array(model_info['logit']['classes_'], dtype='<U64')
    logit.intercept_ = np.array(model_info['logit']['intercept_'])
    logit.coef_ = np.array(model_info['logit']['coef_'])

    estimator = Pipeline([('tfidf', tfidf), ('logit', logit)])
    longform_model.estimator = estimator
    # Load model statistics if they are available
    if 'stats' in model_info:
        longform_model.stats = model_info['stats']
    # Load standard deviations for calculating feature importances
    # if they are available
    if 'std' in model_info:
        longform_model._std = np.array(model_info['std'])
    return longform_model
Example 10
def predict_label(signatures, prob3_path, vocabulary_path, idf_path,
                  cluster_centers_path, clasnum2labels_path):
    func_names = []
    func_types = []
    func_comments = []
    for name, types, comments in (
            line.strip().split("|") for line in signatures):
        func_types.append(types.split(","))
        func_comments.append(comments.split(","))
        func_names.append(name)

    def tokenize_funcs(funcs):
        oneword = re.compile(r"^[a-z][a-z0-9]+|[A-Z][A-Z0-9]$")
        difCase = re.compile(
            r".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)")
        under_scores_split = re.compile(r"_")

        tokenized_funcs = []
        for f in funcs:
            if oneword.fullmatch(f):
                tokenized_funcs.append([f])
            elif "_" in f:
                tokenized_funcs.append(
                    [w for w in under_scores_split.split(f) if w])
            else:
                tokenized_funcs.append(
                    [w.group(0) for w in difCase.finditer(f) if w.group(0)])
        return tokenized_funcs

    tokenized_func_names = tokenize_funcs(func_names)
    tokenized_func_names = [
        tok_name + tok_comm
        for tok_name, tok_comm in zip(tokenized_func_names, func_comments)
    ]

    def drop_wrong_symbols(tokenized_func_names):
        # first approach to drop all digits, second only if > 50%
        wrong_char = re.compile(r"[\d]")
        tokenized_func_names_ = []
        for tokenized_name in tokenized_func_names:
            processed_tokens = [
                wrong_char.sub("", token).lower() for token in tokenized_name
                if wrong_char.sub("", token)
            ]
            tokenized_func_names_.append(processed_tokens)

        return tokenized_func_names_

    tokenized_func_names = drop_wrong_symbols(tokenized_func_names)

    with open(prob3_path, "rb") as f:
        prob3 = pickle.load(f)

    def split(word, start=1, end=20):
        return ((word[:i], word[i:])
                for i in range(start, min(len(word) + 1, end)))

    @functools.lru_cache(maxsize=10000)
    def segment(word, maxlen=500):
        if not word:
            return []
        if len(word) > maxlen:
            return segment(word[:maxlen]) + segment(word[maxlen:])
        candidates = ([first] + segment(remaining)
                      for first, remaining in split(word))
        return max(
            candidates,
            key=lambda x: functools.reduce(operator.__mul__, map(prob3, x), 1))

    def segmentize_corpus(tokenized_func_names, segmenter):
        tokenized_func_names = [
            list(it.chain(*(segmenter(token) for token in tokens)))
            for tokens in tokenized_func_names
        ]
        return tokenized_func_names

    tokenized_func_names = segmentize_corpus(tokenized_func_names, segment)

    def lemmatize_corpus(tokenized_func_names):
        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
        lengths = np.cumsum([0] + list(map(len, tokenized_func_names)))
        flat_tokens = list(it.chain(*tokenized_func_names))
        doc = spacy.tokens.Doc(nlp.vocab, words=flat_tokens)
        tokenized_func_names = [
            [token.lemma_ for token in doc[lengths[i - 1]:lengths[i]]]
            for i in range(1, len(tokenized_func_names) + 1)
        ]
        return tokenized_func_names

    tokenized_func_names = lemmatize_corpus(tokenized_func_names)

    with open(vocabulary_path, "rb") as f:
        vocab = pickle.load(f)

    def prune_names(tokenized_func_names, vocab):
        tokenized_func_names_ = []
        for tokenized_name in tokenized_func_names:
            processed_tokens = [
                token for token in tokenized_name if token in vocab
            ]
            tokenized_func_names_.append(processed_tokens)
        return tokenized_func_names_

    tokenized_func_names = prune_names(tokenized_func_names, set(vocab.keys()))

    def tokenize_types(func_types):
        type_set = {
            "int", "unsigned int", "char", "unsigned char", "enum", "struct",
            "void", "long", "unsigned long", "float", "double", "short",
            "unsigned short", "bool", "union", "long long",
            "unsigned long long"
        }
        type_dict = {re.compile(t): t for t in type_set}
        re_drop = re.compile(r"\*|restrict|const")
        struct_type = re.compile("struct")
        tokenized_types = [[0 for _ in range(len(f_types))]
                           for f_types in func_types]
        for i, f_types in enumerate(func_types):
            for j, type in enumerate(f_types):
                cleaned_type = re_drop.sub("", type)
                for re_t, t in type_dict.items():
                    if re.search(re_t, cleaned_type):
                        tokenized_types[i][j] = t
                        break
                else:
                    tokenized_types[i][j] = type_dict[struct_type]
        return tokenized_types

    tokenized_func_types = tokenize_types(func_types)

    tokenized_features = [
        tok_name + tok_types for tok_name, tok_types in zip(
            tokenized_func_names, tokenized_func_types)
    ]
    idf = np.load(idf_path)
    tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x,
                                       lowercase=False,
                                       sublinear_tf=True,
                                       vocabulary=vocab)
    tfidf_vectorizer.idf_ = idf
    tfidf_matrix = tfidf_vectorizer.transform(tokenized_features)

    centers = np.load(cluster_centers_path)
    model = KMeans(centers.shape[0])
    model._n_threads = 1
    model.cluster_centers_ = centers
    cluster_nums = model.predict(tfidf_matrix)

    with open(clasnum2labels_path, "rb") as f:
        clasnum2labels = pickle.load(f)

    labels = [clasnum2labels[c] for c in cluster_nums]
    return {n: l for n, l in zip(func_names, labels) if l != "unknown"}
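A usage sketch for predict_label: the 'name|types|comments' line format is inferred from the parsing at the top of the function, and every path below is a placeholder for an artifact produced by a prior training run (none of these file names come from the original):

# Each signature line: function name | comma-separated parameter types | comma-separated comment tokens
signatures = [
    "read_file_size|unsigned long,char *|file,size",
    "initNetworkSocket|int,struct sockaddr *|socket,address",
]
labels = predict_label(
    signatures,
    prob3_path="prob3.pkl",                      # word-probability model used for segmentation
    vocabulary_path="vocabulary.pkl",            # fitted TF-IDF vocabulary
    idf_path="idf.npy",                          # fitted idf vector
    cluster_centers_path="cluster_centers.npy",  # fitted KMeans centers
    clasnum2labels_path="clasnum2labels.pkl",    # cluster id -> label mapping
)
print(labels)  # functions whose predicted label is not "unknown"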
Example 11
        'max_features': None,
        'min_df': 1,
        'ngram_range': (1, 1),
        'norm': 'l2',
        'preprocessor': None,
        'smooth_idf': True,
        'stop_words': 'english',
        'strip_accents': None,
        'sublinear_tf': False,
        'token_pattern': '(?u)\\b\\w\\w+\\b',
        'tokenizer': None,
        'use_idf': True,
        'vocabulary': None
    })

vectorizer.idf_ = np.load('idf.npy')
with open('vocabulary.json') as f:
    vectorizer.vocabulary_ = json.load(f)

n_features = len(vectorizer.idf_)

clf = LogisticRegression(
    **{
        'C': 1.0,
        'class_weight': None,
        'dual': False,
        'fit_intercept': True,
        'intercept_scaling': 1,
        'max_iter': 100,
        'multi_class': 'warn',
        'n_jobs': None,