import string

import nltk
import numpy as np
import torch
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Pickle_model is a project-specific helper that wraps pickle loading;
# it is assumed to be importable from the surrounding codebase.


class Preprocessor_controller:
    def __init__(self, vectorizer_path, device='cpu'):
        self.vectorizer_path = vectorizer_path
        self.device = device

        self.__init_stemmer()
        self.__init_vectorizer()

    def get_vectorizer_vocab_size(self):
        return len(self.vectorizer.vocabulary)

    def __init_stemmer(self):
        nltk.download('stopwords')

        stop = set(stopwords.words('english'))
        punctuation = list(string.punctuation)
        stop.update(punctuation)

        self.stopwords = stop
        self.stemmer = PorterStemmer()

    def __init_vectorizer(self):
        pickle_model = Pickle_model()

        vocabulary = pickle_model.get_pickle(path=self.vectorizer_path)
        self.vectorizer = CountVectorizer(stop_words='english', vocabulary=vocabulary)
        self.vectorizer._validate_vocabulary()

    def __stem_text(self, text):
        final_text = [self.stemmer.stem(word.strip()) for word in text.split() if word.strip().lower() not in self.stopwords]

        return " ".join(final_text)

    def __vectorize_text(self, text):
        return self.vectorizer.transform(text)

    def __csr_to_tensor(self, csr):
        coo = csr.tocoo()
        values = coo.data
        indices = np.vstack((coo.row, coo.col))

        i = torch.LongTensor(indices)
        v = torch.FloatTensor(values)
        shape = coo.shape

        # Build the sparse tensor; torch.sparse_coo_tensor is the non-deprecated
        # replacement for the legacy torch.sparse.FloatTensor constructor.
        return torch.sparse_coo_tensor(i, v, torch.Size(shape))

    def transform(self, text_articles, article_titles):
        texts = text_articles if isinstance(text_articles, list) else [text_articles]
        titles = article_titles if isinstance(article_titles, list) else [article_titles]

        inputs = [title + " " + body for (title, body) in zip(titles, texts)]

        stemmed_text = [self.__stem_text(text) for text in inputs]
        print("Stemmed_text: {}".format(stemmed_text))
        vector = self.__vectorize_text(stemmed_text)
        print("Vector: {}".format(vector))
        tensor = self.__csr_to_tensor(vector)
        print("Tensor: {}".format(tensor))
        return tensor.to(self.device)
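A minimal usage sketch of the controller above (the vocabulary file name below is a placeholder; it must point to a pickled CountVectorizer vocabulary produced by the same project, and Pickle_model must be importable):

preprocessor = Preprocessor_controller(vectorizer_path="vectorizer_vocab.pkl", device="cpu")
sparse_input = preprocessor.transform(
    text_articles="The central bank raised interest rates again this quarter.",
    article_titles="Rates climb",
)
print(sparse_input.shape)  # 1 x vocab_size sparse tensor, ready for a PyTorch model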
Example #2
def app():
    st.title('Insert your inspection text and make predictions')

    st.header('Do you want to know the risk category of the inspected restaurant?')

    text = st.text_area('Insert your violation description here to see the predicted risk category (short text, suggested max 50 chars):', max_chars=50)
    
    st.write('You have inserted the following text: ', text)
    if text != '':
        with open('source/dict', "rb") as f:
            vocabulary_to_load = pickle.load(f)
            loaded_vectorizer = CountVectorizer(vocabulary=vocabulary_to_load)
            loaded_vectorizer._validate_vocabulary()
            

            with open('source/risk_prediction.pkl', 'rb') as file:
                model = pickle.load(file)

                prediction = model.predict(loaded_vectorizer.transform([text]))  
                if prediction == 1.0:
                    output='Low Risk' 
                elif prediction == 2.0:
                    output='Moderate Risk' 
                else:
                    output='High Risk' 
                st.write('Predicted RISK CATEGORY: ',output)
                if prediction == 1.0:
                    st.image(green, use_column_width=True)
                elif prediction == 2.0:
                    st.image(orange, use_column_width=True,width=70) 
                else:
                    st.image(red, use_column_width=True)
    else:
        st.image(image, use_column_width=True)
    def transform(self):
        logging.info('NewsgroupsData transform')

        self._y = self._newsgroups.target

        word_count_vectorizer = CountVectorizer(vocabulary=self._dictionary,
                                                stop_words='english')
        word_count_vectorizer._validate_vocabulary()

        # feature selection
        # To save calculation time, this is done only on the default train data
        # (not on the train data at each cv fold)
        x_word_count_train = word_count_vectorizer.transform(
            self._newsgroups_train.data).toarray()
        feature_scorer = WordFeaturesScore(x_word_count_train, self._y)
        feature_scorer.scorer(self.feature_score)

        # normalize data
        x_word_count = word_count_vectorizer.transform(
            self._newsgroups.data).toarray()
        normalizer = DocumentNormalizer(
            feature_scorer.get_rank_ids(self.feature_number))
        self._x = normalizer.normalize(self.normalize, x_word_count,
                                       **self.kwargs_normalize)

        # binary class binding
        positive_label_indices = {
            self._class_names.index(label)
            for label in self.positive_labels
        }
        self._y = np.fromiter((x in positive_label_indices for x in self._y),
                              dtype=int)

        return self
Example #4
def classify_reviews():
	"""
	Classify all the reviews
	"""
	data = request.get_json(force=True)

	clf, vocabulary = load_clf_and_vocabulary(data['classifier'], data['vocabModel'], data['tfIdf'], False)
	vect = CountVectorizer(vocabulary=vocabulary)
	vect._validate_vocabulary()

	y = np.array(data['ratings'])

	if data['classifier'] == 'LR':
		X = vect.transform(data['reviews'])
		y_pred = clf.predict(X)
	elif data['classifier'] == 'SVM' or data['classifier'] == 'MLP':
		y_pred = clf.predict(data['reviews'])

	accuracy = accuracy_score(y, y_pred)

	print('Accuracy:', accuracy * 100)

	return jsonify({
		'accuracy': accuracy,
		'predicted_label': y_pred.tolist()
	})
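For reference, a hedged client-side sketch of the JSON body this endpoint expects (field names come from the handler above; the route URL, classifier choice, reviews and ratings are illustrative):

import requests

payload = {
    "classifier": "LR",
    "vocabModel": "LR",
    "tfIdf": False,
    "reviews": ["great product, would buy again", "terrible, broke after a day"],
    "ratings": [1, 0],
}
response = requests.post("http://localhost:5000/classify_reviews", json=payload)
print(response.json())  # {'accuracy': ..., 'predicted_label': [...]}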
def loadvoacb(vectusing, modelusing, pickfrom, sample_text):
    '''
    Function to load the trained vocabulary.
    Arguments:
      vectusing: choice of vectorizer
      modelusing: choice of model that needs to be applied
      pickfrom: path to the pickled models to apply to new text for topic modelling
      sample_text: text for topic modelling
    Returns: the sample text given for topic modelling and the predicted label
    '''

    # assign models pickled filename to filename for accessing based on model parameter
    if modelusing == 'LRL1':
        modelfile = 'LogisticRegressionL1.pkl'
    elif modelusing == 'LRL2':
        modelfile = 'LogisticRegressionL2.pkl'
    elif modelusing == 'NB':
        modelfile = 'NaiveBayes.pkl'
    elif modelusing == 'RF':
        modelfile = 'RandomForest.pkl'

    #create filename using modelpath and modelfile
    filename = pickfrom + modelfile
    print(filename)
    # Load the model from the file
    model_from_joblib = joblib.load(filename)

    # Get the configuration settings to read URLs and symbols
    config = util.get_config()

    data = config.get('Interim', 'Interim1')
    print('Pick cleaned data from : ', data)

    saveto = config.get('Modelpath', 'saveto')
    print('Save model to : ', saveto)

    # creates and pickles trained vocabulary
    new_vectorizer(data, vectusing)
    if vectusing == 'CV':
        filename = saveto + vectusing
        trainedvectvoacb = pickle.load(open(filename, 'rb'))
        #reloading trained vocabulary
        loadedvect = CountVectorizer(vocabulary=trainedvectvoacb)
    elif vectusing == 'TFIDF':
        filename = saveto + vectusing
        trainedvectvoacb = pickle.load(open(filename, 'rb'))
        #reloading trained vocabulary
        loadedvect = TfidfVectorizer(vocabulary=trainedvectvoacb)

    # validating the pickled vocabulary
    loadedvect._validate_vocabulary()
    # transform the new text using the loaded vocabulary (for applying the trained model)
    newtestvect = loadedvect.transform(sample_text)

    # Use the loaded model to make predictions for sample test
    label_predictions = model_from_joblib.predict(newtestvect)
    print(label_predictions, sample_text)
    return (label_predictions, sample_text)
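An illustrative call of the loader above, assuming the project's config files and pickled models are in place (the model directory and sample text are placeholders):

labels, text = loadvoacb(
    vectusing='CV',
    modelusing='NB',
    pickfrom='models/',          # hypothetical directory containing NaiveBayes.pkl
    sample_text=['the council approved a new budget for public parks'],
)
print(labels)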
def get_information(comment):
    L2  = ['N', 'S']
    vocab = pickle.load(open('MBTI_Vocab.pkl', 'rb'))
    loaded_vectorizer = CountVectorizer(vocabulary=vocab)
    loaded_vectorizer._validate_vocabulary()
    logreginfo = pickle.load(open('information.pkl', 'rb'))
    X = comment
    array = loaded_vectorizer.fit_transform([X]).toarray()
    information = logreginfo.predict(array)
    return "N" if information.tolist().pop()==0 else "S"
def get_decision(comment):
    L3  = ['T', 'F']
    vocab = pickle.load(open('MBTI_Vocab.pkl', 'rb'))
    loaded_vectorizer = CountVectorizer(vocabulary=vocab)
    loaded_vectorizer._validate_vocabulary()
    logregdecision = pickle.load(open('decision.pkl', 'rb'))
    X = comment
    array = loaded_vectorizer.fit_transform([X]).toarray()
    decision = logregdecision.predict(array)
    return "T" if decision.tolist().pop()==0 else "F"
def get_structure(comment):
    L4  = ['J', 'P']
    vocab = pickle.load(open('MBTI_Vocab.pkl', 'rb'))
    loaded_vectorizer = CountVectorizer(vocabulary=vocab)
    loaded_vectorizer._validate_vocabulary()
    logregstructure = pickle.load(open('structure.pkl', 'rb'))
    X = comment
    array = loaded_vectorizer.fit_transform([X]).toarray()
    structure = logregstructure.predict(array)
    return "J" if structure.tolist().pop()==0 else "P"
def get_favorite_world(comment):
    L1 = ['I', 'E']
    vocab = pickle.load(open('MBTI_Vocab.pkl', 'rb'))
    loaded_vectorizer = CountVectorizer(vocabulary=vocab)
    loaded_vectorizer._validate_vocabulary()
    logregfw = pickle.load(open('favoriteworld.pkl', 'rb'))
    X = comment
    array = loaded_vectorizer.fit_transform([X]).toarray()
    favoriteworld = logregfw.predict(array)
    return "I" if favoriteworld.tolist().pop()==0 else "E"
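The four helpers above each predict one MBTI letter from the same pickled vocabulary and separate pickled models; a small convenience wrapper (hypothetical, not part of the original snippet) can combine them into a full four-letter type:

def get_mbti_type(comment):
    # Concatenate the four independent binary predictions, e.g. "INTJ".
    return (get_favorite_world(comment)
            + get_information(comment)
            + get_decision(comment)
            + get_structure(comment))

# print(get_mbti_type("I enjoy quiet evenings planning my week in detail."))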
Example #10
def ValuePredictor(to_predict_list):
    to_predict = (to_predict_list)
    vocabulary_to_load = pickle.load(open("./model_vocab/vocab.pickle", 'rb'))
    count_vect = CountVectorizer(vocabulary=vocabulary_to_load)
    load_model = pickle.load(open("./model_vocab/model.pickle", 'rb'))
    count_vect._validate_vocabulary()
    tfidf_transformer = tf_idf(categories)[0]
    X_new_counts = count_vect.transform([to_predict])
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    predicted = load_model.predict(X_new_tfidf)
    return predicted[0]
def fetch_values_vectorized(subset="train", data_home=None):
    data_home = get_data_home(data_home=data_home)
    filebase = 'values_vectorized'
    target_file = _pkl_filepath(data_home, filebase + ".pkl")

    data_train = fetch_values(data_home=data_home,
                                    subset='train',
                                    categories=None,
                                    shuffle=True,
                                    random_state=12)

    data_test = fetch_values(data_home=data_home,
                                   subset='test',
                                   categories=None,
                                   shuffle=True,
                                   random_state=12)

    if os.path.exists(target_file):
        X_train, X_test = joblib.load(target_file)
    else:
        spanish_stemmer = SnowballStemmer('spanish')
        non_words = list(punctuation)  
        non_words.extend(['¿', '¡'])  
        non_words.extend(map(str,range(10)))
        spanish_stopwords = stopwords.words('spanish')
        vectorizer = CountVectorizer(dtype=np.int16, lowercase=True, stop_words=spanish_stopwords, strip_accents='unicode')
        vectorizer._validate_vocabulary()
        X_train = vectorizer.fit_transform(data_train.data).tocsr()
        X_test = vectorizer.transform(data_test.data).tocsr()
        joblib.dump((X_train, X_test), target_file, compress=9)

    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)
    normalize(X_train, copy=False)
    normalize(X_test, copy=False)

    target_names.extend(data_train.target_names)

    if subset == "train":
        data = X_train
        target = data_train.target
    elif subset == "test":
        data = X_test
        target = data_test.target
    elif subset == "all":
        data = sp.vstack((X_train, X_test)).tocsr()
        target = np.concatenate((data_train.target, data_test.target))
    else:
        raise ValueError("%r is not a valid subset: should be one of "
                         "['train', 'test', 'all']" % subset)

    return Bunch(data=data, target=target, target_names=target_names)
def predict(data):

    # Load trained model
    file_name = "trained_model.sav"
    trained_model = pickle.load(open(file_name, 'rb'))

    # Load vocabulary
    dictionary_filepath = "vocabulary.sav"
    loaded_vocabulary = pickle.load(open(dictionary_filepath, 'rb'))

    count_vect = CountVectorizer(vocabulary=loaded_vocabulary)
    count_vect._validate_vocabulary()

    return trained_model.predict(count_vect.transform([data]))[0]
Example #13
def merge_vectorizers(vectorizers, preprocessor, tokenizer):
    merged_voca = set(
        word for vectorizer in vectorizers for word in vectorizer.vocabulary_.keys()
    )
    v = CountVectorizer(
        input="filename",
        preprocessor=preprocessor,
        tokenizer=tokenizer,
        vocabulary=dict(
            (word, idx) for word, idx in zip(merged_voca, range(len(merged_voca)))
        ),
    )
    v._validate_vocabulary()
    return v
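A short sketch of how merge_vectorizers might be used, assuming two vectorizers already fitted on separate corpora (the corpora below are toy strings; preprocessor and tokenizer are simply forwarded to the merged CountVectorizer):

from sklearn.feature_extraction.text import CountVectorizer

v1 = CountVectorizer().fit(["spam detection with count vectors"])
v2 = CountVectorizer().fit(["merging vocabularies across models"])
merged = merge_vectorizers([v1, v2], preprocessor=None, tokenizer=None)
print(sorted(merged.vocabulary))  # union of both vocabularies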
Example #14
def lda(data, name: str):

    tf_ModelPath = os.path.join('usermodel/' + name + '/model',
                                'tfVector.model')  # save the term-frequency model
    lda_ModelPath = os.path.join('usermodel/' + name + '/model',
                                 'ldaModels.model')  # save the trained LDA models
    bestModelPath = os.path.join('usermodel/' + name + '/model',
                                 'bestLDAModel.model')
    tf_vectorizer = CountVectorizer(
        max_df=0.95,
        min_df=2,
    )
    tf = tf_vectorizer.fit_transform(data)

    lda_models = []
    for idx, n_topic in enumerate(n_topics):
        # `n_topics` was renamed to `n_components` in newer scikit-learn releases.
        lda = LatentDirichletAllocation(n_components=n_topic,
                                        max_iter=8000,
                                        learning_method='batch',
                                        evaluate_every=200,
                                        perp_tol=0.01)
        t0 = time()
        lda.fit(tf)
        perplexityLst[idx] = lda.perplexity(tf)
        lda_models.append(lda)
        print("# of Topic: %d, " % n_topics[idx], end=' ')
        print("done in %0.3fs, N_iter %d, " % ((time() - t0), lda.n_iter_),
              end=' ')
        print("Perplexity Score %0.3f" % perplexityLst[idx])

    # print the best model
    best_index = perplexityLst.index(min(perplexityLst))
    best_n_topic = n_topics[best_index]
    best_model = lda_models[best_index]
    print("Best # of Topic: ", best_n_topic)
    print("Best Model: ")

    # save the LDA model trained for each n_topics value so it can be inspected later
    joblib.dump(tf_vectorizer, tf_ModelPath)
    joblib.dump(lda_models, lda_ModelPath)
    joblib.dump(best_model, bestModelPath)

    # save and print the topic-word distribution
    print("#########Topic-Word Distribution#########")
    tf_vectorizer._validate_vocabulary()
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(best_model, tf_feature_names, n_top_words, name)
    #print(docres)
    #joblib.dump(tf_vectorizer, tf_ModelPath)
    return best_model, tf_vectorizer
Example #15
def svd2():
    req_data = request.get_json()
    textDocument = req_data['textDocument']
    text = stemDocument(textDocument)
    count_vectorizer = CountVectorizer()
    data = count_vectorizer.fit_transform(text).toarray()
    count_vectorizer._validate_vocabulary()
    featurenames = count_vectorizer.get_feature_names()
    tfidf = TfidfTransformer()
    tfidfMatrix = tfidf.fit_transform(data)
    svd = TruncatedSVD(n_components=30)
    svdMatrix = svd.fit_transform(tfidfMatrix)
    print(svdMatrix)
    return json.dumps(svdMatrix.tolist())
Example #16
def predict():
    size = 1
    model = open("./saved_models/model.pkl", 'rb')
    clf = joblib.load(model)
    vocab = open("./saved_models/vocab.pkl", 'rb')
    vocabulary = joblib.load(vocab)
    loaded_vectorizer = CountVectorizer(ngram_range=(size, size),
                                        min_df=1,
                                        vocabulary=vocabulary)
    loaded_vectorizer._validate_vocabulary()
    message = request.form['message']
    data = [message]
    vect = loaded_vectorizer.transform(data).toarray()
    prediction = clf.predict(vect)
    return render_template('result.html', prediction=prediction)
def predict():
    ngram_size = 1
    nb_spam_model = open("./pretrainedModel/NB_spam_model.pkl", 'rb')
    clf = joblib.load(nb_spam_model)

    dictionary_filepath = open("./pretrainedModel/vocab.pkl", 'rb')
    vocabulary_to_load = joblib.load(dictionary_filepath)
    loaded_vectorizer = CountVectorizer(ngram_range=(ngram_size, ngram_size),
                                        min_df=1,
                                        vocabulary=vocabulary_to_load)
    loaded_vectorizer._validate_vocabulary()

    message = request.form['message']
    data = [message]
    vect = loaded_vectorizer.transform(data).toarray()
    my_prediction = clf.predict(vect)
    return render_template('result.html', prediction=my_prediction)
Example #18
def main():
    corpus = open("corpus/wikicorpus.txt", "r", encoding='UTF-8')
    articles_str = ""
    for line in corpus:
        if re.search(r'<article name="', line):
            no_tags = re.sub(r'<article name="', "", line)
            no_tags_2 = re.sub(r'">', "", no_tags)
            articles_str += no_tags_2
        else:
            articles_str += line
    global articles

    articles = articles_str.split("</article>")
    articles.pop()

    cv = CountVectorizer(lowercase=True, binary=True)
    cv._validate_vocabulary()
    sparse_matrix = cv.fit_transform(articles)
    global terms
    terms = cv.get_feature_names()
    global sparse_td_matrix
    sparse_td_matrix = sparse_matrix.T.tocsr()

    global d
    d = {
        "and": "&",
        "AND": "&",
        "or": "|",
        "OR": "|",
        "not": "1 -",
        "NOT": "1 -",
        "(": "(",
        ")": ")"
    }  # operator replacements

    global t2i
    t2i = cv.vocabulary_

    inp = input("Search for a document: ")  # asks user for input
    while inp != '':
        if check_for_unknown_words(inp):
            retrieve_articles(inp)
            inp = input("Search for another document: ")
        else:
            inp = input("Search for another document: ")
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_fpath',
        nargs='?',
        help=
        'Filepath of pandas pickled dataframe to be processed. Saves to the same filename.'
    )
    args = parser.parse_args()

    # I/O
    posts_path = args.input_fpath
    print("Loading data...")
    data = pd.read_pickle(posts_path)

    # Build vectorizers
    print("Building vectorizers...")
    vectorizers = []

    ## Get all punct bigrams
    # character analyzer so that punctuation-only bigrams appear in the vocabulary
    bigram_char_vectorizer = CountVectorizer(analyzer='char',
                                             ngram_range=(2, 2))
    bigram_char_vectorizer.fit(data['post_body_no_blognames'])
    punct_bigrams = [
        ngram for ngram in bigram_char_vectorizer.vocabulary_
        if all(w == ' ' or w in punctuation for w in ngram)
    ]

    bigram_char_vectorizer = CountVectorizer(analyzer='char',
                                             ngram_range=(2, 2),
                                             vocabulary=punct_bigrams)
    bigram_char_vectorizer._validate_vocabulary()

    vectorizers.append(bigram_char_vectorizer)

    # Extracting style features
    #data['style_features'] = list(map(extract_style_features, tqdm(data['post_body_no_blognames'])))
    print("Extracting features...")
    data['style_features'] = [
        extract_style_features(text, vectorizers)
        for text in tqdm(data['post_body_no_blognames'].tolist())
    ]
    data.to_pickle(posts_path)
Example #20
def explain_model_weights():
	"""
	Explain the weights/parameters of a certain model
	"""
	data = request.get_json(force=True)

	# Use the original documents, not the corrected ones
	target_names = ['negative', 'neutral', 'positive', 'very_negative', 'very_positive']
	clf, vocabulary = load_clf_and_vocabulary(data['classifier'], data['vocabModel'], data['tfIdf'], data['corrected'])
	vect = CountVectorizer(vocabulary=vocabulary)
	vect._validate_vocabulary()

	if data['classifier'] == 'LR':
		explanation = explain_weights.explain_linear_classifier_weights(clf, vec=vect, target_names=target_names)
		div = html.format_as_html(explanation, include_styles=False)
		style = html.format_html_styles()

	return jsonify({
			'div': div,
			'style': style
		})
Example #21
def load_data():

    # Loading fake new headlines file
    real_file = open("data/clean_real.txt", "r")
    fake_file = open("data/clean_fake.txt", "r")
    real_titles = real_file.read().split("\n")
    fake_titles = fake_file.read().split("\n")

    all_titles = real_titles + fake_titles

    true_labels = numpy.ones(len(real_titles))
    false_labels = numpy.zeros(len(fake_titles))

    all_labels = []
    all_labels.extend(true_labels)
    all_labels.extend(false_labels)

    vectorizer = CountVectorizer()
    vectorizer._validate_vocabulary()
    matrix = vectorizer.fit_transform(all_titles)
    #This gets all occurring words from the data set into a list
    features = vectorizer.get_feature_names()

    #Split the data randomly: 70% training, then the remaining 30% again
    #into roughly 21% testing and 9% validation.
    titles_train, titles_test, labels_train, labels_test = train_test_split(
        matrix, all_labels, test_size=0.3, random_state=13)
    titles_test, titles_validation, labels_test, labels_validation = train_test_split(
        titles_test, labels_test, test_size=0.3, random_state=23)

    data_set = {
        "training_headlines": titles_train,
        "validation_headlines": titles_validation,
        "testing_headlines": titles_test,
        "training_labels": labels_train,
        "validation_labels": labels_validation,
        "testing_labels": labels_test,
        "features": features
    }
    return data_set
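A hedged sketch of how the returned dictionary could feed a classifier (MultinomialNB is an illustrative choice, not part of the original snippet; it assumes data/clean_real.txt and data/clean_fake.txt are present):

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

data_set = load_data()
clf = MultinomialNB()
clf.fit(data_set["training_headlines"], data_set["training_labels"])
val_pred = clf.predict(data_set["validation_headlines"])
print("validation accuracy:", accuracy_score(data_set["validation_labels"], val_pred))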
Example #22
def main():
    if flask.request.method == 'GET':
        return render_template('main.html')

    if flask.request.method == 'POST':
        #answer = ''
        exp_s = flask.request.form['tweet']
        exp = tweetprepared.tw_full_preprocess(exp_s)
        count_vect = CountVectorizer(analyzer='word',
                                     encoding='cp1251',
                                     vocabulary=vocabulary)
        count_vect._validate_vocabulary()
        exp_vect_2 = count_vect.transform([exp])
        tfidf_transformer = TfidfTransformer(use_idf=False)
        exp_vect_2_tfidf = tfidf_transformer.transform(exp_vect_2)
        predict = model_mnb.predict(exp_vect_2_tfidf)
        if predict[0] == 0:
            answer = 'Негативное'  # "Negative"
        else:
            answer = 'Позитивное'  # "Positive"
        #temp = predict[0]
        return render_template('main.html', tweet=exp_s, result=answer)
Example #23
def eval_cosine_similarity(actual, predicted):
    '''
    cosine similarity based on tf
    Vectorise based on a vocab that is a combination of actual summary and predicted summary
    :return: float (avg. cosine similarity across all documents)
    '''

    cos_sim = 0

    for i in range(len(actual)):
        vocabulary = actual[i] + ' ' + predicted[i]

        # `input` only selects how raw documents are read ('content'/'file'/'filename'),
        # so the combined text is passed to fit() rather than to the constructor.
        vectoriser = CountVectorizer()
        vectoriser.fit([vocabulary])
        vectoriser._validate_vocabulary()

        actual_vector = vectoriser.transform([actual[i]])
        predicted_vector = vectoriser.transform([predicted[i]])

        # cosine_similarity returns a 1x1 array here; accumulate the scalar value
        cos_sim += cosine_similarity(actual_vector, predicted_vector)[0][0]

    return 1.0*cos_sim/len(actual)
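A small illustrative call with made-up summaries (both lists must be the same length, one predicted summary per reference):

actual = ["the cat sat on the mat", "stock prices rose sharply today"]
predicted = ["a cat was sitting on a mat", "prices of stocks rose today"]
print(eval_cosine_similarity(actual, predicted))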
Example #24
def run(raw_data):
    try:
        input_json = json.loads(raw_data)
        
        df = pd.DataFrame.from_dict(input_json, orient='columns')
        df['processed_input'] = df.apply(lambda row: process_input(row), axis=1)
        
        count_vect = CountVectorizer(vocabulary=vocab)
        count_vect._validate_vocabulary()
        
        prediction = model.predict(count_vect.transform(df['processed_input']))
        
        labels_dict = {}
        labels_dict['assignment'] = 0
        labels_dict['quiz'] = 1
        labels_dict['homework'] = 2
        labels_dict['test'] = 3
        labels_dict['extra credit'] = 4

        arr_labels = ['assignment', 'quiz', 'homework', 'test', 'extra credit']

        for index, row in df.iterrows():
            label_match_school_category = re.search('assignment|quiz|homework|test|extra credit', row['School Category'].lower())
            label_match_assignment_name = re.search('assignment|quiz|homework|test|extra credit', row['Assignment Name'].lower()) 
            predicted_match_school_category = re.search(arr_labels[prediction[index]], row['School Category'].lower())
            predicted_match_assignment_name = re.search(arr_labels[prediction[index]], row['Assignment Name'].lower())     
            if label_match_school_category and (label_match_assignment_name is None) and (predicted_match_school_category is None):
                prediction[index] = labels_dict[label_match_school_category.group()]
            elif label_match_assignment_name and (label_match_school_category is None) and (predicted_match_assignment_name is None):
                prediction[index] = labels_dict[label_match_assignment_name.group()]
        
        out_json = json.dumps(prediction, cls=NumpyEncoder)
        return out_json
    
    except Exception as e:
        msg_exception = str(e)
        return json.dumps({"error": msg_exception})
Example #25
def process_input(row):
    input_merged = row['Assignment Name'] + ' ' + row['School Category']

    # gensim's preprocess_string through series of txt_filters which generates tokens array
    input_processed_tokens = " ".join(
        preprocess_string(input_merged, txt_filters))

    # input_processed_tokens is deduplicated to form final input string
    # input_processed = " ".join(sorted(set(input_processed_tokens), key=input_processed_tokens.index))
    return input_processed_tokens


vocabulary_to_load = pickle.load(open("vocab.pkl", 'rb'))
count_vect = CountVectorizer(vocabulary=vocabulary_to_load)
load_model = pickle.load(open("classifier_model.pkl", 'rb'))
count_vect._validate_vocabulary()


class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)


@app.route('/api', methods=['POST'])
def predict():

    # Get the data from the POST request.
    #data = request.get_json(force=True)
    data = request.json
Example #26
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))
    #
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    #
    # 6. Join the words back into one string separated by space,
    # and return the result.
    return " ".join(meaningful_words)


vocabulary_to_load = pickle.load(open("vocab.pickle", 'rb'))
vectorizer = CountVectorizer(ngram_range=(5000, 5000),
                             min_df=1,
                             vocabulary=vocabulary_to_load)
vectorizer._validate_vocabulary()

try:
    f = open('forest.pickle', 'rb')
    forest = pickle.load(f)
    f.close()
except:
    print("Prepare the classifier using train_classifier.py and then try again.")

clean_test_blog = u"""Many government agencies have media accounts that they used for community outreach. TSA has done a great job trying to engage the public. So much so that Rolling Stone has ranked the Transportation Security Administration (TSA) Instagram N0. 4. It is “sandwiched between badgalriri (Rihanna) and Beyoncé on RollingStone.com’s 100 best Instagram accounts”. The TSA postings were mostly devoted to photos of items that they have confiscated from passengers’ luggage which Rolling Stone found fascinating, entertaining and terrifying.
TSA Instagram has more than half a million followers with over 150,000 comments to its posts and more than 2 million likes for its images. It’s most popular Instagram image is a life-size prop dummy from the “Texas Chainsaw Massacre” movie. The image received more than 10,000 likes.
Many government agencies have media accounts that they used for community outreach. TSA has done a great job trying to engage the public. So much so that Rolling Stone has ranked the Transportation Security Administration (TSA) Instagram N0. 4. It is “sandwiched between badgalriri (Rihanna) and Beyoncé on RollingStone.com’s 100 best Instagram accounts”. The TSA postings were mostly devoted to photos of items that they have confiscated from passengers’ luggage which Rolling Stone found fascinating, entertaining and terrifying. TSA Instagram has more than half a million followers with over 150,000 comments to its posts and more than 2 million likes for its images. It’s most popular Instagram image is a life-size prop dummy from the “Texas Chainsaw Massacre” movie. The image received more than 10,000 likes."""

clean_test_filter_blog = "".join(filter(lambda x: x in printable, clean_test_blog))  # join: filter() returns an iterator in Python 3
clean_test_blog = blog_to_words(clean_test_filter_blog)
def main():
    global dfDataSet
    global dfTrainData
    global dfTestData

    print("read data ...")
    readData(path_2_congnghe, label_of_news=0)
    readData(path_2_others, label_of_news=1)

    dfTrainData, dfTestData = train_test_split(dfDataSet,
                                               shuffle=True,
                                               test_size=0.3)
    dfTrainData = dfTrainData.reset_index(drop=True)
    dfTestData = dfTestData.reset_index(drop=True)

    if (path.isfile("output\\dictionary")):
        print("read dictionary from file ...")
        readDictOfTrainFromFile()
    else:
        print("build dictionary ...")
        buildDictOfTrain()
        saveDictOfTrain2File()

    print("transform training dataset to doc-term matrix ...")
    countVctz = CountVectorizer(encoding='utf-8',
                                vocabulary=dictOfTrain,
                                min_df=0.1)
    countVctz._validate_vocabulary()
    countDocTerm = countVctz.transform(raw_documents=dfTrainData['contents'])

    print("use Mutual Information to select k-best features ...")
    if (len(dictOfTrain.items()) < 10000):
        selector = SelectKBest(score_func=mutual_info_classif, k='all')
    else:
        selector = SelectKBest(score_func=mutual_info_classif, k=10000)
    selector.fit(X=countDocTerm, y=dfTrainData['labels'])
    kbestDocTerm = selector.transform(X=countDocTerm)

    newFeature = []
    scoreFeature = []
    for choose, feature, score in zip(selector.get_support(),
                                      countVctz.get_feature_names(),
                                      selector.scores_):
        if (choose == True):
            newFeature.append(feature)
            scoreFeature.append(score)

    for score, feature in sorted(zip(scoreFeature, newFeature), reverse=True):
        log2File("%s:\t\t\t%s" % (str(feature), str(score)),
                 "output\\kbest-features")

    print("fit Multinomial Naive Bayes ...")
    nbClf = MultinomialNB()
    nbModel = nbClf.fit(X=kbestDocTerm, y=dfTrainData['labels'])

    print("predict test dataset")
    testDocTerm = countVctz.transform(raw_documents=dfTestData['contents'])
    kbestTestDocTerm = selector.transform(X=testDocTerm)
    predictResult = nbModel.predict(X=kbestTestDocTerm)

    clfReport = classification_report(y_true=dfTestData['labels'],
                                      y_pred=predictResult)
    refuMatrix = confusion_matrix(y_true=dfTestData['labels'],
                                  y_pred=predictResult)

    print(clfReport)
    log2File("Classification Report:\r", "output\\report")
    log2File(clfReport, "output\\report")

    print(refuMatrix)


def get_stop_words(stop_file_path):
    """load stop words"""

    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)


#load a set of stop words
stopwords = get_stop_words("stopwords.txt")

#get the text
#overclean
#create a vocabulary of words,
#ignore words that appear in 85% of documents,
#eliminate stop words
#limit our vocabulary size to 10,000
cv = CountVectorizer(max_df=0.85,
                     ngram_range=(1, 2),
                     stop_words=stopwords,
                     max_features=10000)
cv._validate_vocabulary()
cvfit = cv.fit(overclean)
word_count_vector = cvfit.transform(overclean)
pickle.dump(cvfit, open('cvfit.joblib', 'wb'))
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)
pickle.dump(tfidf_transformer, open('tfidf_transformer.joblib', 'wb'))
# you only needs to do this once
feature_names = cv.get_feature_names()
pickle.dump(feature_names, open('feature_names.joblib', 'wb'))
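A sketch of how the pickled artifacts above might be reloaded to score terms in a new document (the document text is made up; ranking the non-zero tf-idf weights is a common follow-up step, not taken from the original):

import pickle

cvfit = pickle.load(open('cvfit.joblib', 'rb'))
tfidf_transformer = pickle.load(open('tfidf_transformer.joblib', 'rb'))
feature_names = pickle.load(open('feature_names.joblib', 'rb'))

doc = "machine learning improves search ranking quality"
tfidf_vector = tfidf_transformer.transform(cvfit.transform([doc]))

# Rank this document's terms by tf-idf weight and show the top 5.
coo = tfidf_vector.tocoo()
top = sorted(zip(coo.col, coo.data), key=lambda t: t[1], reverse=True)[:5]
for col, score in top:
    print(feature_names[col], round(score, 3))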
Example #29
def get_edge_weights(train_set, test_doc, variant = "co-occurrences", model = ""):
    final_weights = []
    
    test_doc_candidates =  try1.extractKeyphrasesTextRank(test_doc)

    if variant == "co-occurrences":
        
        words_nodes = []
        for doc in train_set:
            words_nodes += try1.extractKeyphrasesTextRank(doc)
        
        vectorizer = CountVectorizer(binary = True,
                                     analyzer = 'word', 
                                     ngram_range = (1,3), 
                                     stop_words = 'english',
                                     token_pattern = r"(?u)\b[a-zA-Z][a-zA-Z-]*[a-zA-Z]\b", 
                                     lowercase = True,
                                     vocabulary = list(unique_everseen(itertools.chain.from_iterable(words_nodes)))
                                     )
        vectorizer._validate_vocabulary()                
        #https://stackoverflow.com/questions/35562789/how-do-i-calculate-a-word-word-co-occurrence-matrix-with-sklearn
        #https://github.com/scikit-learn/scikit-learn/issues/10901
                
        X = vectorizer.fit_transform(test_doc_candidates[0])
        X = lil_matrix(X)
        Xc = (X.T * X) # this is co-occurrence matrix in sparse csr format
        Xc[Xc > 0] = 1 # run this line if you don't want extra within-text cooccurence (see below) bem explicado no link above 
        Xc.setdiag(0) #  fill same word cooccurence to 0
        Xc = lil_matrix(Xc)

        feature_names = vectorizer.get_feature_names()
        final_weights = format_weights(Xc.tocoo(), feature_names)
        
    elif variant == "embeddings":
        #https://www.shanelynn.ie/word-embeddings-in-python-with-spacy-and-gensim/
        feature_names = list(unique_everseen(itertools.chain.from_iterable(test_doc_candidates)))
        
        similarity_matrix = lil_matrix((len(feature_names), len(feature_names)), dtype=float)
        
        test_doc = try1.extractKeyphrasesTextRank(test_doc)
        
        for sent in test_doc:
            row_m = -1
            for gram1 in feature_names:
                 
                col_m = -1
                row_m += 1
                
                for gram2 in feature_names:
                    col_m += 1
                    
                    if gram1 in sent and gram2 in sent and gram1 != gram2: 
                    
                        i = 0
                        acc = 0
                        
                        grams_i = gram1.split()
                        grams_j = gram2.split()
                        
                        for g_i in grams_i:
                            for g_j in grams_j:
                                i += 1
                                try:
                                    acc += model.similarity(g_i, g_j)
                                except KeyError as e:
                                    continue
                        similarity = acc/i
                        similarity_matrix[row_m, col_m] = similarity
                            
        final_weights = format_weights(similarity_matrix.tocoo(), feature_names)
 
    elif variant == "edit_distance":
       import nltk
       feature_names = list(unique_everseen(itertools.chain.from_iterable(test_doc_candidates)))  
            
       edit_distance_matrix = lil_matrix((len(feature_names), len(feature_names)), dtype=float)         
       
       test_doc = try1.extractKeyphrasesTextRank(test_doc)
       for sent in test_doc:
           row_m = -1
           for gram1 in feature_names:
                 
                col_m = -1
                row_m += 1
                
                for gram2 in feature_names:
                    col_m += 1
                    
                    if gram1 in sent and gram2 in sent and gram1 != gram2: 
                    #http://www.nltk.org/howto/metrics.html
                    #https://www.nltk.org/api/nltk.metrics.html
                    #https://www.datacamp.com/community/tutorials/fuzzy-string-python
                    #edit_distance - 0.0153571428571
                    #binary_distance - 0.0280357142857
                    #ratio
                        #edit_distance_matrix[row_m,col_m] = fuzz.token_set_ratio(gram1, gram2)
                        edit_distance_matrix[row_m,col_m] = 1/(nltk.edit_distance(gram1, gram2))
            
       final_weights = format_weights(edit_distance_matrix.tocoo(), feature_names)
       
    elif variant == "binary_distance":
       import nltk
       feature_names = list(unique_everseen(itertools.chain.from_iterable(test_doc_candidates)))  
            
       edit_distance_matrix = lil_matrix((len(feature_names), len(feature_names)), dtype=float)         
       
       test_doc = try1.extractKeyphrasesTextRank(test_doc)
       for sent in test_doc:
           row_m = -1
           for gram1 in feature_names:
                 
                col_m = -1
                row_m += 1
                
                for gram2 in feature_names:
                    col_m += 1
                    
                    if gram1 in sent and gram2 in sent and gram1 != gram2: 
                        edit_distance_matrix[row_m,col_m] = 1/(nltk.binary_distance(gram1, gram2))
            
       final_weights = format_weights(edit_distance_matrix.tocoo(), feature_names)
    
    elif variant == "levenshtein_ratio_and_distance":
       feature_names = list(unique_everseen(itertools.chain.from_iterable(test_doc_candidates)))  
            
       levenshtein_ratio_and_distance_matrix = lil_matrix((len(feature_names), len(feature_names)), dtype=float)         
       
       test_doc = try1.extractKeyphrasesTextRank(test_doc)
       for sent in test_doc:
           row_m = -1
           for gram1 in feature_names:
                 
                col_m = -1
                row_m += 1
                
                for gram2 in feature_names:
                    col_m += 1
                    
                    if gram1 in sent and gram2 in sent and gram1 != gram2: 
                        levenshtein_ratio_and_distance_matrix[row_m,col_m] = levenshtein_ratio_and_distance(gram1, gram2, ratio_calc = True)
        
       final_weights = format_weights(levenshtein_ratio_and_distance_matrix.tocoo(), feature_names)
       
    elif variant == "ratio":
       from fuzzywuzzy import fuzz
       feature_names = list(unique_everseen(itertools.chain.from_iterable(test_doc_candidates)))  
            
       edit_distance_matrix = lil_matrix((len(feature_names), len(feature_names)), dtype=float)         
       
       test_doc = try1.extractKeyphrasesTextRank(test_doc)
       for sent in test_doc:
           row_m = -1
           for gram1 in feature_names:
                 
                col_m = -1
                row_m += 1
                
                for gram2 in feature_names:
                    col_m += 1
                    
                    if gram1 in sent and gram2 in sent and gram1 != gram2: 
                        edit_distance_matrix[row_m,col_m] = fuzz.ratio(gram1, gram2)
            
       final_weights = format_weights(edit_distance_matrix.tocoo(), feature_names)
        
    elif variant == "co-occurrences_plus_embeddings":
        #https://www.irit.fr/publis/SIG/2018_SAC_MRR.pdf
        dict_cooccurrences = get_edge_weights(train_set, test_doc, variant="co-occurrences", model=model)
        
        dict_embeddings = get_edge_weights(train_set, test_doc, variant="embeddings", model=model)
        
        final_weights = { k: dict_cooccurrences.get(k, 0) + dict_embeddings.get(k, 0) for k in set(dict_cooccurrences) & set(dict_embeddings) }

    else:
        print(">>UNKNOWN>>EDGE WEIGHTS>>", variant)

    return final_weights
Example #30
class EmbeddingsTool():
    '''Utility class to build, query and provide example usage of
    text embeddings.
    '''
    NDIM = 100

    def __init__(self, *args, **kwargs):
        '''Returns EmbeddingsTools that is ready for use
        Keyword Arguments:
        embeddings_file -- the file from (to) which the embeddings
         are serialized.
        doc2vec -- if True, use Doc2Vec embeddings, otherwise use Word2Vec.
        '''

        self.embeddings_file = kwargs['embeddings_file']
        self.lda_file = 'lda.model'
        self.tf_file = 'tf.model'

        self.doc2vec = kwargs['doc2vec']
        time0 = time.time()
        if self.embeddings_file and os.path.exists(self.embeddings_file):
            # Google's Pre-trained model.
            if 'GoogleNews' in self.embeddings_file:
                print('loading Google pretrained model file {}'.format(
                    self.embeddings_file))
                self.embeddings = KeyedVectors.load_word2vec_format(
                    self.embeddings_file, binary=True)
                print('  ... took {} sec'.format(time.time() - time0))
            else:  # One of our "make_embeddings" models
                if self.doc2vec:
                    self.embeddings = doc2vec.Doc2Vec.load(
                        self.embeddings_file)
                else:
                    self.embeddings = KeyedVectors.load(self.embeddings_file)

        if os.path.exists(self.lda_file):
            self.lda = joblib.load(self.lda_file)

        self.tf_vectorizer = CountVectorizer(max_df=0.95,
                                             min_df=2,
                                             max_features=20000,
                                             stop_words='english')
        return

    def info(self):
        '''Dump out information about the specified embeddings file.'''
        def info_wv(self, wv):
            '''Dump out information about a Word2Vec embedding '''
            print('Word Embeddings.')
            print('  Length:  {}.  First 3 words {}'.format(
                len(wv.vocab), wv.index2word[:3]))
            print('  Example word embedding.  wv["{}"]:'.format('king'))
            print('    V length:  {}, embedding follows:  {}'.format(
                len(wv['king']), wv['king']))

        if self.doc2vec:
            print('Document Embeddings.')
            d2v = self.embeddings
            doctags = list(d2v.docvecs.doctags.keys())  # dict keys are not sliceable in Python 3
            print('  Length:  {}.  First 3 entries {}'.format(
                len(d2v.docvecs.doctags), doctags[:3]))
            tag0 = doctags[0]
            print('  Example document embedding.  d2v.docvecs["{}"]:'.format(
                tag0))
            print('    V length:  {}, embedding follows:  {}'.format(
                len(d2v.docvecs[tag0]), d2v.docvecs[tag0]))
            print('The word embeddings are available via the "wv" member:')
            info_wv(self, d2v.wv)
        else:
            info_wv(self, self.embeddings)



    def load_data(self, input):
        self.df = read_json(input)

        return self.df

    def make_embeddings(self):
        '''Build the embeddings and serialize to disk.'''
        if 'GoogleNews' in self.embeddings_file:  # Google's pre-trained model
            raise ValueError("attempting to overwrite the Google corpus.")

        self.df['sentences'] = self.df['title'] + ' ' + self.df['body']
        self.df.drop_duplicates(subset='sentences', inplace=True)

        # Strips out punctuation, Lower cases, removes English stop words
        # and white space.  Leaves numbers
        analyze = self.tf_vectorizer.build_analyzer()

        self.df['tokenlist'] = [analyze(s) for s in self.df.sentences.tolist()]
        time0 = time.time()
        print('Fitting embeddings ... (hard coded dimensions is {})'.format(
            EmbeddingsTool.NDIM))
        if self.doc2vec:
            taggeddocs = []
            for i, tokenlist in enumerate(self.df.tokenlist):
                td = TaggedDocument(tokenlist, [str(self.df.id[i])])
                taggeddocs.append(td)
            self.embeddings = doc2vec.Doc2Vec(alpha=0.025,
                                              min_alpha=0.025,
                                              size=EmbeddingsTool.NDIM,
                                              window=8,
                                              min_count=5,
                                              workers=4)
            # to do:  continue training
            self.embeddings.build_vocab(taggeddocs)
            print('  ... build_vocab took {} sec'.format(time.time() - time0))
            for epoch in range(10):
                self.embeddings.train(
                    taggeddocs,
                    total_examples=self.embeddings.corpus_count,
                    epochs=self.embeddings.iter)
                print('  ... training epoch {} through {} sec'.format(
                    epoch,
                    time.time() - time0))
                self.embeddings.alpha -= 0.002
                self.embeddings.min_alpha = self.embeddings.alpha
        else:
            # build the embeddings by training the network, 1 call.
            self.embeddings = word2vec.Word2Vec(self.df.tokenlist,
                                                size=EmbeddingsTool.NDIM,
                                                min_count=10,
                                                workers=4)
        print('  ... took {} sec'.format(time.time() - time0))
        time0 = time.time()
        print('Saving embeddings to output {}'.format(self.embeddings_file))
        if self.doc2vec:
            print('writing doc2vec')
            self.embeddings.delete_temporary_training_data(
                keep_doctags_vectors=True, keep_inference=True)
            self.embeddings.save(self.embeddings_file)
        else:
            self.embeddings.wv.save(self.embeddings_file)
        print('  ... took {} sec'.format(time.time() - time0))

        # LDA
        # Use tf (raw term count) features for LDA
        print('Getting tf ..')
        tf = self.tf_vectorizer.fit_transform(self.df.sentences)
        # compare tf and self.df.tokenlist
        self.lda = LatentDirichletAllocation(n_components=50,
                                             max_iter=10,
                                             learning_method='online',
                                             learning_offset=10.,
                                             random_state=0)
        time0 = time.time()
        print('Training LDA ...')
        self.lda.fit(tf)
        print('  ... took {} sec'.format(time.time() - time0))
        joblib.dump(self.lda, self.lda_file)
        joblib.dump(self.tf_vectorizer.vocabulary_, self.tf_file)
        return

    def dumptopics(self, n_top_words):
        if os.path.exists(self.tf_file):
            self.vocab = joblib.load(self.tf_file)
        self.tf_vectorizer = CountVectorizer(vocabulary=self.vocab)
        self.tf_vectorizer._validate_vocabulary()
        feature_names = self.tf_vectorizer.get_feature_names()

        for topic_idx, topic in enumerate(self.lda.components_):
            message = "Topic #%d: " % topic_idx
            message += " ".join([
                feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]
            ])
            print(message)

    def query(self, q):
        '''Execute Queries against the model.

        q -- this is the query string.
          It can be either a single term (e.g. 'king') which will use the
          'most_similar' API OR
          It can be a call to the method on either KeyedVectors (Word2Vec)
          or DocvecsArray (Doc2Vec), extended as necessary
          (e.g. "doesnt_match('breakfast cereal lunch dinner'.split())")
         See https://radimrehurek.com/gensim/models/doc2vec.html and
             https://radimrehurek.com/gensim/models/word2vec.html
             and the example usages of this tool (--help)
        '''
        matches = []
        if self.doc2vec:
            d2v = self.embeddings.docvecs
            if (len(q.split()) == 1):
                #q = self.analyze(q)[0]
                matches = d2v.most_similar('{}'.format(q))
            else:
                matches = eval('d2v.{}'.format(q))
        else:
            if len(q.split()) == 1:
                q = self.analyze(q)[0]
                matches = self.embeddings.most_similar(
                    positive=['{}'.format(q)])
            else:
                matches = eval('self.embeddings.{}'.format(q))

        return q, matches

    def csv_dump(self):
        ''' Dump Word2Vec embeddings to CSV file. '''
        for word in self.embeddings.index2word:
            try:
                print('{}, '.format(word), end='')
                for feat in self.embeddings[word]:
                    print('{}, '.format(feat), end='')
                print('')
            except UnicodeError:  # non-ascii characters!
                continue