Esempio n. 1
0
 def spot_check(self, X_train, Y_train, n_splits=20):
     """Cross-validate a fixed set of one-vs-rest classifiers.

     Every candidate is evaluated with the same shuffled K-fold split
     (``random_state=1``) and scored with Hamming loss and accuracy.

     Parameters:
         X_train: training features passed to ``cross_validate``.
         Y_train: training targets passed to ``cross_validate``.
         n_splits: number of folds used by ``KFold``.

     Returns:
         A tuple ``(names, acc, loss)`` of three parallel lists: the
         short model names, the per-fold ``test_Accuracy`` results and
         the per-fold ``test_hamming`` results.
     """
     # One (name, estimator) pair per candidate.  The previous version
     # appended a single 10-element tuple, so only the SVC was evaluated.
     models = [
         ('SVC', OneVsRestClassifier(SVC(probability=True))),
         ('RM',
          OneVsRestClassifier(RandomForestClassifier(criterion='entropy'))),
         ('GB', OneVsRestClassifier(GradientBoostingClassifier())),
         ('MLP', OneVsRestClassifier(MLPClassifier())),
         ('LR', OneVsRestClassifier(LogisticRegression())),
     ]
     scoring = {
         'hamming': make_scorer(hamming_loss),
         'Accuracy': make_scorer(accuracy_score)
     }
     # The splitter is seeded with an int, so it yields the same folds on
     # every use; build it once instead of once per model.
     kfold = KFold(n_splits=n_splits, random_state=1, shuffle=True)

     acc = []
     loss = []
     names = []
     for name, model in models:
         cv_results = cross_validate(model,
                                     X_train,
                                     Y_train,
                                     cv=kfold,
                                     scoring=scoring,
                                     return_train_score=True)
         acc.append(cv_results['test_Accuracy'])
         loss.append(cv_results['test_hamming'])
         names.append(name)
     return names, acc, loss
Esempio n. 2
0
def load_models(projectname):
    """Load the ``MODEL_NUMBER`` saved models belonging to a project.

    For every index ``i`` in ``range(MODEL_NUMBER)`` the architecture is
    read from ``<projectname>-<i>.json`` and the weights from
    ``<projectname>-<i>.h5`` in the hard-coded models directory.

    Parameters:
        projectname: file-name prefix shared by the saved model files.

    Returns:
        A list with the loaded models, ordered by index.
    """
    model_dir = 'D:/TSE/python/missplaceclass/models/'
    models = []
    for i in range(MODEL_NUMBER):
        prefix = model_dir + projectname + '-' + str(i)
        # Use a context manager so the JSON file handle is closed right
        # after reading instead of being left to the garbage collector.
        with open(prefix + '.json') as json_file:
            model = model_from_json(json_file.read())
        model.load_weights(prefix + '.h5')
        models.append(model)
    return models
Esempio n. 3
0
def build_model(tokenizer):
    """Build ``MODEL_NUMBER`` identical two-branch convolutional models.

    The left branch embeds token sequences with a frozen embedding layer
    initialised from a saved word2vec model; the right branch consumes
    inputs of shape ``(8, 1)``.  Both branches go through three
    ``Conv1D`` layers, are flattened and concatenated, then fed to a
    dense layer and a single sigmoid output unit.

    Parameters:
        tokenizer: object exposing ``word_index``, a mapping from word to
            integer index (index 0 is left unused by the matrix below).

    Returns:
        A list of ``MODEL_NUMBER`` compiled models.
    """
    embedding_model = word2vec.Word2Vec.load(
        'D:/TSE/python/missplaceclass/embedding_model/new_model_1.bin')
    word_index = tokenizer.word_index
    nb_words = len(word_index)

    # Row i holds the vector of the word with index i; words unknown to the
    # word2vec model keep an all-zero row.
    embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if word == 'false':
            # Debug trace carried over from the original implementation.
            print(word)
        try:
            embedding_vector = embedding_model.wv[word]
        except KeyError:
            # Out-of-vocabulary lookups raise instead of returning None,
            # so the original ``is not None`` guard could never skip them.
            continue
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    models = []
    for _ in range(MODEL_NUMBER):
        embedding_layer = Embedding(nb_words + 1,
                                    EMBEDDING_DIM,
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    weights=[embedding_matrix],
                                    trainable=False)

        # Text branch: frozen embeddings followed by three 1x1 convolutions.
        model_left = Sequential()
        model_left.add(embedding_layer)
        model_left.add(Conv1D(128, 1, padding="same", activation='tanh'))
        model_left.add(Conv1D(128, 1, activation='tanh'))
        model_left.add(Conv1D(128, 1, activation='tanh'))
        model_left.add(Flatten())

        # Second branch: same convolution stack over an (8, 1) input.
        model_right = Sequential()
        model_right.add(
            Conv1D(128,
                   1,
                   input_shape=(8, 1),
                   padding="same",
                   activation='tanh'))
        model_right.add(Conv1D(128, 1, activation='tanh'))
        model_right.add(Conv1D(128, 1, activation='tanh'))
        model_right.add(Flatten())

        # Merge both branches and classify with a single sigmoid unit.
        output = merge.Concatenate()([model_left.output, model_right.output])
        output = Dense(128, activation='tanh')(output)
        output = Dense(1, activation='sigmoid')(output)

        model = Model([model_left.input, model_right.input], output)
        model.compile(loss='binary_crossentropy',
                      optimizer='Adadelta',
                      metrics=['accuracy'])
        models.append(model)
    return models
Esempio n. 4
0
def topic_model_coherence_generator(corpus, texts, dictionary, 
                                    start_topic_count=2, end_topic_count=10, step=1,
                                    cpus=1):
    """Fit one Mallet LDA model per topic count and score its coherence.

    Topic counts run from ``start_topic_count`` to ``end_topic_count``
    (inclusive) in increments of ``step``.  Each model is trained for 500
    iterations using ``cpus`` workers and scored with the ``c_v``
    coherence measure.

    Returns:
        ``(models, coherence_scores)`` — two lists aligned by position.
    """
    candidate_counts = range(start_topic_count, end_topic_count + 1, step)
    fitted_models, scores = [], []

    for n_topics in tqdm(candidate_counts):
        lda = gensim.models.wrappers.LdaMallet(
            mallet_path=MALLET_PATH,
            corpus=corpus,
            num_topics=n_topics,
            id2word=dictionary,
            iterations=500,
            workers=cpus,
        )
        evaluator = gensim.models.CoherenceModel(
            model=lda,
            corpus=corpus,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        scores.append(evaluator.get_coherence())
        fitted_models.append(lda)

    return fitted_models, scores
Esempio n. 5
0
def find_best_topic_num(dataset_name, lim_low, lim_high):
    """Train LDA models for a span of topic counts and save the best one.

    One ``LdaMulticore`` model is trained for every topic count from
    ``lim_low`` to ``lim_high`` (inclusive).  The coherence of each model
    is plotted via ``draw_plot`` and the model with the highest coherence
    (the first one on ties) is saved to ``../models/tm_<dataset_name>.model``.
    Nothing is returned.
    """
    sentences, dic, corpus = load_topic_data(dataset_name)

    candidates = []
    scores = []
    for n_topics in range(lim_low, lim_high + 1):
        candidate = gensim.models.ldamulticore.LdaMulticore(
            corpus=corpus,
            id2word=dic,
            num_topics=n_topics,
            random_state=100,
            chunksize=100,
            passes=10,
            per_word_topics=True,
        )
        candidates.append(candidate)
        scores.append(get_coherence_score(candidate, sentences, dic))

    top_score = max(scores)
    top_index = scores.index(top_score)

    # x axis: the topic counts that were actually evaluated
    topic_axis = list(range(lim_low, lim_low + len(scores)))
    draw_plot(dataset_name, topic_axis, scores, top_score,
              top_index + lim_low)

    candidates[top_index].save("../models/tm_" + dataset_name + ".model")
Esempio n. 6
0
def build_model(num_topics):
    """Train ten LDA models on the files in ``resources/`` and pickle them.

    Every file in ``resources/`` is read as UTF-8; files that cannot be
    read or decoded are skipped.  The documents are run through
    ``process``, turned into a dictionary (dropping tokens that appear in
    fewer than 2 documents or in more than half of them) and converted to
    bag-of-words vectors.  Ten ``LdaModel`` instances with ``num_topics``
    topics are trained and written, together with the dictionary, to
    ``topic_models_top.pkl``.

    Parameters:
        num_topics: number of topics for each trained model.
    """
    print('Building model', flush=True)

    docs = []
    for filename in os.listdir("resources/"):
        with open("resources/" + filename, encoding='utf8') as doc:
            try:
                txt = doc.read()
            except (UnicodeDecodeError, OSError):
                # Skip files that are not readable UTF-8 text.  The former
                # bare ``except`` also swallowed KeyboardInterrupt and
                # programming errors.
                continue
        docs.append(txt)
    docs = process(docs)
    dictionary = Dictionary(docs)

    # Remove rare and common tokens.
    # Filter out words that occur too frequently or too rarely.
    max_freq = 0.5
    min_wordcount = 2
    dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

    corpus = [dictionary.doc2bow(doc) for doc in docs]
    models = [LdaModel(corpus, num_topics=num_topics) for _ in range(10)]

    with open("topic_models_top.pkl", "wb") as mfile:
        print("Writing topic_models_top.pkl", flush=True)
        pickle.dump((models, dictionary), mfile)
Esempio n. 7
0
def build_models(data, data_labels, unique_labels, N=30):
    """Train one LDA model per label on the documents carrying that label.

    Parameters:
        data: sequence of raw document strings.
        data_labels: sequence of labels, aligned position by position
            with ``data``.
        unique_labels: the labels to build a model for.
        N: number of topics of every model.

    Returns:
        ``(models, vocabularies)`` — two lists aligned with
        ``unique_labels``, holding the trained LDA model and the
        dictionary built from that label's documents.
    """
    models = []
    vocabularies = []
    LDA = gensim.models.ldamodel.LdaModel
    for label in unique_labels:
        # Iterate documents and labels in lockstep rather than through
        # ``range(len(...))`` with an index that shadowed the outer loop's.
        corpus = [
            word_tokenize(text)
            for text, text_label in zip(data, data_labels)
            if text_label == label
        ]
        vocabulary = corpora.Dictionary(corpus)
        BOW = [vocabulary.doc2bow(doc) for doc in corpus]
        LDAmodel = LDA(BOW,
                       num_topics=N,
                       id2word=vocabulary,
                       passes=25,
                       alpha='auto',
                       minimum_probability=0.01,
                       random_state=30)
        models.append(LDAmodel)
        vocabularies.append(vocabulary)
    return models, vocabularies
def iterate_topics(topic_range=range(10, 50, 5), number_of_records=None):
    """Fit one LDA model per topic count and plot how the models differ.

    The corpus is loaded from ``../texts/gists/``; its titles and the
    converted corpus are pickled to ``../outputs/``.  For every value in
    ``topic_range`` a model is fitted with ``fit_lda`` and plotted with
    ``plot_distances``.  Finally the first model is plotted against the
    last one.

    Parameters:
        topic_range: indexable sequence of topic counts to try.
        number_of_records: forwarded to ``load_corpus``.

    Returns:
        The list of fitted models, in ``topic_range`` order.
    """
    corpus_list, titles = load_corpus(
        '../texts/gists/', number_of_records=number_of_records)
    with open('../outputs/iter_titles.pkl', 'wb') as titles_file:
        pickle.dump(titles, titles_file)

    vectorizer = fit_vectorizer(corpus_list)
    corpus, id2word = convert_corpus(corpus_list, vectorizer)
    with open('../outputs/iter_corpus.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)

    fitted = []
    for n_topics in topic_range:
        lda = fit_lda(n_topics, corpus, id2word, 100, multicore=2, save=True)
        plot_distances(lda, title=f'Differences {n_topics} Topics')
        fitted.append(lda)

    # Compare the smallest-topic model with the largest-topic one.
    plot_distances(
        fitted[0],
        title=f'Comparing {topic_range[0]} to {topic_range[-1]} Topics',
        other_model=fitted[-1])
    return fitted
Esempio n. 9
0
	def model_validation(self,X_train,Y_train,X_test,Y_test,categories=None,subtitle='annot4'):
		"""Fit, pickle and evaluate the SVC, random-forest and ensemble classifiers.

		Each classifier is fitted on the training data, pickled to
		``self.results + <name> + "pickle_model.pkl"`` and evaluated on the
		test data: ROC and precision-recall curves, confusion matrices, and a
		metrics line (accuracy, Hamming loss and the values returned by
		``self.evaluate_ml_metrics``) appended to
		``self.results + "_<name>_<subtitle>output.txt"``.

		Parameters:
			X_train, Y_train: training features and targets.
			X_test, Y_test: test features and targets; ``Y_test`` must
				provide ``to_numpy()``.
			categories: category names handed to the curve plots; when
				omitted or empty, ``self.categories`` is used.
			subtitle: suffix used in plot titles and the metrics file name.

		Returns:
			The list of fitted classifiers in the order SVC, RM, ens.
		"""
		# ``None`` replaces the former mutable ``[]`` default; an explicit
		# empty list is still accepted and treated the same way.
		if categories is None or len(categories)<1:
			categories=self.categories
		models = []
		models.append(('SVC', OneVsRestClassifier(SVC(probability=True,
			C=8,
			class_weight='balanced',
			degree=1,
			gamma='scale',
			kernel='rbf',
			break_ties=False, cache_size=200,
			coef0=0.0,
			decision_function_shape='ovr',
			max_iter=-1,
			random_state=None,
			shrinking=True, tol=0.001, verbose=False))))

		models.append(('RM', OneVsRestClassifier(RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
			criterion='entropy', max_depth=None, max_features='auto',
			max_leaf_nodes=None, max_samples=None,
			min_impurity_decrease=0.0, min_impurity_split=None,
			min_samples_leaf=1, min_samples_split=2,
			min_weight_fraction_leaf=0.0, n_estimators=100,
			n_jobs=None, oob_score=False, random_state=None,
			verbose=0, warm_start=False))))

		# The ensemble wraps the very same SVC and RM estimator objects.
		models.append(('ens',OneVsRestClassifier(estimator=EnsembleClassifier(classifiers=[["SVC",models[0][1]],
			["RM",models[1][1]]]))))

		test_array = Y_test.to_numpy()
		trained_models=[]
		for name,estimator in models:
			model=estimator.fit(X_train, Y_train)
			trained_models.append(model)
			pkl_filename = name+"pickle_model.pkl"
			with open(self.results+pkl_filename, 'wb') as file:
				pickle.dump(model, file)
				print ("Saved classifier: "+name)
			prediction_prob=model.predict_proba(X_test)
			Y_pred = model.predict(X_test)
			title='valid'+'_'+str(name)+'_'+subtitle
			self.roc_curve_plot(test_array,prediction_prob,title,categories)
			self.pr_curve_plot(test_array,prediction_prob,title,categories)
			# Return value is not used here; the call is kept as in the original.
			self.multi_cm(Y_test,Y_pred,title)
			# A stray trailing comma used to turn ``acc`` into a 1-tuple, so
			# the report contained "(0.93,)" instead of "0.93".
			acc=accuracy_score(Y_test, Y_pred)
			loss=hamming_loss(Y_test, Y_pred)
			acc_c,recall,prec=self.evaluate_ml_metrics(Y_test,Y_pred)
			# Close the report file deterministically instead of leaking
			# the handle passed straight to ``print``.
			with open(self.results+'_'+str(name)+'_'+subtitle+"output.txt", "a") as report:
				print(acc,loss,acc_c,recall,prec, file=report)
		return trained_models
import time

import gensim.models

from python_code.model.my_tokenize.tokenizer import cut

# Word2Vec model files (relative to ../bin/) that are loaded at import time.
model_names = ['model_78w.bin', 'whole_content_1_100_80w.bin']

# Loaded models, in the same order as ``model_names``.
models = []
for model_name in model_names:
    t = time.time()
    loaded = gensim.models.Word2Vec.load('../bin/' + model_name)
    models.append(loaded)
    # Report how long loading this model took.
    elapsed = time.time() - t
    print('load model : ' + model_name + ' spend ' + str(elapsed) +
          ' seconds')


def similarity_test(arg1='台灣', arg2='中國'):
    """Print the similarity of two words under every loaded model.

    Parameters:
        arg1: first word.
        arg2: second word.
    """
    print('similarity test ' + arg1 + " compare with " + arg2)
    for model in models:
        # gensim 4 removed the ``Word2Vec.similarity`` shortcut; query the
        # word vectors in ``model.wv`` when the model exposes them and fall
        # back to the model itself otherwise.
        vectors = getattr(model, 'wv', model)
        print(vectors.similarity(arg1, arg2))


def doesnt_match_test():
    """Print, for fixed word groups, the odd word out under every loaded model."""
    data_set = [["早餐", "午餐", "晚餐", "宵夜", "車禍"], ["國文", "英文", "數學", "物理", "電腦"],
                ["爸爸", "媽媽", "書包"]]
    for data in data_set:
        print(data)
        for model in models:
            # gensim 4 removed the ``Word2Vec.doesnt_match`` shortcut; query
            # the word vectors in ``model.wv`` when the model exposes them
            # and fall back to the model itself otherwise.
            vectors = getattr(model, 'wv', model)
            print(vectors.doesnt_match(data))

Esempio n. 11
0
# Tokenize the train/test posts; results go through get_serialized_entity
# under the given names.
data_file = 'processed-blog-posts-noun%s' % str(len(raw_data))
tokenized_posts_train = get_serialized_entity(
    'train-posts-%s', lambda: process_blog_posts(train_set['text']))
tokenized_posts_test = get_serialized_entity(
    'test-posts-%s', lambda: process_blog_posts(test_set['text']))

# Train (or load) one LDA model per topic count and record its log
# perplexity on the test posts.
topic_counts = [10, 20, 50]
log_perplexities, models, dictionaries = [], [], []

for num_topic in topic_counts:
    model_file = '%s-lda-model-topics-%s' % (data_file, str(num_topic))
    (corpus, dictionary, model) = get_serialized_entity(
        model_file, lambda: train_LDA(tokenized_posts_train, num_topic))
    test_corpus = [dictionary.doc2bow(text) for text in tokenized_posts_test]
    log_perplexities.append(model.log_perplexity(test_corpus))
    models.append(model)
    dictionaries.append(dictionary)

# Sample of test texts used to compare NER tags with LDA tags.
# NOTE(review): the slice [1:200] skips the first row and yields at most 199
# texts, not 200 -- confirm whether [:200] was intended.
sample_texts = test_set['text'][1:200].copy()

# Use the second model (topic_counts[1] topics) for tagging.
selected_model_idx = 1
selected_model = models[selected_model_idx]
selected_dictionary = dictionaries[selected_model_idx]

# The NER tags were previously also computed with ``sample_texts.apply`` and
# then immediately overwritten here, running entity extraction twice.
ner_tags, lda_tags = [], []
for text in sample_texts:
    ner_tags.append(", ".join(get_ents(text)))
    lda_tags.append(", ".join(
        get_LDA_tags(selected_model, selected_dictionary, text)))
# Choose the number of topics by comparing c_v coherence scores.
coherenceScores = []
models = []
# Candidate topic counts: 2 through 9 (the end of the range is exclusive).
num_topics_range = range(2, 10, 1)
for num_topics in num_topics_range:
    # fit an LDA model with this many topics
    ldaModel = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_state=0,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )
    models.append(ldaModel)
    # score the model and keep the coherence rounded to three decimals
    coherencemodel = CoherenceModel(
        model=ldaModel,
        texts=tokens_lst,
        dictionary=id2word,
        coherence='c_v',
    )
    coherenceScores.append(round(coherencemodel.get_coherence(), 3))

print(coherenceScores)

# In[10]:

# plot the coherence scores against the number of topics
fig = plt.figure(figsize=(13, 8))
x = range(2, 10)
plt.plot(x, coherenceScores, color="steelblue")
Esempio n. 13
0
def find_best_model_cv(n_topic_range,
                       texts,
                       id2word,
                       corpus,
                       threshold=None,
                       random_state=42,
                       plot=True,
                       verbose=False):
    """
    Searches for the best model in a given range by C_v coherence value

    Parameters:
        - `n_topic_range`
            a range of values for the `num_topics` parameter of a gensim LDA model to try;
            must be a `range` object (its `start`/`step` are read) when `verbose` is set or
            when `threshold` and `plot` are used together
        - `texts`
            a list of documents broken into words
        - `id2word`
            a dictionary containing word encodings
        - `corpus`
            the result of mapping each word in `texts` to its value in `id2word`
        - `threshold`
            a float that specifies a coherence value that if exceeded will cause the function to return early
        - `random_state`
            a random state for use in a gensim LDA model
        - `plot`
            a boolean specifying whether or not to plot coherence values against each `num_topics` value
        - `verbose`
            a boolean specifying whether or not to print updates

    Returns: a tuple containing the best model, the list of all models attempted, and a list of all coherence values obtained, respectively.

    Raises: `ValueError` if `n_topic_range` yields no values.
    """
    models = []
    coherence_vals = []

    for n_topics in n_topic_range:

        # Print percentage progress (share of the range covered before this
        # model is trained).
        if verbose:
            diff = max(n_topic_range) - n_topic_range.start
            # A single-value range has diff == 0; report 0% instead of
            # raising ZeroDivisionError.
            if diff:
                percent = 100 * (n_topics - n_topic_range.start) / diff
            else:
                percent = 0.0
            print(str(round(percent, 1)) + "% done")

        lda_model = LdaModel(corpus=corpus,
                             id2word=id2word,
                             num_topics=n_topics,
                             random_state=random_state,
                             update_every=1,
                             chunksize=100,
                             passes=10,
                             alpha='auto',
                             per_word_topics=True)
        co_model = CoherenceModel(lda_model,
                                  texts=texts,
                                  dictionary=id2word,
                                  coherence="c_v")
        coherence = co_model.get_coherence()

        models.append(lda_model)
        coherence_vals.append(coherence)

        if threshold is not None and coherence > threshold:
            if verbose:
                print('Returning early with a coherence value of ' +
                      str(coherence))

            if plot:
                # The portion of the range that was actually iterated through
                actual_range = range(n_topic_range.start,
                                     n_topics + n_topic_range.step,
                                     n_topic_range.step)
                plt.plot(actual_range, coherence_vals, 'b')
                plt.show()

            return lda_model, models, coherence_vals

    if not models:
        # Fail with a clear message before plotting; np.argmax would raise
        # the same exception type on the empty list further down.
        raise ValueError("n_topic_range must contain at least one value")

    if plot:
        plt.plot(n_topic_range, coherence_vals, 'b')
        plt.show()

    return models[np.argmax(coherence_vals)], models, coherence_vals