Example #1
def load_errors():
    len_history = 100
    shape = (len(seed_list), len(train_mode_list),
             len(num_particles_list), len_history)
    # np.float was removed in NumPy 1.24; the builtin float is equivalent
    p_error = np.full(shape, np.nan, dtype=float)
    q_error = np.full(shape, np.nan, dtype=float)
    grad_std = np.full(shape, np.nan, dtype=float)
    for seed_idx, seed in enumerate(seed_list):
        for train_mode_idx, train_mode in enumerate(train_mode_list):
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                print('{} {} {}'.format(seed, train_mode, num_particles))
                model_folder = util.get_most_recent_model_folder_args_match(
                    seed=seed,
                    train_mode=train_mode,
                    num_particles=num_particles,
                    init_near=init_near)
                if model_folder is not None:
                    stats = util.load_object(util.get_stats_path(model_folder))
                    p_error[seed_idx, train_mode_idx, num_particles_idx,
                            :len(stats.p_error_history)] = stats.p_error_history
                    q_error[seed_idx, train_mode_idx, num_particles_idx,
                            :len(stats.q_error_history)] = stats.q_error_history
                    grad_std[seed_idx, train_mode_idx, num_particles_idx,
                             :len(stats.grad_std_history)] = stats.grad_std_history
    return p_error, q_error, grad_std
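The three arrays come back NaN-padded so that runs of different lengths share one history axis. A minimal sketch of aggregating them for plotting, assuming the same module-level seed_list, train_mode_list, and num_particles_list; np.nanmean and np.nanstd skip the padding:

import numpy as np

p_error, q_error, grad_std = load_errors()
# average over the seed axis, ignoring NaN padding from unfinished runs
p_error_mean = np.nanmean(p_error, axis=0)  # (train_mode, num_particles, history)
p_error_sd = np.nanstd(p_error, axis=0)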
Example #2
def train_sklearn_boosting(training_data_dump):
    training_data = util.load_object(training_data_dump)
    model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=1),
                              n_estimators=20)
    #model = AdaBoostRegressor(SVR(kernel='linear'), n_estimators=20)
    #model = RandomForestRegressor(n_estimators = 50)
    model = model.fit(training_data[:, :-1], training_data[:, -1])
    return model
Example #3
def predict_random_forest(model, test_data_dump):
    test_data = util.load_object(test_data_dump)
    predictions = []
    targets = []
    for sample in test_data:
        targets.append(sample[-1])
        # sklearn estimators expect a 2-D array, even for a single sample
        predictions.append(model.predict(sample[:-1].reshape(1, -1))[0])
    return get_average_kappa(targets, predictions)
Example #4
def predict_sklearn_random_forest(model, test_data_dump):
    test_data = util.load_object(test_data_dump)
    targets = test_data[:, -1]
    predictions = model.predict(test_data[:, :-1])
    return get_average_kappa(targets, predictions)
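get_average_kappa is not part of these snippets; a plausible stand-in built on scikit-learn's cohen_kappa_score, assuming integer-valued targets and quadratic weighting (both are assumptions):

import numpy as np
from sklearn.metrics import cohen_kappa_score

def get_average_kappa(targets, predictions):
    # hypothetical helper: quadratic-weighted kappa on rounded predictions
    preds = np.rint(np.asarray(predictions)).astype(int)
    return cohen_kappa_score(np.asarray(targets).astype(int), preds,
                             weights='quadratic')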
Example #5
def load_formatter_fn(formatter):
    '''
    >>> load_formatter_fn('logagg.formatters.basescript') #doctest: +ELLIPSIS
    <function basescript at 0x...>
    '''
    obj = util.load_object(formatter)
    if not hasattr(obj, 'ispartial'):
        obj.ispartial = util.ispartial
    return obj
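In this example util.load_object resolves a dotted import path to a Python object rather than unpickling a file (as it does in the other snippets). One plausible implementation, for orientation only:

import importlib

def load_object(import_path):
    # 'logagg.formatters.basescript' -> module 'logagg.formatters', attr 'basescript'
    module_name, obj_name = import_path.rsplit('.', 1)
    return getattr(importlib.import_module(module_name), obj_name)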
Example #6
    def msg_store(self):
        targets = []

        for t in self.args.target:
            imp_path, args = self._parse_msg_target_arg(t)
            target_class = util.load_object(imp_path)
            target_obj = target_class(**args)
            targets.append(target_obj)

        return targets
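_parse_msg_target_arg is not shown. Judging by the call, each --target argument names a class plus keyword arguments; a hypothetical parser for a 'pkg.module.Class:key=val,key=val' format (the format itself is an assumption):

def parse_msg_target_arg(arg):
    # e.g. 'logagg.forwarders.MongoDBForwarder:host=localhost,port=27017'
    imp_path, _, argstr = arg.partition(':')
    kwargs = dict(kv.split('=', 1) for kv in argstr.split(',') if kv)
    return imp_path, kwargs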
Example #7
 def load_results_from_disk(self):
     results_list = []
     for i in range(len(self.classifier_list)):
         results = util.load_object(
             self.CLASSIFIERS_AND_RESULTS_DIR_PATH +
             util.convert_name_to_filename(self.classifier_name_list[i]) +
             '_' + self.classifier_iter + '_results.pkl')
         results_list.append(results)
     self.results = results_list
     return results_list
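Most snippets here pair util.load_object with util.save_object as pickle helpers; a minimal pair they all appear to assume (the names match the calls, the bodies are a sketch):

import pickle

def save_object(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_object(path):
    with open(path, 'rb') as f:
        return pickle.load(f)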
Example #8
def plot_variance_analysis():
    num_particles_list = [2, 5, 10, 20, 50, 100]
    [
        vimco_grad, vimco_one_grad, reinforce_grad, reinforce_one_grad,
        two_grad, log_evidence_stats, log_evidence_grad, wake_phi_loss_grad,
        log_Q_grad, sleep_loss_grad
    ] = util.load_object('./variance_analysis/data.pkl')

    fig, axss = plt.subplots(2,
                             10,
                             figsize=(20, 4),
                             dpi=100,
                             sharex=True,
                             sharey='row')
    for i, stats in enumerate([
            vimco_grad, vimco_one_grad, reinforce_grad, reinforce_one_grad,
            two_grad, log_evidence_grad, wake_phi_loss_grad, log_Q_grad,
            sleep_loss_grad, log_evidence_stats
    ]):
        for j in range(2):
            axss[j, i].plot(stats[:, j], color='black')

    axss[0, 0].set_ylabel('mean')
    axss[1, 0].set_ylabel('std')

    for ax in axss[0]:
        ax.set_yticks([ax.get_yticks()[0], ax.get_yticks()[-1]])

    for ax in axss[1]:
        ax.set_yscale('log')
        # ax.set_yticks([0, ax.get_yticks()[-1]])
        # ax.set_yticks([ax.get_yticks()[0], ax.get_yticks()[-1]])
        # ax.set_yticks([1e-2, 1e4])
        ax.set_xlabel('K')

    for axs in axss:
        for ax in axs:
            ax.set_xticks(range(len(num_particles_list)))
            ax.set_xticklabels(num_particles_list)
            sns.despine(ax=ax, trim=True)

    for ax, title in zip(axss[0], [
            r'$g_{VIMCO}$', r'$g_{VIMCO}^1$', r'$g_{REINFORCE}$',
            r'$g_{REINFORCE}^1$', r'$g^2$', r'$\nabla_{\theta} \log Z_K$',
            r'$\nabla_{\phi}$ wake-$\phi$ loss', r'$\nabla_{\phi} \log Q$',
            r'$\nabla_{\phi}$ sleep loss', r'$\log \hat Z_K$'
    ]):
        ax.set_title(title)

    fig.tight_layout()
    if not os.path.exists('./plots/'):
        os.makedirs('./plots/')
    filename = './plots/variance_analysis.pdf'
    fig.savefig(filename, bbox_inches='tight')
    print('saved to {}'.format(filename))
Example #9
def main():

    # step 1: build training samples and fit the classification model
    train_data = pd.read_csv(train_data_path, sep='\001', header=None)
    train_data.columns = ['id', 'title', 'doc', 'key_words']
    train_candidates = util.load_object(train_candidate_path)
    features, labels = train_model.build_train_sample(train_data,
                                                      train_candidates)
    print(np.sum(labels))
    print(features.shape)
    dt = train_model.train_class_model(features, labels)

    # test
    test_data = pd.read_csv(test_data_path, sep='\001', header=None)
    stop_words = util.stopwordslist(stop_words_path)
    test_data.columns = ['id', 'title', 'doc']
    ids = test_data['id'].tolist()
    titles = test_data['title'].tolist()
    docs = test_data['doc'].tolist()
    test_candidates = util.load_object(test_candidate_path)
    sample_label_probs = train_model.get_test_sample_prob(
        dt, test_data, test_candidates)

    # util.save_object(sample_label_probs, './data/sample_labels_probs_add_title.pickle')
    # sample_label_probs = util.load_object('./data/sample_labels_probs_add_title.pickle')
    # sample_label_probs = util.load_object('./data/sample_title_doc_labels_probs1.pickle')
    with open('last_summit2.csv', 'w') as file:
        file.write('id,label1,label2\n')
        for (id, title, doc, words_prob) in zip(ids, titles, docs,
                                                sample_label_probs):
            # leftover debugging hooks for two specific document ids
            if id in ('D087215', 'D087268'):
                print('test......')

            title = str(title).strip()
            last_labels = extract_title_doc(id, title, stop_words, words_prob)
            labels_str = ",".join(last_labels)
            if len(last_labels) <= 1:
                labels_str += ','
            file.write(id + "," + labels_str)
            file.write("\n")
Example #10
 def __init__(self, disease_name, fhir_resource, group_name, description,
              pickle_path):
     self.disease = disease_name
     self.description = description
     self.name = group_name
     self.resource_name = fhir_resource
     self.set_resource(fhir_resource)
     self.lines = {}
     self.pickle_path = '{}{}-{}.p'.format(pickle_path, disease_name,
                                           fhir_resource)
     self.criteria = load_object(self.pickle_path, list)
     self.mappings = {}
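Here load_object takes a second argument, apparently a fallback type for when nothing has been pickled yet; a sketch of that variant under that reading:

import os
import pickle

def load_object(path, default_type):
    # the two-argument signature is inferred from the call above;
    # return an empty default_type() if the pickle does not exist yet
    if not os.path.exists(path):
        return default_type()
    with open(path, 'rb') as f:
        return pickle.load(f)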
Example #11
 def get_features(self):
     print('Get features:', self.feature_extraction_method)
     if self.feature_extraction_method == FeatureExtractionMethod.BOW:
         return get_simple_bag_of_words_features(self.train_corpus,
                                                 self.test_corpus)
     elif self.feature_extraction_method == FeatureExtractionMethod.TF_IDF:
         return get_tf_idf_features(self.train_corpus, self.test_corpus)
     elif self.feature_extraction_method == FeatureExtractionMethod.WORD2VEC:
         if self.should_load_embedding_model:
             print('Loading embedding model from disk')
             self.embedding_model = util.load_object(
                 self.WORD2VEC_MODEL_SAVE_PATH)
         else:
             print('Calculating embeddings')
             self.embedding_model = get_word2vec_trained_model(
                 self.tokenized_test, self.NUM_OF_VEC_FEATURES)
             util.save_object(
                 self.embedding_model,
                 self.CLASSIFIERS_AND_RESULTS_DIR_PATH + 'w2v_model_' +
                 str(self.classifier_iter) + '.pkl')
         return self.get_document_embeddings_from_word2vec()
     elif self.feature_extraction_method == FeatureExtractionMethod.FASTTEXT:
         if self.should_load_embedding_model:
             print('Loading embedding model from disk')
             self.embedding_model = fasttext.load_model(
                 self.FAST_TEXT_SAVE_PATH)
         else:
             print('Calculating embeddings')
             if not os.path.exists(self.TRAIN_DATA_FOR_FASTTEXT_PATH):
                 self.reformat_and_save_data_for_fasttext()
             self.embedding_model = train_fasttext_model(
                 self.TRAIN_DATA_FOR_FASTTEXT_PATH,
                 self.NUM_OF_VEC_FEATURES,
                 epoch=100)
             self.embedding_model.save_model(self.FAST_TEXT_SAVE_PATH)
         return self.get_document_embeddings_from_fasttext()
     else:
         print('No such feature extraction method:',
               self.feature_extraction_method)
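FeatureExtractionMethod is referenced but not defined in the snippet; it is presumably an Enum along these lines:

from enum import Enum, auto

class FeatureExtractionMethod(Enum):
    BOW = auto()
    TF_IDF = auto()
    WORD2VEC = auto()
    FASTTEXT = auto()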
Example #12
def train(conf):
    train_dir = conf.get("TRAIN", "train_dir")

    model_path = conf.get("NORMAL", "model_path")
    report_dir = conf.get("TRAIN", "report_dir")
    N = conf.getint("TRAIN", "valdata_num")

    feat = load_object(conf.get("NORMAL", 'feat'))

    #crf = sklearn_crfsuite.CRF(
    crf = CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )

    X, y = zip(*load_corpus(train_dir))

    X_train = feat(X[:-N])
    y_train = y[:-N]

    X_validate = feat(X[-N:])
    y_validate = y[-N:]

    crf.fit(X_train, y_train)

    numpy_pickle.dump(crf, model_path)

    # evaluate on the held-out validation split
    y_pred = crf.predict(X_validate)

    labels = list(crf.classes_)
    labels.remove("O")

    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
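The snippet ends before the report is printed; with sklearn_crfsuite the usual next step (an assumption, following the library's standard recipe) would be:

    from sklearn_crfsuite import metrics

    # per-label precision/recall/F1 on the validation split
    print(metrics.flat_classification_report(
        y_validate, y_pred, labels=sorted_labels, digits=3))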
Example #13
def train_random_forest(training_data_dump):
    training_data = util.load_object(training_data_dump)
    model = RandomForest(training_data, 10, stats.median, 30)
    return model
Example #14
tagger = BioTagger()


def read_data(data_dir):
    for fname in os.listdir(data_dir):
        test_path = os.path.join(data_dir, fname)
        with open(test_path) as f:
            text = f.read()
        yield fname, text


if __name__ == "__main__":
    config = get_config()

    feat = load_object(config.get("NORMAL", "feat"))

    test_dir = "data/test"
    result_dir = "data/submit"
    
    crf = numpy_pickle.load('data/models/crf.m')

    for fname, text in read_data(test_dir):
        print(fname)
        sents = [text]
        y = crf.predict(feat(sents))
        anns = tagger.seq_to_ind(y[0])
        anns = sorted(anns, key=lambda x: (x[1], x[2]))
        ann_fname = fname.replace(".txt", ".ann")
        save_path = os.path.join(result_dir, ann_fname)
        with open(save_path, 'w') as f:
            # the snippet is truncated here; presumably the annotations are
            # written out, e.g. one line per (label, start, end) triple
            for idx, (label, start, end) in enumerate(anns, 1):
                f.write('T{}\t{} {} {}\n'.format(idx, label, start, end))
Example #15
    train_nlp = [tn.spacy_english_model(item) for item in train_corpus]
    util.save_object(
        train_nlp, CLASSIFIERS_AND_RESULTS_DIR_PATH + 'train_nlp_' +
        str(CLASSIFIER_ITERATION) + '.pkl')
    train_glove_features = np.array([item.vector for item in train_nlp])
    print('Test features')
    test_nlp = [tn.spacy_english_model(item) for item in test_corpus]
    util.save_object(
        test_nlp, CLASSIFIERS_AND_RESULTS_DIR_PATH + 'test_nlp_' +
        str(CLASSIFIER_ITERATION) + '.pkl')
    test_glove_features = np.array([item.vector for item in test_nlp])
    return train_glove_features, test_glove_features


# train_glove_features, test_glove_features = read_from_spacy_and_save()
train_glove_features = util.load_object(TRAIN_NLP_PATH)
test_glove_features = util.load_object(TEST_NLP_PATH)
print('GloVe model:> Train features shape:', train_glove_features.shape,
      ' Test features shape:', test_glove_features.shape)

###


def train_sgd():
    svm = SGDClassifier(loss='hinge',
                        penalty='l2',
                        random_state=42,
                        max_iter=500)
    svm.fit(train_glove_features, train_label_names)
    svm_glove_cv_scores = cross_val_score(svm,
                                          train_glove_features,
Example #16
def train_svr(training_data_dump):
    training_data = util.load_object(training_data_dump)
    clf = svm.SVR(kernel='rbf')
    clf.fit(training_data[:, :-1], training_data[:, -1])
    return clf
Example #17
def predict_svr(clf, test_data_dump):
    test_data = util.load_object(test_data_dump)
    targets = test_data[:, -1]
    predictions = clf.predict(test_data[:, :-1])
    return get_average_kappa(targets, predictions)
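Putting the two SVR helpers together, assuming pickled arrays whose last column is the target (the dump paths are placeholders):

clf = train_svr('train_data.pkl')
kappa = predict_svr(clf, 'test_data.pkl')
print('average kappa: {:.3f}'.format(kappa))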
Example #18
    test_size=0.33,
    random_state=42)
# tokenize corpus
tokenized_train = [tn.tokenizer.tokenize(text) for text in train_corpus]
tokenized_test = [tn.tokenizer.tokenize(text) for text in test_corpus]
# generate word2vec word embeddings

# # build and save word2vec model
w2v_num_features = 1000
# w2v_model = gensim.models.Word2Vec(sentences=tokenized_train, size=w2v_num_features,
#                                    window=100, min_count=2, sample=1e-3, sg=1,
#                                    iter=5, workers=10)
# util.save_object(w2v_model, CLASSIFIERS_AND_RESULTS_DIR_PATH + 'w2v_model' + str(
#     CLASSIFIER_ITERATION) + '.pkl')
# # Load word2vec model
w2v_model = util.load_object(WORD2VEC_MODEL_SAVE_PATH)

# generate document level embeddings
# remember we only use train dataset vocabulary embeddings
# so that test dataset truly remains an unseen dataset
# generate averaged word vector features from word2vec model
avg_wv_train_features = document_vectorize(corpus=tokenized_train,
                                           model=w2v_model,
                                           num_features=w2v_num_features)
avg_wv_test_features = document_vectorize(corpus=tokenized_test,
                                          model=w2v_model,
                                          num_features=w2v_num_features)
print('Word2Vec model:> Train features shape:', avg_wv_train_features.shape,
      ' Test features shape:', avg_wv_test_features.shape)

# # pack data in one class
Example #19
def plot_model_movie():
    num_test_x = 5
    num_particles_list = [2, 5, 10, 20]
    seed = seed_list[0]
    model_folder = util.get_most_recent_model_folder_args_match(
        seed=seed_list[0],
        train_mode=train_mode_list[0],
        num_particles=num_particles_list[0],
        init_near=init_near)
    args = util.load_object(util.get_args_path(model_folder))
    _, _, true_generative_model = util.init_models(args)
    test_xs = np.linspace(0, 19, num=num_test_x) * 10

    nrows = len(num_particles_list)
    ncols = num_test_x + 1
    width = 5.5
    ax_width = width / ncols
    height = nrows * ax_width
    fig, axss = plt.subplots(nrows, ncols, sharex=True, sharey=True, dpi=300)
    fig.set_size_inches(width, height)

    for num_particles_idx, num_particles in enumerate(num_particles_list):
        axss[num_particles_idx, 0].set_ylabel('$K = {}$'.format(num_particles),
                                              fontsize=SMALL_SIZE)

    handles = [mpatches.Rectangle((0, 0), 1, 1, color='black', label='True')]
    for color, label in zip(colors, labels):
        handles.append(
            mpatches.Rectangle((0, 0), 1, 1, color=color, label=label))
    axss[-1, ncols // 2].legend(bbox_to_anchor=(0, -0.05),
                                loc='upper center',
                                ncol=len(handles),
                                handles=handles)

    axss[0, 0].set_title(r'$p_\theta(z)$')
    for test_x_idx, test_x in enumerate(test_xs):
        axss[0, 1 + test_x_idx].set_title(
            r'$q_\phi(z | x = {0:.0f})$'.format(test_x))
    for ax in axss[-1]:
        ax.set_xlabel(r'$z$', labelpad=0.5)

    for axs in axss:
        for ax in axs:
            ax.set_xticks([])
            ax.set_xticklabels([])
            ax.set_yticks([])
            ax.set_yticklabels([])
            ax.set_ylim(0, 8)
            ax.set_xlim(0, 20)
    # title = fig.suptitle('Iteration 0')
    t = axss[0, ncols // 2].text(0,
                                 1.23,
                                 'Iteration 0',
                                 horizontalalignment='center',
                                 verticalalignment='center',
                                 transform=axss[0, ncols // 2].transAxes,
                                 fontsize=MEDIUM_SIZE)

    fig.tight_layout(pad=0, rect=[0.01, 0.04, 0.99, 0.96])

    def update(frame):
        result = []
        iteration_idx = frame
        iteration = iteration_idx * 1000
        t.set_text('Iteration {}'.format(iteration))
        result.append(t)

        for axs in axss:
            for ax in axs:
                result.append(
                    ax.add_artist(
                        mpatches.Rectangle((0, 0), 20, 8, color='white')))
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            ax = axss[num_particles_idx, 0]

            # true generative model
            i = 0
            plot_hinton(ax,
                        true_generative_model.get_latent_params().data.numpy(),
                        8 - i,
                        8 - i - 1,
                        0,
                        20,
                        color='black')

            # learned generative models
            for train_mode_idx, train_mode in enumerate(train_mode_list):
                label = labels[train_mode_idx]
                color = colors[train_mode_idx]
                model_folder = util.get_most_recent_model_folder_args_match(
                    seed=seed,
                    train_mode=train_mode,
                    num_particles=num_particles,
                    init_near=init_near)
                if model_folder is not None:
                    generative_model, _ = util.load_models(model_folder,
                                                           iteration=iteration)
                    if generative_model is not None:
                        plot_hinton(
                            ax,
                            generative_model.get_latent_params().data.numpy(),
                            8 - train_mode_idx - 1,
                            8 - train_mode_idx - 2,
                            0,
                            20,
                            label=label,
                            color=color)

            result += ax.artists

            # inference network
            for test_x_idx, test_x in enumerate(test_xs):
                ax = axss[num_particles_idx, test_x_idx + 1]
                test_x_tensor = torch.tensor(test_x,
                                             dtype=torch.float,
                                             device=args.device).unsqueeze(0)

                # true
                plot_hinton(ax,
                            true_generative_model.get_posterior_probs(
                                test_x_tensor)[0].data.numpy(),
                            8 - i,
                            8 - i - 1,
                            0,
                            20,
                            color='black')

                # learned
                for train_mode_idx, train_mode in enumerate(train_mode_list):
                    label = labels[train_mode_idx]
                    color = colors[train_mode_idx]
                    model_folder = \
                        util.get_most_recent_model_folder_args_match(
                            seed=seed, train_mode=train_mode,
                            num_particles=num_particles, init_near=init_near)
                    if model_folder is not None:
                        _, inference_network = util.load_models(
                            model_folder, iteration=iteration)
                        if inference_network is not None:
                            plot_hinton(ax,
                                        inference_network.get_latent_params(
                                            test_x_tensor)[0].data.numpy(),
                                        8 - train_mode_idx - 1,
                                        8 - train_mode_idx - 2,
                                        0,
                                        20,
                                        label=label,
                                        color=color)
                result += ax.artists
        return result

    anim = FuncAnimation(fig, update, frames=np.arange(100), blit=True)
    if not os.path.exists('./plots/'):
        os.makedirs('./plots/')
    filename = './plots/model_movie.mp4'
    anim.save(filename, dpi=300)
    print('Saved to {}'.format(filename))
Example #20
def plot_models():
    saving_iterations = np.arange(100) * 1000
    num_iterations_to_plot = 3
    iterations_to_plot = saving_iterations[np.floor(
        np.linspace(0, 99, num=num_iterations_to_plot)).astype(int)]
    num_test_x = 3
    num_particles_list = [2, 20]
    seed = seed_list[0]
    model_folder = util.get_most_recent_model_folder_args_match(
        seed=seed_list[0],
        train_mode=train_mode_list[0],
        num_particles=num_particles_list[0],
        init_near=init_near)
    args = util.load_object(util.get_args_path(model_folder))
    _, _, true_generative_model = util.init_models(args)
    test_xs = np.linspace(0, 19, num=num_test_x) * 10

    nrows = num_iterations_to_plot
    ncols = len(num_particles_list) * (num_test_x + 1)
    fig, axss = plt.subplots(nrows, ncols, sharex=True, sharey=True)
    width = 5.5
    ax_width = width / ncols
    height = nrows * ax_width
    fig.set_size_inches(width, height)
    for iteration_idx, iteration in enumerate(iterations_to_plot):
        axss[iteration_idx, 0].set_ylabel('Iter. {}'.format(iteration))
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            ax = axss[iteration_idx, num_particles_idx * (num_test_x + 1)]
            ax.set_xticks([])
            ax.set_xticklabels([])
            ax.set_yticks([])
            ax.set_yticklabels([])
            ax.set_ylim(0, 8)
            ax.set_xlim(0, 20)
            if iteration_idx == 0:
                ax.set_title(r'$p_\theta(z)$')

            # true generative model
            i = 0
            plot_hinton(ax,
                        true_generative_model.get_latent_params().data.numpy(),
                        8 - i,
                        8 - i - 1,
                        0,
                        20,
                        color='black')

            # learned generative models
            for train_mode_idx, train_mode in enumerate(train_mode_list):
                label = labels[train_mode_idx]
                color = colors[train_mode_idx]
                model_folder = util.get_most_recent_model_folder_args_match(
                    seed=seed,
                    train_mode=train_mode,
                    num_particles=num_particles,
                    init_near=init_near)
                if model_folder is not None:
                    generative_model, _ = util.load_models(model_folder,
                                                           iteration=iteration)
                    if generative_model is not None:
                        plot_hinton(
                            ax,
                            generative_model.get_latent_params().data.numpy(),
                            8 - train_mode_idx - 1,
                            8 - train_mode_idx - 2,
                            0,
                            20,
                            label=label,
                            color=color)

            # inference network
            for test_x_idx, test_x in enumerate(test_xs):
                ax = axss[iteration_idx, num_particles_idx * (num_test_x + 1) +
                          test_x_idx + 1]
                ax.set_xticks([])
                ax.set_xticklabels([])
                ax.set_yticks([])
                ax.set_yticklabels([])
                ax.set_ylim(0, 8)
                ax.set_xlim(0, 20)
                test_x_tensor = torch.tensor(test_x,
                                             dtype=torch.float,
                                             device=args.device).unsqueeze(0)
                if iteration_idx == 0:
                    ax.set_title(r'$q_\phi(z | x = {0:.0f})$'.format(test_x))

                # true
                plot_hinton(ax,
                            true_generative_model.get_posterior_probs(
                                test_x_tensor)[0].data.numpy(),
                            8 - i,
                            8 - i - 1,
                            0,
                            20,
                            color='black')

                # learned
                for train_mode_idx, train_mode in enumerate(train_mode_list):
                    label = labels[train_mode_idx]
                    color = colors[train_mode_idx]
                    model_folder = \
                        util.get_most_recent_model_folder_args_match(
                            seed=seed, train_mode=train_mode,
                            num_particles=num_particles, init_near=init_near)
                    if model_folder is not None:
                        _, inference_network = util.load_models(
                            model_folder, iteration=iteration)
                        if inference_network is not None:
                            plot_hinton(ax,
                                        inference_network.get_latent_params(
                                            test_x_tensor)[0].data.numpy(),
                                        8 - train_mode_idx - 1,
                                        8 - train_mode_idx - 2,
                                        0,
                                        20,
                                        label=label,
                                        color=color)

    for num_particles_idx, num_particles in enumerate(num_particles_list):
        ax = axss[0,
                  num_particles_idx * (num_test_x + 1) + (num_test_x + 1) // 2]
        ax.text(0,
                1.25,
                '$K = {}$'.format(num_particles),
                fontsize=SMALL_SIZE,
                verticalalignment='bottom',
                horizontalalignment='center',
                transform=ax.transAxes)

    handles = [mpatches.Rectangle((0, 0), 1, 1, color='black', label='True')]
    for color, label in zip(colors, labels):
        handles.append(
            mpatches.Rectangle((0, 0), 1, 1, color=color, label=label))
    axss[-1, ncols // 2].legend(bbox_to_anchor=(0, -0.1),
                                loc='upper center',
                                ncol=len(handles),
                                handles=handles)

    for ax in axss[-1]:
        ax.set_xlabel(r'$z$', labelpad=0.5)

    fig.tight_layout(pad=0)
    if not os.path.exists('./plots/'):
        os.makedirs('./plots/')
    filename = './plots/models.pdf'
    fig.savefig(filename, bbox_inches='tight')
    print('Saved to {}'.format(filename))
Example #21
 def load_fileds(self, path):
     self.CHAR, self.WORD = util.load_object(path, "pkl")
Example #22
def train_sklearn_random_forest(training_data_dump):
    training_data = util.load_object(training_data_dump)
    model = RandomForestRegressor(n_estimators=10)
    model = model.fit(training_data[:, :-1], training_data[:, -1])
    return model
Example #23
def bulid_candidate_words(data, stop_nfile, candidate_save_path, candidata_pos={}, first_sentence_count=30, last_sentence_count=20):
    # columns: id, title, document text
    stop_words = util.stopwordslist(stop_nfile)
    # load corpus and model
    corpus_dict = util.load_object(corpora_dict_path)
    corpus = corpora.MmCorpus(corpus_path)
    tfidf_model = models.TfidfModel.load(tfidf_path)
    lda_model = models.LdaModel.load(lda_path)
    lsi_model = models.LsiModel.load(lsi_path)

    candidate_words = []
    for index, row in data.iterrows():
        title = str(row['title']).strip()
        doc = str(row['doc']).strip()
        # candidate words for this row: key = word, value = the word's
        # feature vector (the 10 selected features)
        candidate_word = {}
        # doc
        words_doc = list(pseg.cut(doc, HMM=True))  # [(word, pos)]
        # title
        words_title = list(pseg.cut(title, HMM=True))

        # remove stop words
        words_doc = [(word, pos) for word, pos in words_doc
                     if word not in stop_words]
        words_title = [(word, pos) for word, pos in words_title
                       if word not in stop_words]

        doc_len = len(words_doc)  # doc length after stop-word removal
        title_len = len(words_title)
        for word_index, (word, pos) in enumerate(words_doc):
            if pos in candidata_pos and len(word) > 1:
                # the last three slots: features[-3] doc length, features[-2]
                # first-occurrence position, features[-1] last-occurrence position
                if word in candidate_word:
                    word_features = candidate_word[word]
                    word_features[-1] = (word_index+1)
                    candidate_word[word] = word_features
                    continue
                else:
                    features = [0] * 14
                    features[-3] = doc_len
                    # feature 1: part of speech
                    features[0] = candidata_pos[pos]
                    # feature 2: relative position of the first occurrence
                    if doc_len == 0:
                        firoc = 0.
                    else:
                        firoc = (word_index+1)/float(doc_len)
                    features[1] = firoc
                    features[-2] = (word_index+1)  # first-occurrence position
                    # feature 3: length of the candidate word
                    features[2] = len(word)
                    # feature 4: whether the word is all digits/letters
                    if util.is_contain_char_num(word):
                        features[3] = 1
                    # feature 5: the word's TF-IDF weight
                    id = corpus_dict.token2id.get(word, len(corpus_dict.token2id)+1)
                    if id == len(corpus_dict.token2id)+1:
                        features[4] = 1e-8
                    else:
                        for (w_id, tfidf) in tfidf_model[corpus[index]]:
                            if id == w_id:
                                features[4] = tfidf
                                break
                    # feature 6: occurrence count in the first sentence
                    first_sentence = words_doc[:first_sentence_count]
                    features[5] = util.get_count_sentence(word, first_sentence)
                    # feature 7: occurrence count in the last sentence ([-20:])
                    last_sentence = words_doc[-last_sentence_count:]
                    features[6] = util.get_count_sentence(word, last_sentence)
                    # features 8, 9: LDA/LSI similarity between the word's
                    # topic distribution and the document's
                    single_list = [word]
                    word_corpus = tfidf_model[corpus_dict.doc2bow(single_list)]
                    features[7] = get_topic_sim(lda_model, word_corpus, corpus[index])
                    features[8] = get_topic_sim(lsi_model, word_corpus, corpus[index])
                    # feature 11: word span, computed later from the first/last
                    # occurrence positions and the doc length
                    candidate_word[word] = features

        for word_index, (word, pos) in enumerate(words_title):
            if pos in candidata_pos and len(word) > 1:
                if word in candidate_word:
                    word_features = candidate_word[word]
                    # feature 10: the word appears in the title
                    word_features[9] = 1
                    candidate_word[word] = word_features
                else:
                    features = [0] * 14
                    features[-3] = title_len
                    # feature 1: part of speech
                    features[0] = candidata_pos[pos]
                    # feature 2: relative position of the first occurrence
                    if title_len == 0:
                        firoc = 0.
                    else:
                        firoc = (word_index + 1) / float(title_len)
                    features[1] = firoc
                    features[-2] = (word_index + 1)  # first-occurrence position
                    # feature 3: length of the candidate word
                    features[2] = len(word)
                    # feature 4: whether the word is all digits/letters
                    if util.is_contain_char_num(word):
                        features[3] = 1
                    # feature 5: the word's TF-IDF weight
                    id = corpus_dict.token2id.get(word, len(corpus_dict.token2id) + 1)
                    if id == len(corpus_dict.token2id) + 1:
                        features[4] = 1e-8
                    else:
                        for (w_id, tfidf) in tfidf_model[corpus[index]]:
                            if id == w_id:
                                features[4] = tfidf
                                break
                    # feature 6: occurrence count in the first sentence
                    # (counted in the doc, mirroring the loop above)
                    first_sentence = words_doc[:first_sentence_count]
                    features[5] = util.get_count_sentence(word, first_sentence)
                    # feature 7: occurrence count in the last sentence ([-20:])
                    last_sentence = words_doc[-last_sentence_count:]
                    features[6] = util.get_count_sentence(word, last_sentence)
                    # features 8, 9: LDA/LSI similarity between the word's
                    # topic distribution and the document's
                    single_list = [word]
                    word_corpus = tfidf_model[corpus_dict.doc2bow(single_list)]
                    features[7] = get_topic_sim(lda_model, word_corpus, corpus[index])
                    features[8] = get_topic_sim(lsi_model, word_corpus, corpus[index])
                    # feature 10: the word appears in the title
                    features[9] = 1
                    # feature 11: word span, computed later from the first/last
                    # occurrence positions and the doc length
                    candidate_word[word] = features
        candidate_words.append(candidate_word)
        # progress logging
        if index % 2000 == 0:
            print('processed row %d' % index)

    # data['candidate_words'] = candidate_words
    # data.to_csv(data_candidate_path, sep='\001', header=None, index=None)
    util.save_object(candidate_words, candidate_save_path)
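A sketch of how the saved candidate dicts might later become a feature matrix, including the span feature the comments defer (the slot layout follows the comments above; everything else is an assumption):

import numpy as np

def candidates_to_matrix(candidate_words):
    rows = []
    for candidate_word in candidate_words:
        for word, features in candidate_word.items():
            doc_len, first_pos, last_pos = features[-3], features[-2], features[-1]
            # feature 11: normalized span between first and last occurrence
            if doc_len > 0 and last_pos:
                features[10] = (last_pos - first_pos) / float(doc_len)
            rows.append(features[:11])
    return np.array(rows)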
Example #24
from topic_classification.feature_extraction_utils import \
    document_vectorize, document_vectorize_with_fasttext_model
from topic_classification.dataset_utils import load_20newsgroups

import util
import topic_classification.experiment_config as experiment_config

from topic_classification.constants import TOPIC_CLASSIFICATION_DATA_PATH
import tensorflow_hub as hub
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:  # set_memory_growth raises if no GPU is present
    tf.config.experimental.set_memory_growth(gpus[0], True)

data_df = load_20newsgroups()
data_string = util.load_object(TOPIC_CLASSIFICATION_DATA_PATH +
                               '20_newsgroups_one_string.txt')
data_word_list = data_string.split(' ')
vocabulary = set(data_word_list)

train_corpus, test_corpus, train_label_names, \
test_label_names = train_test_split(np.array(data_df['Clean Article']),
                                    np.array(data_df['Target Name']),
                                    test_size=0.33, random_state=42)
# tokenize corpus
tokenized_train = [tn.tokenizer.tokenize(text) for text in train_corpus]
tokenized_test = [tn.tokenizer.tokenize(text) for text in test_corpus]

# # # Feature extraction
# elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)
elmo = hub.KerasLayer("https://hub.tensorflow.google.cn/google/elmo/3",
                      trainable=True,
Example #25
 def load_from_file(self, filename):
     self.tt = util.load_object(filename)
     assert isinstance(self.tt, Trie)
Example #26
def train_and_save(classifier_list, classifier_name_list, training_data):
    results = train_multiple_classifiers(classifier_list, classifier_name_list,
                                         training_data)
    util.save_object(results, RESULTS_PATH)
    util.save_classifier_list(classifier_list, classifier_name_list,
                              CLASSIFIERS_AND_RESULTS_DIR_PATH)
    return results


# Train and save on disk
# results = train_and_save(classifier_list, classifier_name_list, training_data)
# #Load from disk
classifier_list = util.load_classifier_list(classifier_name_list,
                                            CLASSIFIERS_AND_RESULTS_DIR_PATH)
results = util.load_object(RESULTS_PATH)

# results[0] = array of crossvalidation, [1] crossvalidation scores,
# [2] test score, [3] times
# # Plotting
cv_mean_scores = [round(result[1], SCORE_DECIMAL_PLACES) for result in results]
test_scores = [round(result[2], SCORE_DECIMAL_PLACES) for result in results]
elapsed_times = [round(result[3], TIME_DECIMAL_PLACES) for result in results]
# create_bar_plot(classifier_name_shortcut_list, 'Classifier scores', 'Accuracy',
#                 cv_mean_scores, y_range_tuple=(0, 1))
create_2_bar_plot(classifier_name_shortcut_list,
                  'Classifier scores',
                  'Accuracy',
                  cv_mean_scores,
                  test_scores,
                  'cv means',
Example #27
File: plot.py Project: yyht/rrws
def main(args):
    if args.mode == 'efficiency':
        num_runs = 10
        num_particles_list = [2, 5, 10, 50, 100, 500, 1000, 5000]
        num_partitions_list = [2, 5, 10, 50, 100, 500, 1000]
        path = './save/efficiency.pkl'
        (memory_thermo, time_thermo, memory_vimco, time_vimco,
         memory_reinforce, time_reinforce) = util.load_object(path)

        fig, axs = plt.subplots(1, 2, dpi=200, figsize=(6, 4))
        # colors = ['C0', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8']
        norm = matplotlib.colors.Normalize(vmin=0,
                                           vmax=len(num_particles_list))
        cmap = matplotlib.cm.ScalarMappable(norm=norm,
                                            cmap=matplotlib.cm.Blues)
        cmap.set_array([])
        colors = [cmap.to_rgba(i + 1) for i in range(len(num_particles_list))]

        for i, num_partitions in enumerate(num_partitions_list):
            axs[0].plot(num_particles_list,
                        np.mean(time_thermo[:, i], axis=-1),
                        label='thermo K={}'.format(num_partitions),
                        color=colors[i],
                        marker='x',
                        linestyle='none')
        axs[0].plot(num_particles_list,
                    np.mean(time_vimco, axis=-1),
                    color='black',
                    label='vimco',
                    marker='o',
                    linestyle='none',
                    fillstyle='none')
        axs[0].plot(num_particles_list,
                    np.mean(time_reinforce, axis=-1),
                    color='black',
                    label='reinforce',
                    marker='v',
                    linestyle='none',
                    fillstyle='none')

        axs[0].set_xscale('log')
        axs[0].set_yscale('log')
        axs[0].set_xlabel('number of particles')
        axs[0].set_ylabel('time (seconds)')
        axs[0].grid(True)
        axs[0].grid(True, which='minor', linewidth=0.2)
        # axs[0].legend(bbox_to_anchor=(1.13, -0.19), loc='upper center', ncol=3)
        sns.despine(ax=axs[0])

        # colors = ['C0', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8']
        for i, num_partitions in enumerate(num_partitions_list):
            axs[1].plot(num_particles_list,
                        np.mean(memory_thermo[:, i] / 1e6, axis=-1),
                        label='thermo K={}'.format(num_partitions),
                        color=colors[i],
                        marker='x',
                        linestyle='none')
        axs[1].plot(num_particles_list,
                    np.mean(memory_vimco / 1e6, axis=-1),
                    color='black',
                    label='vimco',
                    marker='o',
                    linestyle='none',
                    fillstyle='none')
        axs[1].plot(num_particles_list,
                    np.mean(memory_reinforce / 1e6, axis=-1),
                    color='black',
                    label='reinforce',
                    marker='v',
                    linestyle='none',
                    fillstyle='none')

        axs[1].set_xscale('log')
        axs[1].set_yscale('log')
        axs[1].set_xlabel('number of particles')
        axs[1].set_ylabel('memory (MB)')
        axs[-1].legend(fontsize=6, ncol=2)
        axs[1].grid(True)
        axs[1].grid(True, which='minor', linewidth=0.2)
        sns.despine(ax=axs[1])

        fig.tight_layout()
        if not os.path.exists('./plots/'):
            os.makedirs('./plots/')
        filename = './plots/efficiency.pdf'
        fig.savefig(filename, bbox_inches='tight')
        print('saved to {}'.format(filename))
    elif args.mode == 'insights':
        markersize = 3
        learning_rate = 3e-4
        architecture = 'linear_3'
        seed = 8
        train_mode = 'thermo'
        num_particles_list = [2, 5, 10, 50]
        num_partitions_list = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        log_beta_mins_1 = [-10, -1, -0.045757490560675115]
        log_beta_mins_2 = [
            -5, -2, -1.6989700043360187, -1.5228787452803376,
            -1.3979400086720375, -1.3010299956639813, -1.2218487496163564,
            -1.1549019599857433, -1.0969100130080565, -1.0457574905606752, -1,
            -0.6989700043360187, -0.5228787452803375, -0.3979400086720376,
            -0.3010299956639812, -0.2218487496163564, -0.15490195998574313,
            -0.09691001300805639, -0.045757490560675115
        ]
        num_iterations = 400

        log_p_thermo_partition_sweep = np.full(
            (len(num_particles_list), len(log_beta_mins_1),
             len(num_partitions_list), num_iterations), np.nan)
        log_p_thermo_beta_sweep = np.full(
            (len(num_particles_list), len(log_beta_mins_2), num_iterations),
            np.nan)

        for num_particles_idx, num_particles in enumerate(num_particles_list):
            for log_beta_min_idx, log_beta_min in enumerate(log_beta_mins_1):
                for num_partitions_idx, num_partitions in enumerate(
                        num_partitions_list):
                    dir_ = util.get_most_recent_dir_args_match(
                        train_mode=train_mode,
                        architecture=architecture,
                        learning_rate=learning_rate,
                        num_particles=num_particles,
                        num_partitions=num_partitions,
                        log_beta_min=log_beta_min,
                        seed=seed)
                    if dir_ is not None:
                        stats = util.load_object(util.get_stats_path(dir_))
                        log_p_thermo_partition_sweep[
                            num_particles_idx, log_beta_min_idx,
                            num_partitions_idx] = \
                            stats.log_p_history[:num_iterations]

                        print('thermo {} ({} partitions) beta_min = 1e{} after'
                              ' {} it: {}'.format(num_particles,
                                                  num_partitions, log_beta_min,
                                                  len(stats.log_p_history),
                                                  stats.log_p_history[-1]))
                    else:
                        print('missing')

            num_partitions = 2
            for log_beta_min_idx, log_beta_min in enumerate(log_beta_mins_2):
                dir_ = util.get_most_recent_dir_args_match(
                    train_mode=train_mode,
                    architecture=architecture,
                    learning_rate=learning_rate,
                    num_particles=num_particles,
                    num_partitions=num_partitions,
                    log_beta_min=log_beta_min,
                    seed=seed)
                if dir_ is not None:
                    stats = util.load_object(util.get_stats_path(dir_))
                    log_p_thermo_beta_sweep[
                        num_particles_idx, log_beta_min_idx] = \
                        stats.log_p_history[:num_iterations]
                    print('thermo {} ({} partitions) beta_min = 1e{} after {}'
                          ' it: {}'.format(num_particles, num_partitions,
                                           log_beta_min,
                                           len(stats.log_p_history),
                                           stats.log_p_history[-1]))
                else:
                    print('missing')

        fig, axs = plt.subplots(2, 2, dpi=200, figsize=(12, 7), sharey=True)

        for log_beta_min_idx, ax in zip(range(len(log_beta_mins_1)),
                                        [axs[0, 0], axs[0, 1], axs[1, 0]]):
            colors = ['C1', 'C2', 'C4', 'C5']
            # ax = axs[log_beta_min_idx]
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                ax.plot(num_partitions_list,
                        log_p_thermo_partition_sweep[num_particles_idx,
                                                     log_beta_min_idx, :, -1],
                        color=colors[num_particles_idx],
                        label=num_particles,
                        marker='o',
                        markersize=markersize,
                        linestyle='solid',
                        linewidth=0.7)
            ax.set_title(r'$\beta_1 = {:.0e}$'.format(
                10**log_beta_mins_1[log_beta_min_idx]))
            # ax.set_xticks(np.arange(len(num_partitions_list)))
            # ax.set_xticklabels(num_partitions_list)
            ax.set_xlabel('number of partitions')
            ax.set_xticks(np.arange(0, max(num_partitions_list) + 1, 10))

        ax = axs[1, 1]
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            ax.plot(10**np.array(log_beta_mins_2),
                    log_p_thermo_beta_sweep[num_particles_idx, :, -1],
                    color=colors[num_particles_idx],
                    label=num_particles,
                    marker='o',
                    markersize=markersize,
                    linestyle='solid',
                    linewidth=0.7)
        ax.set_xticks(np.arange(0, 1.1, 0.2))
        ax.set_title('2 partitions')
        ax.set_xlabel(r'$\beta_1$')

        print(np.max(log_p_thermo_beta_sweep[..., -1], axis=-1))
        print(np.argmax(log_p_thermo_beta_sweep[..., -1], axis=-1))
        print([
            log_beta_mins_2[i]
            for i in np.argmax(log_p_thermo_beta_sweep[..., -1], axis=-1)
        ])
        print([
            10**log_beta_mins_2[i]
            for i in np.argmax(log_p_thermo_beta_sweep[..., -1], axis=-1)
        ])
        # print(log_beta_mins_2[np.argmax(log_p_thermo_beta_sweep[..., -1], axis=-1)])

        for axx in axs:
            for ax in axx:
                ax.grid(True, axis='y')

        for ax in axs[:, 0]:
            ax.set_ylim(top=-88)
            ax.set_ylabel(r'$\log p(x)$')

        axs[1, 1].legend(title='number of particles',
                         ncol=2,
                         loc='lower right')

        for axx in axs:
            for ax in axx:
                sns.despine(ax=ax, trim=True)

        fig.tight_layout()
        if not os.path.exists('./plots/'):
            os.makedirs('./plots/')
        filename = './plots/insights.pdf'
        fig.savefig(filename, bbox_inches='tight')
        print('saved to {}'.format(filename))
    elif args.mode == 'baselines':
        learning_rate = 3e-4
        architecture = 'linear_3'
        seed = 8
        non_thermo_train_modes = ['ww', 'vimco']
        num_particles_list = [2, 5, 10, 50]
        num_partitions_list = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        # log_beta_mins_1 = [-10, -1, -0.045757490560675115]
        log_beta_mins_2 = [
            -5, -2, -1.6989700043360187, -1.5228787452803376,
            -1.3979400086720375, -1.3010299956639813, -1.2218487496163564,
            -1.1549019599857433, -1.0969100130080565, -1.0457574905606752, -1,
            -0.6989700043360187, -0.5228787452803375, -0.3979400086720376,
            -0.3010299956639812, -0.2218487496163564, -0.15490195998574313,
            -0.09691001300805639, -0.045757490560675115
        ]
        num_iterations = 400
        log_p_thermo_beta_sweep = np.full(
            (len(num_particles_list), len(log_beta_mins_2), num_iterations),
            np.nan)

        log_p_non_thermo = np.full((len(non_thermo_train_modes),
                                    len(num_particles_list), num_iterations),
                                   np.nan)

        train_mode = 'thermo'
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            num_partitions = 2
            for log_beta_min_idx, log_beta_min in enumerate(log_beta_mins_2):
                dir_ = util.get_most_recent_dir_args_match(
                    train_mode=train_mode,
                    architecture=architecture,
                    learning_rate=learning_rate,
                    num_particles=num_particles,
                    num_partitions=num_partitions,
                    log_beta_min=log_beta_min,
                    seed=seed)
                if dir_ is not None:
                    stats = util.load_object(util.get_stats_path(dir_))
                    log_p_thermo_beta_sweep[
                        num_particles_idx, log_beta_min_idx] = \
                        stats.log_p_history[:num_iterations]
                    print('thermo {} ({} partitions) beta_min = 1e{} after {}'
                          ' it: {}'.format(num_particles, num_partitions,
                                           log_beta_min,
                                           len(stats.log_p_history),
                                           stats.log_p_history[-1]))
                else:
                    print('missing')

        seed = 7
        log_beta_min = -10
        learning_rate = 3e-4
        num_partitions = 1
        for train_mode_idx, train_mode in enumerate(non_thermo_train_modes):
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                dir_ = util.get_most_recent_dir_args_match(
                    train_mode=train_mode,
                    architecture=architecture,
                    learning_rate=learning_rate,
                    num_particles=num_particles,
                    num_partitions=num_partitions,
                    log_beta_min=log_beta_min,
                    seed=seed)
                if dir_ is not None:
                    stats = util.load_object(util.get_stats_path(dir_))
                    log_p_non_thermo[train_mode_idx, num_particles_idx, :len(
                        stats.log_p_history)] = stats.log_p_history

                    print('{} {} after {} it: {}'.format(
                        train_mode, num_particles, len(stats.log_p_history),
                        stats.log_p_history[-1]))
                else:
                    print('missing')

        fig, ax = plt.subplots(1, 1, dpi=200, figsize=(6, 4))

        colors = ['C1', 'C2', 'C4', 'C5']
        linestyles = ['dashed', 'dotted']
        for train_mode_idx, train_mode in enumerate(non_thermo_train_modes):
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                if train_mode == 'ww':
                    label = 'rws'
                else:
                    label = train_mode
                ax.plot(log_p_non_thermo[train_mode_idx, num_particles_idx],
                        linestyle=linestyles[train_mode_idx],
                        color=colors[num_particles_idx],
                        label='{} {} ({:.2f})'.format(
                            label, num_particles,
                            log_p_non_thermo[train_mode_idx, num_particles_idx,
                                             -1]))

        # best_num_particles_idx = 3
        # best_beta_idxs = [4, 5, 11]
        # best_beta_idxs = [0, 4, 7, 11]
        best_beta_idxs = [18, 5, 11, 12]
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            best_beta_idx = best_beta_idxs[num_particles_idx]
            color = colors[num_particles_idx]
            ax.plot(
                log_p_thermo_beta_sweep[num_particles_idx, best_beta_idx],
                linestyle='solid',
                color=color,
                label='thermo S={}, K={}, $\\beta_1$={:.0e} ({:.2f})'.format(
                    num_particles_list[num_particles_idx], 2,
                    10**(log_beta_mins_2[best_beta_idx]),
                    log_p_thermo_beta_sweep[num_particles_idx, best_beta_idx,
                                            -1]))

        ax.set_ylim(-110)
        ax.grid(True, axis='y', linewidth=0.2)
        ax.legend(fontsize=6, ncol=3, frameon=False)
        ax.set_ylabel(r'$\log p(x)$')
        ax.set_xlabel('iteration')
        ax.xaxis.set_label_coords(0.5, -0.025)
        ax.set_xticks([0, num_iterations])
        ax.set_xticklabels([0, '4e6'])
        sns.despine(ax=ax, trim=True)

        fig.tight_layout()
        if not os.path.exists('./plots/'):
            os.makedirs('./plots/')
        filename = './plots/baselines.pdf'
        fig.savefig(filename, bbox_inches='tight')
        print('saved to {}'.format(filename))
    elif args.mode == 'grad_std':
        learning_rate = 3e-4
        architecture = 'linear_3'
        seed = 8
        non_thermo_train_modes = ['ww', 'vimco']
        num_particles_list = [2, 5, 10, 50]
        num_partitions_list = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        log_beta_mins_1 = [-10, -1, -0.045757490560675115]
        log_beta_mins_2 = [
            -5, -2, -1.6989700043360187, -1.5228787452803376,
            -1.3979400086720375, -1.3010299956639813, -1.2218487496163564,
            -1.1549019599857433, -1.0969100130080565, -1.0457574905606752, -1,
            -0.6989700043360187, -0.5228787452803375, -0.3979400086720376,
            -0.3010299956639812, -0.2218487496163564, -0.15490195998574313,
            -0.09691001300805639, -0.045757490560675115
        ]
        num_iterations = 400
        log_p_thermo_beta_sweep = np.full(
            (len(num_particles_list), len(log_beta_mins_2), num_iterations),
            np.nan)

        log_p_non_thermo = np.full((len(non_thermo_train_modes),
                                    len(num_particles_list), num_iterations),
                                   np.nan)

        train_mode = 'thermo'
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            num_partitions = 2
            for log_beta_min_idx, log_beta_min in enumerate(log_beta_mins_2):
                dir_ = util.get_most_recent_dir_args_match(
                    train_mode=train_mode,
                    architecture=architecture,
                    learning_rate=learning_rate,
                    num_particles=num_particles,
                    num_partitions=num_partitions,
                    log_beta_min=log_beta_min,
                    seed=seed)
                if dir_ is not None:
                    stats = util.load_object(util.get_stats_path(dir_))
                    log_p_thermo_beta_sweep[
                        num_particles_idx, log_beta_min_idx] = \
                        stats.grad_std_history[:num_iterations]
                    print('thermo {} ({} partitions) beta_min = 1e{} after {}'
                          ' it: {}'.format(num_particles, num_partitions,
                                           log_beta_min,
                                           len(stats.log_p_history),
                                           stats.log_p_history[-1]))
                else:
                    print('missing')

        seed = 7
        log_beta_min = -10
        learning_rate = 3e-4
        num_partitions = 1
        for train_mode_idx, train_mode in enumerate(non_thermo_train_modes):
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                dir_ = util.get_most_recent_dir_args_match(
                    train_mode=train_mode,
                    architecture=architecture,
                    learning_rate=learning_rate,
                    num_particles=num_particles,
                    num_partitions=num_partitions,
                    log_beta_min=log_beta_min,
                    seed=seed)
                if dir_ is not None:
                    stats = util.load_object(util.get_stats_path(dir_))
                    log_p_non_thermo[train_mode_idx, num_particles_idx, :len(
                        stats.grad_std_history)] = stats.grad_std_history

                    print('{} {} after {} it: {}'.format(
                        train_mode, num_particles, len(stats.log_p_history),
                        stats.log_p_history[-1]))
                else:
                    print('missing')

        fig, ax = plt.subplots(1, 1, dpi=200, figsize=(6, 4))

        colors = ['C1', 'C2', 'C4', 'C5']
        linestyles = ['dashed', 'dotted']
        for train_mode_idx, train_mode in enumerate(non_thermo_train_modes):
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                if train_mode == 'ww':
                    label = 'rws'
                else:
                    label = train_mode
                ax.plot(log_p_non_thermo[train_mode_idx, num_particles_idx],
                        linestyle=linestyles[train_mode_idx],
                        color=colors[num_particles_idx],
                        label='{} {} ({:.2f})'.format(
                            label, num_particles,
                            log_p_non_thermo[train_mode_idx, num_particles_idx,
                                             -1]))

        # hand-picked indices into log_beta_mins_2, as in the 'baselines' branch
        best_beta_idxs = [18, 5, 11, 12]
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            best_beta_idx = best_beta_idxs[num_particles_idx]
            color = colors[num_particles_idx]
            ax.plot(
                log_p_thermo_beta_sweep[num_particles_idx, best_beta_idx],
                linestyle='solid',
                color=color,
                label='thermo S={}, K={}, $\\beta_1$={:.0e} ({:.2f})'.format(
                    num_particles_list[num_particles_idx], 2,
                    10**(log_beta_mins_2[best_beta_idx]),
                    log_p_thermo_beta_sweep[num_particles_idx, best_beta_idx,
                                            -1]))

        ax.set_ylim(0, 20)
        ax.grid(True, axis='y', linewidth=0.2)
        ax.legend(fontsize=6, ncol=3, frameon=False)
        ax.set_ylabel(r'grad std')
        ax.set_xlabel('iteration')
        ax.xaxis.set_label_coords(0.5, -0.025)
        ax.set_xticks([0, num_iterations])
        ax.set_xticklabels([0, '4e6'])
        sns.despine(ax=ax, trim=True)

        fig.tight_layout()
        if not os.path.exists('./plots/'):
            os.makedirs('./plots/')
        filename = './plots/grad_std.pdf'
        fig.savefig(filename, bbox_inches='tight')
        print('saved to {}'.format(filename))
    elif args.mode == 'baselines_kl':
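        # Same layout once more; here the arrays hold KL(q || p) histories.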
        learning_rate = 3e-4
        architecture = 'linear_3'
        seed = 8
        non_thermo_train_modes = ['ww', 'vimco']
        num_particles_list = [2, 5, 10, 50]
        num_partitions_list = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        # log_beta_mins_1 = [-10, -1, -0.045757490560675115]
        log_beta_mins_2 = [
            -5, -2, -1.6989700043360187, -1.5228787452803376,
            -1.3979400086720375, -1.3010299956639813, -1.2218487496163564,
            -1.1549019599857433, -1.0969100130080565, -1.0457574905606752, -1,
            -0.6989700043360187, -0.5228787452803375, -0.3979400086720376,
            -0.3010299956639812, -0.2218487496163564, -0.15490195998574313,
            -0.09691001300805639, -0.045757490560675115
        ]
        num_iterations = 400
        log_p_thermo_beta_sweep = np.full(
            (len(num_particles_list), len(log_beta_mins_2), num_iterations),
            np.nan)

        log_p_non_thermo = np.full((len(non_thermo_train_modes),
                                    len(num_particles_list), num_iterations),
                                   np.nan)

        train_mode = 'thermo'
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            num_partitions = 2
            for log_beta_min_idx, log_beta_min in enumerate(log_beta_mins_2):
                dir_ = util.get_most_recent_dir_args_match(
                    train_mode=train_mode,
                    architecture=architecture,
                    learning_rate=learning_rate,
                    num_particles=num_particles,
                    num_partitions=num_partitions,
                    log_beta_min=log_beta_min,
                    seed=seed)
                if dir_ is not None:
                    stats = util.load_object(util.get_stats_path(dir_))
                    log_p_thermo_beta_sweep[
                        num_particles_idx, log_beta_min_idx] = stats.kl_history
                    print('thermo {} ({} partitions) beta_min = 1e{} after {}'
                          ' it: {}'.format(num_particles, num_partitions,
                                           log_beta_min,
                                           len(stats.log_p_history),
                                           stats.log_p_history[-1]))
                else:
                    print('missing')

        seed = 7
        log_beta_min = -10
        learning_rate = 3e-4
        num_partitions = 1
        for train_mode_idx, train_mode in enumerate(non_thermo_train_modes):
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                dir_ = util.get_most_recent_dir_args_match(
                    train_mode=train_mode,
                    architecture=architecture,
                    learning_rate=learning_rate,
                    num_particles=num_particles,
                    num_partitions=num_partitions,
                    log_beta_min=log_beta_min,
                    seed=seed)
                if dir_ is not None:
                    stats = util.load_object(util.get_stats_path(dir_))
                    log_p_non_thermo[train_mode_idx, num_particles_idx, :len(
                        stats.kl_history)] = stats.kl_history

                    print('{} {} after {} it: {}'.format(
                        train_mode, num_particles, len(stats.log_p_history),
                        stats.log_p_history[-1]))
                else:
                    print('missing')

        fig, ax = plt.subplots(1, 1, dpi=200, figsize=(6, 4))

        colors = ['C1', 'C2', 'C4', 'C5']
        linestyles = ['dashed', 'dotted']
        for train_mode_idx, train_mode in enumerate(non_thermo_train_modes):
            for num_particles_idx, num_particles in enumerate(
                    num_particles_list):
                if train_mode == 'ww':
                    label = 'rws'
                else:
                    label = train_mode
                ax.plot(log_p_non_thermo[train_mode_idx, num_particles_idx],
                        linestyle=linestyles[train_mode_idx],
                        color=colors[num_particles_idx],
                        label='{} {} ({:.2f})'.format(
                            label, num_particles,
                            log_p_non_thermo[train_mode_idx, num_particles_idx,
                                             -1]))

        # hand-picked indices into log_beta_mins_2, as in the 'baselines' branch
        best_beta_idxs = [18, 5, 11, 12]
        for num_particles_idx, num_particles in enumerate(num_particles_list):
            best_beta_idx = best_beta_idxs[num_particles_idx]
            color = colors[num_particles_idx]
            ax.plot(
                log_p_thermo_beta_sweep[num_particles_idx, best_beta_idx],
                linestyle='solid',
                color=color,
                label='thermo S={}, K={}, $\\beta_1$={:.0e} ({:.2f})'.format(
                    num_particles_list[num_particles_idx], 2,
                    10**(log_beta_mins_2[best_beta_idx]),
                    log_p_thermo_beta_sweep[num_particles_idx, best_beta_idx,
                                            -1]))

        ax.set_ylim(5, 20)
        ax.grid(True, axis='y', linewidth=0.2)
        ax.legend(fontsize=6, ncol=3, frameon=False)
        ax.set_ylabel(r'KL(q || p)')
        ax.set_xlabel('iteration')
        ax.xaxis.set_label_coords(0.5, -0.025)
        ax.set_xticks([0, num_iterations])
        ax.set_xticklabels([0, '4e6'])
        sns.despine(ax=ax, trim=True)

        fig.tight_layout()
        if not os.path.exists('./plots/'):
            os.makedirs('./plots/')
        filename = './plots/baselines_kl.pdf'
        fig.savefig(filename, bbox_inches='tight')
        print('saved to {}'.format(filename))
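The best_beta_idxs lists hard-coded in all three plotting branches are hand-picked indices into log_beta_mins_2. As a minimal sketch, they could instead be derived from the sweep itself; best_beta_idxs_from_sweep below is a hypothetical helper, not part of the original code, and assumes log_p_thermo_beta_sweep is filled exactly as in the snippet:

import numpy as np

def best_beta_idxs_from_sweep(log_p_thermo_beta_sweep):
    # log_p_thermo_beta_sweep has shape
    # (num_particles, num_betas, num_iterations); compare final values.
    final = log_p_thermo_beta_sweep[:, :, -1]
    # missing runs were initialised to NaN, so nanargmax skips them
    # (it raises if a particle count has no finished run at all)
    return np.nanargmax(final, axis=1).tolist()

With that helper, best_beta_idxs = best_beta_idxs_from_sweep(log_p_thermo_beta_sweep) would replace the hard-coded lists.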
Beispiel #28
0
def source_event_counter(enrollment_set, base_date):
    """
    Counts the source-event pairs.

    Features
    --------
    """
    X_pkl_path = util.cache_path('source_event_counter_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(X_pkl_path):
        return util.fetch(X_pkl_path)

    logger = logging.getLogger('source_event_counter')
    logger.debug('preparing datasets')

    Enroll_all = util.load_enrollments()

    pkl_path = util.cache_path('Log_all_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        Log = util.fetch(pkl_path)
    else:
        Log = util.load_logs()
        Log = Log[Log['time'] <= base_date]
        Log['source_event'] = Log['source'] + '-' + Log['event']
        Log['day_diff'] = (base_date - Log['time']).dt.days
        Log['week_diff'] = Log['day_diff'] // 7
        Log['event_count'] = 1

        util.dump(Log, pkl_path)

    Log_counted = Log.groupby(['enrollment_id', 'source_event', 'week_diff'])\
        .agg({'event_count': 'sum'}).reset_index()

    logger.debug('datasets prepared')

    Enroll = Enroll_all.set_index('enrollment_id').loc[enrollment_set]\
        .reset_index()

    n_proc = par.cpu_count()
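    # heavy per-group counting below is fanned out over all available cores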

    pkl_path = util.cache_path('event_count_by_eid_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        event_count_by_eid = util.fetch(pkl_path)
    else:
        params = []
        eids = []
        for eid, df in pd.merge(Enroll_all, Log_counted, on=['enrollment_id'])\
                .groupby(['enrollment_id']):
            params.append(df)
            eids.append(eid)
        pool = par.Pool(processes=min(n_proc, len(params)))
        event_count_by_eid = dict(
            zip(eids, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()

        util.dump(event_count_by_eid, pkl_path)

    X0 = np.array([event_count_by_eid[i] for i in Enroll['enrollment_id']])

    logger.debug('source-event pairs counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X0)), repr(X0.shape))

    pkl_path = util.cache_path('D_full_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        D_full = util.fetch(pkl_path)
    else:
        D_full = pd.merge(Enroll_all, Log, on=['enrollment_id'])

        util.dump(D_full, pkl_path)

    pkl_path = util.cache_path('user_wn_courses_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        user_wn_courses = util.fetch(pkl_path)
    else:
        user_wn_courses = {}
        for u, df in D_full.groupby(['username']):
            x = []
            for wn in __week_span__:
                x.append(len(df[df['week_diff'] == wn]['course_id'].unique()))
            user_wn_courses[u] = x

        util.dump(user_wn_courses, pkl_path)

    X1 = np.array([user_wn_courses[u] for u in Enroll['username']])

    logger.debug('courses by user counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X1)), repr(X1.shape))

    pkl_path = util.cache_path('course_population_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_population = util.fetch(pkl_path)
    else:
        course_population = {}
        for c, df in D_full.groupby(['course_id']):
            course_population[c] = len(df['username'].unique())

        util.dump(course_population, pkl_path)

    X2 = np.array([course_population.get(c, 0) for c in Enroll['course_id']])

    logger.debug('course population counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X2)), repr(X2.shape))

    pkl_path = util.cache_path('course_dropout_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_dropout_count = util.fetch(pkl_path)
    else:
        course_dropout_count = course_population.copy()
        for c, df in D_full[D_full['day_diff'] < 10].groupby(['course_id']):
            course_dropout_count[c] -= len(df['username'].unique())

        util.dump(course_dropout_count, pkl_path)

    X3 = np.array(
        [course_dropout_count.get(c, 0) for c in Enroll['course_id']])

    logger.debug('course dropout counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X3)), repr(X3.shape))

    pkl_path = util.cache_path('user_ops_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        user_ops_count = util.fetch(pkl_path)
    else:
        user_ops_on_all_courses = D_full.groupby(
            ['username', 'source_event', 'week_diff'])\
            .agg({'event_count': 'sum'}).reset_index()
        params = []
        users = []
        for u, df in user_ops_on_all_courses.groupby(['username']):
            params.append(df)
            users.append(u)
        pool = par.Pool(processes=min(n_proc, len(params)))
        user_ops_count = dict(
            zip(users, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()

        util.dump(user_ops_count, pkl_path)

    X4 = X0 / [user_ops_count[u] for u in Enroll['username']]
    X4[np.isnan(X4)] = 0

    logger.debug('ratio of user ops on all courses, has nan: %s, shape: %s',
                 np.any(np.isnan(X4)), repr(X4.shape))

    pkl_path = util.cache_path('course_ops_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_ops_count = util.fetch(pkl_path)
    else:
        course_ops_of_all_users = D_full.groupby(
            ['course_id', 'source_event', 'week_diff'])\
            .agg({'event_count': 'sum'}).reset_index()
        params = []
        courses = []
        for c, df in course_ops_of_all_users.groupby(['course_id']):
            params.append(df)
            courses.append(c)
        pool = par.Pool(processes=min(n_proc, len(params)))
        course_ops_count = dict(
            zip(courses, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()

        util.dump(course_ops_count, pkl_path)

    X5 = X0 / [course_ops_count[c] for c in Enroll['course_id']]
    X5[np.isnan(X5)] = 0

    logger.debug('ratio of courses ops of all users, has nan: %s, shape: %s',
                 np.any(np.isnan(X5)), repr(X5.shape))

    X6 = np.array([
        course_dropout_count.get(c, 0) / course_population.get(c, 1)
        for c in Enroll['course_id']
    ])

    logger.debug('dropout ratio of courses, has nan: %s, shape: %s',
                 np.any(np.isnan(X6)), repr(X6.shape))

    Obj = util.load_object()
    Obj = Obj[Obj['start'] <= base_date]
    course_time = {}
    for c, df in Obj.groupby(['course_id']):
        start_time = np.min(df['start'])
        update_time = np.max(df['start'])
        course_time[c] = [(base_date - start_time).days,
                          (base_date - update_time).days]

    avg_start_days = np.average([t[0] for _, t in course_time.items()])
    avg_update_days = np.average([t[1] for _, t in course_time.items()])
    default_case = [avg_start_days, avg_update_days]

    X7 = np.array(
        [course_time.get(c, default_case)[0] for c in Enroll['course_id']])

    logger.debug('days from course first update, has nan: %s, shape: %s',
                 np.any(np.isnan(X7)), repr(X7.shape))

    X8 = np.array(
        [course_time.get(c, default_case)[1] for c in Enroll['course_id']])

    logger.debug('days from course last update, has nan: %s, shape: %s',
                 np.any(np.isnan(X8)), repr(X8.shape))

    user_ops_time = pd.merge(Enroll, Log, how='left', on=['enrollment_id'])\
        .groupby(['enrollment_id']).agg({'day_diff': [np.min, np.max]})\
        .fillna(0)
    X9 = np.array(user_ops_time['day_diff']['amin'])

    logger.debug('days from user last op, has nan: %s, shape: %s',
                 np.any(np.isnan(X9)), repr(X9.shape))

    X10 = np.array(user_ops_time['day_diff']['amax'])

    logger.debug('days from user first op, has nan: %s, shape: %s',
                 np.any(np.isnan(X10)), repr(X10.shape))

    X11 = X7 - X10

    logger.debug(
        'days from course first update to user first op, has nan: %s'
        ', shape: %s', np.any(np.isnan(X11)), repr(X11.shape))

    X = np.c_[X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11]
    util.dump(X, X_pkl_path)

    return X
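Every intermediate above follows the same fetch-or-compute-and-dump pattern keyed by base_date. A minimal sketch of factoring that boilerplate out, assuming util.cache_path, util.fetch and util.dump behave as used in the snippet (cached is a hypothetical helper, not part of the original code):

import os

def cached(name, base_date, compute):
    # util is the snippet's own helper module (cache_path/fetch/dump);
    # fetch a pickled intermediate if present, else compute and dump it.
    pkl_path = util.cache_path('%s_before_%s' %
                               (name,
                                base_date.strftime('%Y-%m-%d_%H-%M-%S')))
    if os.path.exists(pkl_path):
        return util.fetch(pkl_path)
    result = compute()
    util.dump(result, pkl_path)
    return result

For example, the D_full block above would collapse to
D_full = cached('D_full', base_date, lambda: pd.merge(Enroll_all, Log, on=['enrollment_id'])).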
Beispiel #29
0
        raise SystemExit("%s has no section." % (fname,))
    if not args:
        raise SystemExit("No target was given.")
    target = args[0]
    try:
        ck = parser.get(target, "consumer_key")
        secret = parser.get(target, "consumer_secret")
        if not (ck and secret):
            msg = ("No consumer token was found.", "Check 'consumer_key' and 'consumer_secret' on %s." % (target))
            raise SystemExit("\n".join(msg))
    except ConfigParser.NoOptionError as e:
        raise SystemExit(str(e))
    consumer_token = (ck, secret)

    sample_user = "******"
    API = load_object(SERVICE[target])
    ret = models.find(models.AccessToken, service_provider_name=target, user_name=sample_user)
    if ret:
        access_token = (ret.oauth_token_key, ret.oauth_token_secret)
        client = create_client(consumer_token, access_token)
        api = API(client)
        if target in SERVICE_SAMPLES:
            run = load_object(SERVICE_SAMPLES[target])
            run(api)
        else:
            logging.warning("No sample was found for %s." % (target,))
    else:
        client = create_client(consumer_token)
        api = API(client)
        access_token = api.initialize()
        if access_token: