Example #1
def draw_result(enn_net):
    # Load the latest saved parameters and initialize the ENN net
    param_list = get_file_list('params', config.path)
    params = pickle.load(open(param_list[-1], 'rb'))
    print("use parameter file: {}".format(param_list[-1]))
    enn_net.set_parameter(params)
    # Plot predictions for each well in the test well list
    for well in config.test_ID:
        input_, target = text.test_dataset(well)
        pred_enn = predict_full(input_, params=params,
                                model_predict=enn_net).cpu()
        # compute and report the test loss
        loss = criterion(pred_enn.mean(0), torch.tensor(target).float())
        print("well{}\t test loss: {}".format(well, loss))
        # convert predictions and targets back to physical units
        pred = text.inverse_normalize(pred_enn.mean(0))
        target = text.inverse_normalize(target)
        # 3-sigma uncertainty band from the ensemble spread
        std = 3 * np.array(text.inverse_normalize(pred_enn).std(0))
        # save the test loss
        save_txt('{}/test_loss.txt'.format(PATH),
                 '{}, {}\n'.format(loss, std.mean()))
        print('std:', std.mean())
        x = np.arange(len(target))
        plt.figure(figsize=(60, 5))
        plt.plot(target, label='target', color='black', alpha=0.4)
        plt.errorbar(x, pred[:, 0], yerr=std[:, 0], color='red', alpha=0.7,
                     label='prediction')
        plt.title(config.info)
        plt.legend()
        ylabel = config.columns[config.input_dim + 1]
        plt.ylabel(ylabel)
        plt.savefig('{}/result_well{}.png'.format(PATH, well))
        plt.show()
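
Several of these examples rely on a project-local get_file_list helper whose definition is not shown; the code always loads param_list[-1], so it presumably returns matching files with the newest last. A minimal sketch consistent with that usage (the sort-by-name assumption is a guess, plausible because the saved parameter files are timestamp-prefixed):

import glob
import os

def get_file_list(keyword, path):
    # Hypothetical helper: files under `path` whose names contain `keyword`,
    # sorted by name so timestamp-prefixed files end with the newest.
    return sorted(f for f in glob.glob(os.path.join(path, '*'))
                  if keyword in os.path.basename(f))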
Example #2
def prep():
    ref_docs = []
    path_refs = "./data/cnn_stories_tokenized"
    topic_path, _, files = next(os.walk(path_refs))

    for fl in tqdm(files):
        # keep only the article body before the first @highlight marker,
        # then collapse newlines and runs of whitespace into single spaces
        txt = re.sub(
            r"\s\s+", " ",
            re.sub(
                "\n", " ",
                re.search(r"(.|\n)*?(?=@highlight)",
                          open(topic_path + "/" + fl).read()).group()))
        ref_docs.append(txt.lower())

    save_txt("./data/ref_docs.txt", ref_docs, split="")

    ### cleaned version
    ref_docs = open("./data/ref_docs.txt").read().split("\n")

    remove = set(stopwords.words("english"))
    remove.update(list(string.punctuation))

    # strip stopwords and punctuation from every sentence of every document
    ref_docs_clean = []
    for doc in tqdm(ref_docs):
        ref_docs_clean.append(" ".join([
            " ".join([
                word for word in word_tokenize(sent.lower())
                if word not in remove
            ]) for sent in sent_tokenize(doc)
        ]))
    save_txt("./data/ref_docs_clean.txt", ref_docs_clean, split="")
Example #3
def test1(enn_net, feature_name='', draw_result=False):
    # Load the latest saved parameters and initialize the ENN net
    param_list = get_file_list('{}_params'.format(feature_name), config.path)
    params = pickle.load(open(param_list[-1], 'rb'))
    print("use parameter file: {}".format(param_list[-1]))
    enn_net.set_parameter(params)
    # Evaluate each well in the test well list
    for well in config.test_ID:
        input_, target_ = text.test_dataset(well)
        pred_enn_ = predict_full(input_, params=params,
                                 model_predict=enn_net).cpu()
        # ensemble statistics converted back to physical units
        std_ = 3 * np.array(text.inverse_normalize(pred_enn_).std(0))
        pred_ = np.array(text.inverse_normalize(pred_enn_.mean(0)))
        target_real_ = text.inverse_normalize(target_)
        for i, feature in enumerate(config.columns_target):
            # slice out the i-th target feature
            pred_enn = pred_enn_[:, :, i]
            target = target_[:, i]
            std = std_[:, i]
            pred = pred_[:, i]
            target_real = target_real_[:, i]
            # save the unnormalized pred data
            np.savetxt('result/e{}_pred_{}_unnormalized.csv'.format(
                config.experiment_ID, feature),
                       np.array(pred_enn),
                       delimiter=',')
            loss = criterion(pred_enn.mean(0), torch.tensor(target).float())
            print("well{}\t{}\ttest loss: {}".format(well, feature, loss))
            # replace the test dataset and reset train dataset
            # text.df_list[well-1][[feature_name]] = np.array(text.inverse_normalize(pred_enn.mean(0)))
            # save the test loss
            save_txt('{}/test_loss_{}.txt'.format(PATH, feature),
                     '{}, {}, {}\n'.format(feature, loss, std.mean()))
            print('std:', std.mean())
            if draw_result:
                # plot the target against the prediction and its 3-sigma band
                x = np.arange(len(target_real))
                # np.savetxt('result/e{}_pred_{}.csv'.format(config.experiment_ID, feature_name),
                #            np.array(text.inverse_normalize(pred_enn)[:, :, 0]).T, delimiter=',')
                # np.savetxt('result/e{}_target_{}.csv'.format(config.experiment_ID, feature_name), target.T, delimiter=',')
                plt.figure(figsize=(60, 5))
                plt.plot(target_real, label='target', color='black', alpha=0.4)
                plt.errorbar(x, pred, yerr=std, color='red', alpha=0.7,
                             label='prediction')
                plt.title(config.info)
                plt.legend()
                y_label = feature
                plt.ylabel(y_label)
                plt.tight_layout()
                plt.savefig('{}/result_{}_well{}.png'.format(
                    PATH, feature, well))
                # close the figure so memory does not build up across features
                plt.close()
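
The slicing in test1 assumes predict_full returns an ensemble of predictions shaped (n_ensemble, n_samples, n_targets): .mean(0) is the ensemble mean and 3 * .std(0) a 3-sigma uncertainty band. A toy NumPy illustration of that pattern, with hypothetical shapes:

import numpy as np

pred_enn = np.random.randn(100, 500, 2)  # (n_ensemble, n_samples, n_targets)
pred = pred_enn.mean(axis=0)             # ensemble mean, shape (500, 2)
band = 3 * pred_enn.std(axis=0)          # 3-sigma band, shape (500, 2)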
Example #4
def summarize(text_folder, out_path, topic_threshold, sim_threshold, n_top,
              remove_stopwords, remove_punct, language, topic, tfidf_path,
              refdoc_path, load_tfidf_model, save_tfidf_model, limit_type,
              limit, reorder):
    # load the embedding model
    model = load_embedding(language, topic)

    # load the input documents and join them into one plain text
    txt = [open(text_folder + "/" + f).read() for f in os.listdir(text_folder)]
    plain_txt = " ".join(txt)

    # remove stopwords and punctuation
    remove = set()
    if remove_stopwords:
        remove.update(stopwords.words(language))
    if remove_punct:
        remove.update(list(string.punctuation))

    clean_txt, raw_sents = clean_txts(plain_txt, remove)

    # get the topic words
    centroid_words_weights, tfidf_scores, feature_names = topic_words(
        clean_txt, tfidf_path, topic_threshold, load=load_tfidf_model, refdoc_path=refdoc_path, save=save_tfidf_model)

    # weight sentences
    scores = weight_sentences(txt, centroid_words_weights, tfidf_scores, feature_names, remove)

    # if multi-document, keep only the top-scoring sentences; a single
    # document already provides its cleaned sentences
    if len(txt) > 1:
        clean_sents, raw_sents = select_ntop(txt, scores, n_top, remove)
    else:
        clean_sents = clean_txt

    # get centroid words
    centroid_words = list(centroid_words_weights.keys())
    centroid_vector = get_centroid(centroid_words, model)

    # score sentences
    sentence_scores = score_sentences(clean_sents, raw_sents, model, centroid_vector)

    # select sentences
    summary = select_sentences(sentence_scores, sim_threshold, limit_type, limit, reorder)
    save_txt(out_path, summary)
    return summary
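
To make the long parameter list concrete, here is a hypothetical invocation; every path and value below is illustrative rather than taken from the project:

summary = summarize(
    text_folder="./data/articles",
    out_path="./data/summary.txt",
    topic_threshold=0.3,
    sim_threshold=0.95,
    n_top=30,
    remove_stopwords=True,
    remove_punct=True,
    language="english",
    topic=None,
    tfidf_path="./models/tfidf.pkl",
    refdoc_path="./data/ref_docs_clean.txt",
    load_tfidf_model=True,
    save_tfidf_model=False,
    limit_type="word",
    limit=100,
    reorder=True,
)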
Example #5
def eval(neural_net, data, labels):
    classes, alphas = neural_net.scan(data, gen.get_default_total_code())

    data3ch = util.cvtColorGrey2RGB(data)
    red = np.array([1.0, 0.0, 0.0], dtype=np.float32)
    for b in range(alphas.shape[0]):
        for c in range(alphas.shape[1]):
            data3ch[b, c, int(np.floor((1.0 - alphas[b, c]) * (data3ch.shape[2] - 1))), :] = red
    tile = util.make_tile(data3ch, rows=600, cols=800, flip=True)
    util.numpy_to_image(tile).show()

    # now keep only the classes corresponding to high alphas
    index_output = np.argmax(classes, axis=2)
    util.save_txt(index_output, "../artifacts/" + "data.out")

    count = 0
    correct = 0
    for b in range(labels.shape[0]):
        for c in range(labels.shape[1]):
            if labels[b, c] > 0:
                correct += 1 if labels[b, c] == index_output[b, c] else 0
                count += 1
    print("Percent correct =", correct * 100.0 / count)

    collector = []
    for b in range(alphas.shape[0]):
        read_index = 0
        converted = gen.indices_to_unicode(index_output[b])
        read_word = u""
        for c in range(alphas.shape[1]):
            if alphas[b, c] > 0.5:
                read_word = read_word + converted[read_index]
                read_index = read_index + 1
        print(read_word)
        collector.append(read_word)

    return collector
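
The final loop decodes one word per row by emitting the next character of converted whenever the attention weight alphas[b, c] exceeds 0.5. A self-contained toy version of that gating, with made-up values:

converted = "cat"
alphas = [0.1, 0.9, 0.2, 0.8, 0.3, 0.7]  # one attention weight per scan column
word, idx = "", 0
for a in alphas:
    if a > 0.5:      # high alpha: the scanner "read" the next character here
        word += converted[idx]
        idx += 1
print(word)          # -> cat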
Example #6
def test(enn_net, feature_name='', draw_result=False):
    # Load the latest saved parameters and initialize the ENN net
    param_list = get_file_list('{}_params'.format(feature_name), config.path)
    params = pickle.load(open(param_list[-1], 'rb'))
    print("use parameter file: {}".format(param_list[-1]))
    enn_net.set_parameter(params)
    # Evaluate each well in the test well list
    for well in config.test_ID:
        input_, target = text.test_dataset(well)
        pred_enn = predict_full(input_, params=params,
                                model_predict=enn_net).cpu()
        # compute and report the test loss
        loss = criterion(pred_enn.mean(0), torch.tensor(target).float())
        print("well{}\t{}\ttest loss: {}".format(well, feature_name, loss))
        # write the prediction back into the well dataframe, replacing the
        # measured column so later features use the predicted log
        text.df_list[well - 1][[feature_name]] = np.array(
            text.inverse_normalize(pred_enn.mean(0)))
        # 3-sigma uncertainty band from the ensemble spread
        std = 3 * np.array(text.inverse_normalize(pred_enn).std(0))
        # save the test loss
        save_txt('{}/test_loss_{}.txt'.format(PATH, feature_name),
                 '{}, {}, {}\n'.format(feature_name, loss, std.mean()))
        print('std:', std.mean())
        if draw_result:
            # convert predictions and targets back to physical units
            pred = np.array(text.inverse_normalize(pred_enn.mean(0)))
            target = text.inverse_normalize(target)
            x = np.arange(len(target))
            plt.figure(figsize=(60, 5))
            plt.plot(target, label='target', color='black', alpha=0.4)
            plt.errorbar(x, pred[:, 0], yerr=std[:, 0], color='red', alpha=0.7,
                         label='prediction')
            plt.title(config.info)
            plt.legend()
            y_label = feature_name
            plt.ylabel(y_label)
            plt.tight_layout()
            plt.savefig('{}/result_{}_well{}.png'.format(PATH, feature_name,
                                                         well))
            plt.close()
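
The write-back into text.df_list is what distinguishes test from test1: the predicted log replaces the measured column, so later features are evaluated on predicted rather than measured inputs. A toy pandas illustration of that column replacement, with hypothetical frame and column names:

import numpy as np
import pandas as pd

df = pd.DataFrame({"GR": [80.0, 95.0], "DEN": [2.30, 2.50]})  # toy well log
pred = np.array([[2.31], [2.48]])  # model output, shape (n_samples, 1)
df[["DEN"]] = pred                 # overwrite the measured column in place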
Example #7
def train(net_enn, input_, target, feature_name=''):
    dstb_y = lamuda.Lamuda(target, NE, ERROR_PER)
    train_losses = Record()
    losses = Record()
    lamuda_history = Record()
    std_history = Record()
    pred_history = Record()

    initial_parameters = net_enn.initial_parameters
    initial_pred = net_enn.output(input_)
    train_losses.update(criterion(initial_pred.mean(0), target).tolist())
    losses.update(criterion(initial_pred.mean(0), target).tolist())
    std_history.update(dstb_y.std(initial_pred))
    pred_history.update(initial_pred)
    lamuda_history.update(dstb_y.lamuda(initial_pred))

    # main EnRML iteration loop
    for j in range(T):
        torch.cuda.empty_cache()
        params = net_enn.get_parameter()
        dstb_y.update()
        time_ = time.strftime('%Y%m%d_%H_%M_%S')
        delta = enrml.EnRML(pred_history.get_latest(mean=False), params,
                            initial_parameters,
                            lamuda_history.get_latest(mean=False), dstb_y.dstb,
                            ERROR_PER)
        params_raw = net_enn.update_parameter(delta)
        torch.cuda.empty_cache()
        pred = net_enn.output(input_)
        loss_new = criterion(pred.mean(0), target).tolist()
        bigger = train_losses.check(loss_new)
        record_while = 0
        # if the loss got worse, inflate lamuda and retry the update
        # (Levenberg-Marquardt-style backtracking)
        while bigger:
            record_while += 1
            lamuda_history.update(
                lamuda_history.get_latest(mean=False) * GAMMA)
            if lamuda_history.get_latest(mean=False) > GAMMA**10:
                lamuda_history.update(lamuda_history.data[0])
                print('abandon current iteration')
                net_enn.set_parameter(params)
                loss_new = train_losses.get_latest()
                dstb_y.update()
                params_raw = params
                break
            dstb_y.update()
            net_enn.set_parameter(params)
            delta = enrml.EnRML(pred_history.get_latest(mean=False), params,
                                initial_parameters,
                                lamuda_history.get_latest(mean=False),
                                dstb_y.dstb, ERROR_PER)
            params_raw = net_enn.update_parameter(delta)
            torch.cuda.empty_cache()
            pred = net_enn.output(input_)
            loss_new = criterion(pred.mean(0), target).tolist()
            print('update losses, new loss:{}'.format(loss_new))
            bigger = train_losses.check(loss_new)
        train_losses.update(loss_new)
        save_var(params_raw, '{}/{}_{}_params'.format(PATH, time_,
                                                      feature_name))
        print("iteration:{} \t current train losses:{}".format(
            j, train_losses.get_latest(mean=True)))
        save_txt(
            '{}/loss_{}.txt'.format(PATH, feature_name),
            time.strftime('%Y%m%d_%H_%M_%S') + ',' +
            str(train_losses.get_latest(mean=True)) + ',\n')
        pred_history.update(pred)
        std_history.update(dstb_y.std(pred))
        # adapt lamuda: hold it if the ensemble spread grew, otherwise decay it
        if std_history.bigger():
            lamuda_history.update(lamuda_history.get_latest(mean=False))
        else:
            lamuda_tmp = lamuda_history.get_latest(mean=False) / GAMMA
            if lamuda_tmp < 0.005:
                lamuda_tmp = 0.005
            lamuda_history.update(lamuda_tmp)
    return net_enn, train_losses.get_latest(
        mean=True), pred_history.get_latest(mean=False)
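
The inner while loop is a Levenberg-Marquardt-style backtracking scheme: whenever the new loss is worse than the best recorded one, lamuda is inflated by GAMMA and the update retried, and the iteration is abandoned once lamuda grows past GAMMA**10. Stripped of the ENN bookkeeping, the control flow is roughly as follows (all names hypothetical):

def damped_step(try_step, best_loss, lam, gamma=10.0, lam_min=0.005,
                max_tries=10):
    # Hypothetical sketch: `try_step(lam)` applies one damped update and
    # returns the resulting loss.
    for _ in range(max_tries):
        loss = try_step(lam)
        if loss <= best_loss:   # accepted: relax the damping for the next step
            return loss, max(lam / gamma, lam_min)
        lam *= gamma            # rejected: damp harder and retry
    return best_loss, lam       # give up and keep the previous parameters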
Example #8
def run():
    save_txt('{}/time.txt'.format(PATH),
             '{},\n'.format(time.strftime('%Y%m%d_%H_%M_%S')))
    for epoch in range(config.epoch):
        print(epoch)
        # loop over the target features; config.input_dim is advanced at the
        # end of each pass so later features see wider inputs
        while config.input_dim + 1 <= len(config.columns):
            current_feature_name = config.columns[config.input_dim]
            textLoader = DataLoader(text,
                                    batch_size=config.batch_size,
                                    shuffle=True,
                                    num_workers=config.num_workers,
                                    drop_last=config.drop_last)
            model = netLSTM_withbn()
            with torch.no_grad():
                model = model.cuda()
            net_enn_train = enn.ENN(model, NE)
            # If a pre-existing checkpoint for this feature is found, initialize
            # net_enn_train from it. Only done when the epoch count is 0.
            epoch_list = [
                i for i in os.listdir(PATH) if i.startswith(
                    "parameters_{}_epoch_".format(current_feature_name))
            ]
            if len(epoch_list) > 0 and epoch == 0:
                print("Pre_existent epoch found: {}".format(
                    sorted(epoch_list)[-1]))
                epoch_pre_existent = pickle.load(
                    open(os.path.join(PATH,
                                      sorted(epoch_list)[-1]), 'rb'))
                net_enn_train.set_parameter(epoch_pre_existent)
            if epoch > 0:
                parameter_path = os.path.join(
                    PATH,
                    "parameters_{}_epoch_{}".format(current_feature_name,
                                                    epoch - 1))
                print("Setting checkpoint {}".format(parameter_path))
                parameter_checkpoint = pickle.load(open(parameter_path, 'rb'))
                net_enn_train.set_parameter(parameter_checkpoint)
            for i, data in enumerate(textLoader):
                print('#' * 30)
                print("{}: batch{}".format(time.strftime('%Y%m%d_%H_%M_%S'),
                                           i))
                # prepare the training data
                input_, target = data
                input_ = torch.from_numpy(
                    np.stack(list(shrink(input_, config.shrink_len)), axis=1))
                target = torch.from_numpy(
                    np.stack(list(shrink(target, config.shrink_len)), axis=1))
                with torch.no_grad():
                    input_, target = map(Variable,
                                         (input_.float(), target.float()))
                    target = target.reshape(-1, config.output_dim)
                    input_ = input_.cuda()
                    target = target.cuda()
                # train the model
                net_enn_train, loss, pred_data = train(
                    net_enn_train,
                    input_,
                    target,
                    feature_name=current_feature_name)
                # save result
                #save_txt('predict_history'+'/pred.txt', list_to_csv(np.array(pred_data.mean(0)[:, 0])) + '\n')
                #save_txt('predict_history'+'/target.txt', list_to_csv(np.array(target[:, 0])) + '\n')
                save_txt(
                    PATH + '/time.txt',
                    time.strftime('%Y%m%d_%H_%M_%S') + ',' + str(loss) + ',\n')
            with torch.no_grad():
                params = net_enn_train.get_parameter()
                filename = PATH + "/parameters_{}_epoch_{}".format(
                    current_feature_name, epoch)
                save_var(params, filename)
                del params
            #test(net_enn_train, feature_name=current_feature_name, draw_result=(epoch == config.epoch-1))
            test1(net_enn_train,
                  feature_name=current_feature_name,
                  draw_result=True)
            config.input_dim += config.output_dim
            text.reset_train_dataset()
        config.input_dim -= config.output_dim
        text.reset_train_dataset()
        text.reset_test_dataset()
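
One caveat in the resume logic above: sorted(epoch_list)[-1] orders checkpoint names lexicographically, so parameters_X_epoch_9 would sort after parameters_X_epoch_10 once training runs past ten epochs. Assuming the filename format shown, a numeric sort key avoids this:

# pick the checkpoint with the highest epoch number, not the highest string
latest = max(epoch_list, key=lambda name: int(name.rsplit("_", 1)[-1]))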