Example #1
    def Train(self, mini_batches, epoch, best_f_score, options):
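        # DyNet-style training loop: accumulate per-batch loss expressions,
        # backpropagate their mean, and renew the computation graph each step.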
        print 'Start time', time.ctime()
        start = time.time()
        errs, loss, iters, sen_num = [], 0, 0, 0
        dev_path = options.conll_dev

        part_size = len(mini_batches) / 5
        part = 0
        best_part = 0

        for b, mini_batch in enumerate(mini_batches):
            e = self.buildGraph(mini_batch, True)
            errs += e
            sum_errs = esum(errs) / len(errs)
            loss += sum_errs.scalar_value()
            sum_errs.backward()
            self.trainer.update()
            renew_cg()
            self.x_le.init_row(self.NO_LEMMA, [0] * self.d_l)
            renew_cg()
            print 'loss:', loss / (
                b + 1), 'time:', time.time() - start, 'progress', round(
                    100 * float(b + 1) / len(mini_batches), 2), '%'
            loss, start = 0, time.time()
            errs, sen_num = [], 0
            iters += 1

            if (b + 1) % part_size == 0:
                part += 1

                if dev_path != '':
                    start = time.time()
                    write_conll(
                        os.path.join(options.outdir, options.model) +
                        str(epoch + 1) + "_" + str(part) + '.txt',
                        self.Predict(dev_path))
                    os.system('perl src/utils/eval.pl -g ' + dev_path +
                              ' -s ' +
                              os.path.join(options.outdir, options.model) +
                              str(epoch + 1) + "_" + str(part) + '.txt' +
                              ' > ' +
                              os.path.join(options.outdir, options.model) +
                              str(epoch + 1) + "_" + str(part) + '.eval')
                    print 'Finished predicting dev on part ' + str(
                        part) + '; time:', time.time() - start

                    labeled_f, unlabeled_f = get_scores(
                        os.path.join(options.outdir, options.model) +
                        str(epoch + 1) + "_" + str(part) + '.eval')
                    print 'epoch: ' + str(epoch) + ' part: ' + str(
                        part) + ' -- labeled F1: ' + str(
                            labeled_f) + ' unlabeled F1: ' + str(unlabeled_f)

                    if float(labeled_f) > best_f_score:
                        self.Save(os.path.join(options.outdir, options.model))
                        best_f_score = float(labeled_f)
                        best_part = part

        print 'best part on this epoch: ' + str(best_part)
        return best_f_score
Example #2
 def run_game(self, num_players=2, score_limit=15):
     # run a game between num_players players
     assert(num_players > 0)
     players = dict([(str(r+1),0) for r in range(num_players)])
     player_index = 0
     print "Starting a game between", num_players, "players"
     print "Enter moves as a row and column separated by a space, i.e.: R C"
     raw_input("Ready?")
     # should add a check to see if there are no moves left
     while all([score<score_limit for score in players.values()]):
         p = players.keys()[player_index]
         player_index = (player_index+1)%num_players
         good_choice = False
         print self.board
         while not good_choice:
             choice = raw_input("Player "+p+", place a piece: ").strip()
             try: r, c = map(int, choice.split(" "))
             except Exception:
                 print "Please format as two integers: R C"
                 continue
             if 0 <= r < len(self.board) and 0 <= c < len(self.board[0]):
                 if self.board[r][c] == self.board.default:
                     self.board[r][c] = p
                     squares = get_squares(self.board, (p, Point(r,c)))
                     score = get_scores(squares)
                     players[p] += score.get(p, 0)
                     print "Player",p,"score:",players[p]
                     good_choice = True
                 else: print "That square is occupied."
             else: print "Choice out of range."
     print "Final scores:"
     pprint.pprint(players)
Example #3
def test(model, test_loader, topk):
    model.eval()
    test_steps = (len(test_loader.dataset) // test_loader.batch_size) + 1
    scores = []
    with torch.no_grad():
        with trange(test_steps) as t:
            for i, data in zip(t, test_loader):
                t.set_description('test')
                users = data[:, 0]
                items = data[:, 1]
                labels = data[:, 2].float()
                if use_cuda:
                    users, items, labels = users.cuda(), items.cuda(
                    ), labels.cuda()
                preds = model(users, items)
                items_cpu = items.cpu().numpy()
                preds_cpu = preds.squeeze(1).detach().cpu().numpy()
                litems = np.split(items_cpu, test_loader.batch_size // 100)
                lpreds = np.split(preds_cpu, test_loader.batch_size // 100)
                scores += [
                    get_scores(it, pr, topk) for it, pr in zip(litems, lpreds)
                ]
    hits = [s[0] for s in scores]
    ndcgs = [s[1] for s in scores]
    return (np.array(hits).mean(), np.array(ndcgs).mean())
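The scorer itself is not shown in this snippet. Splitting the batch into groups of batch_size // 100 suggests the common leave-one-out ranking protocol, in which each group of 100 rows holds one held-out positive item and 99 sampled negatives. A minimal sketch of a compatible get_scores, assuming the positive item is the first entry of each group, could look like:

import numpy as np

def get_scores(items, preds, topk):
    # one evaluation group: held-out positive (assumed first) plus negatives
    gt_item = items[0]
    top_idx = np.argsort(preds)[::-1][:topk]  # positions of the top-k scores
    ranked = items[top_idx]
    if gt_item in ranked:
        rank = int(np.where(ranked == gt_item)[0][0])
        return 1.0, 1.0 / np.log2(rank + 2)  # hit@k, NDCG@k
    return 0.0, 0.0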
Example #4
def check_true(question, verbose=True, attribution=False):
    keys = []
    with open('key.txt', 'r') as f:
        keys = f.readlines()
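    # key.txt is expected to hold the API key on line 1 and the custom
    # search engine id (cx) on line 2, matching the query dict built below.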
    query = {
        'key': keys[0].rstrip(),  #generate API key first
        'cx':
        keys[1].rstrip(),  #create custom search using Google Custom Search API
        'q': question
    }
    response = requests.get(api_url,
                            params=query,
                            headers={'Content-Type': 'application/json'})
    if response.status_code != 200:
        # The request failed; set a sentinel so the final `return res`
        # below is defined (otherwise it would raise a NameError).
        print('DID NOT WORK!')
        res = None
    else:
        resp_json = response.json()
        urls = []
        snippets = []
        for item in resp_json['items']:
            sentence = sanitizer2(item['snippet'])
            snippets.append(sentence)
            urls.append(item['link'])
        if verbose:
            print('\n'.join(snippets))

        res_scores = get_scores(snippets,
                                question,
                                verbose=verbose,
                                attribution=attribution,
                                urls=urls)
        if verbose:
            print('RES SCORES')
            print(res_scores)

        top_score = res_scores[0][1]
        if top_score >= threshold_high:
            res = 1
            if verbose: print("TRUE")
        elif top_score < threshold_low:
            res = 0
            if verbose: print("FALSE")
        else:
            res = 0.5
            if verbose: print("WE ARE UNSURE")

        if not attribution:
            return res
        else:
            return res, res_scores

    return res
Example #5
def evaluate_on_holdout(model) -> float:
    data = pd.read_csv('./data/holdout.csv')
    x_cols = [x for x in data.columns if (x not in ['ID_code', 'target'])]

    X = data[x_cols]
    y = data['target']

    y_pred = model.predict(X)
    y_true = np.array(y)
    score = utils.get_scores(y_true, y_pred)["f1"]

    return score
Example #6
    def predict(self, dim_reduced_vecs, outlier_labels, scores, contamination,
                **kwargs):
        print(f"Outlier detection using pyod's {self.pyod_model}")
        od = self.pyod_model(**kwargs)
        od.fit(dim_reduced_vecs)

        out_pred = od.labels_
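        # pyod marks outliers with 1 and inliers with 0; remap to the
        # sklearn-style convention (-1 = outlier, 1 = inlier) used downstream.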
        out_pred[out_pred == 1] = -1
        out_pred[out_pred == 0] = 1

        scores = get_scores(scores, outlier_labels, out_pred)
        scores.update(**kwargs)
        out_f1 = scores["out_f1"]
        print(f"{kwargs}\nOut_f1: {out_f1}\n\n")
        return scores, out_pred
Example #7
    def predict(self, dim_reduced_vecs, outlier_labels, scores, contamination,
                **kwargs):
        od = self.dem_red_outlier_model(**kwargs)
        if self.as_numpy:
            dim_reduced_vecs = np.array(dim_reduced_vecs)
        preds = od.fit_transform(dim_reduced_vecs)
        preds = preds.astype(float)

        preds = self.reject_outliers(preds, iq_range=1.0 - contamination)
        preds = [-1 if x else 1 for x in preds]

        scores = get_scores(scores, outlier_labels, preds)
        scores.update(**kwargs)
        out_f1 = scores["out_f1"]
        print(f"{kwargs}\nOut_f1: {out_f1}\n\n")
        return scores, preds
Example #8
 def move(self, r, c, player_name):
     # check we should make move
     if not self.is_game_started():
         return False, "Move not made - game not yet started."
     if player_name not in self.players:
         return False, "Move not made - player not part of this game."
     # attempt to make a move
     if not (0 <= r < len(self.board) and 0 <= c < len(self.board[0])):
         return False, "Move not made - out of range."
     if self.board[r][c] != self.board.default:
         return False, "Move not made - that space is occupied."
     # otherwise, make move
     self.board[r][c] = player_name
     # and update scores
     squares = get_squares(self.board, (player_name, Point(r, c)))
     scores = get_scores(squares)
     self.players[player_name] += scores.get(player_name, 0)
     return True, "Move successful."
Example #9
 def move(self, r, c, player_name):
     # check we should make move
     if not self.is_game_started():
         return False, "Move not made - game not yet started."
     if player_name not in self.players:
         return False, "Move not made - player not part of this game."
     # attempt to make a move
     if not (0 <= r < len(self.board) and 0 <= c < len(self.board[0])):
         return False, "Move not made - out of range."
     if self.board[r][c] != self.board.default:
         return False, "Move not made - that space is occupied."
     # otherwise, make move
     self.board[r][c] = player_name
     # and update scores
     squares = get_squares(self.board, (player_name, Point(r,c)))
     scores = get_scores(squares)
     self.players[player_name] += scores.get(player_name, 0)
     return True, "Move successful."
Example #10
def run_vcl(hidden_size, no_epochs, data_gen, coreset_method, coreset_size=0, batch_size=None, single_head=True):
    in_dim, out_dim = data_gen.get_dims()
    x_coresets, y_coresets = [], []
    x_testsets, y_testsets = [], []

    all_acc = np.array([])

    for task_id in range(data_gen.max_iter):
        x_train, y_train, x_test, y_test = data_gen.next_task()
        x_testsets.append(x_test)
        y_testsets.append(y_test)

        # Set the readout head to train
        head = 0 if single_head else task_id
        bsize = x_train.shape[0] if (batch_size is None) else batch_size

        # Train network with maximum likelihood to initialize first model
        if task_id == 0:
            ml_model = Vanilla_NN(in_dim, hidden_size, out_dim, x_train.shape[0])
            ml_model.train(x_train, y_train, task_id, no_epochs, bsize)
            mf_weights = ml_model.get_weights()
            mf_variances = None
            ml_model.close_session()

        # Train on non-coreset data
        mf_model = MFVI_NN(in_dim, hidden_size, out_dim, x_train.shape[0], prev_means=mf_weights, prev_log_variances=mf_variances)
        mf_model.train(x_train, y_train, head, no_epochs, bsize)
        mf_weights, mf_variances = mf_model.get_weights()

        # Select coreset if needed
        if coreset_size > 0:
            if type(coreset_method) is str and coreset_method == "uncertainty_based":
                x_coresets, y_coresets, x_train, y_train = uncertainty_based(
                    mf_model, task_id, x_coresets, y_coresets, x_train, y_train, coreset_size)
            else:
                x_coresets, y_coresets, x_train, y_train = coreset_method(x_coresets, y_coresets, x_train, y_train, coreset_size)

        # Incorporate coreset data and make prediction
        acc = utils.get_scores(mf_model, x_testsets, y_testsets, x_coresets, y_coresets, hidden_size, no_epochs, single_head, batch_size)
        all_acc = utils.concatenate_results(acc, all_acc)

        mf_model.close_session()

    return all_acc
Example #11
def evaluate(model, test_loader, use_cuda, topk):
    model.eval()
    scores = []
    with torch.no_grad():
        for data in test_loader:
            users = data[:, 0]
            items = data[:, 1]
            labels = data[:, 2].float()
            if use_cuda:
                users, items, labels = users.cuda(), items.cuda(), labels.cuda(
                )
            preds = model(users, items)
            items_cpu = items.cpu().numpy()
            preds_cpu = preds.squeeze(1).detach().cpu().numpy()
            litems = np.split(items_cpu, test_loader.batch_size // 100)
            lpreds = np.split(preds_cpu, test_loader.batch_size // 100)
            scores += [
                get_scores(it, pr, topk) for it, pr in zip(litems, lpreds)
            ]
    hits = [s[0] for s in scores]
    ndcgs = [s[1] for s in scores]
    return (np.array(hits).mean(), np.array(ndcgs).mean())
Example #12
    def predict(self, dim_reduced_vecs, outlier_labels, scores, contamination,
                min_cluster_size, allow_noise):
        print("Clustering ...")
        clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                            prediction_data=True,
                            metric="euclidean").fit(dim_reduced_vecs)
        print("Get prediction data ...")
        clusterer.generate_prediction_data()

        try:
            cluster_pred = clusterer.labels_ if allow_noise else np.argmax(
                all_points_membership_vectors(clusterer)[:, 1:], axis=1)
        except IndexError:
            print(
                "Got IndexError and will not enforce cluster membership (allow noise) ..."
            )
            print(all_points_membership_vectors(clusterer))
            cluster_pred = clusterer.labels_

        # scoring
        print("Get scores ...")

        # GLOSH
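        # points whose GLOSH outlier score exceeds the 90th percentile are
        # flagged as outliers (-1); everything else is treated as inlier (1)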
        threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)
        outlier_pred = np.where(clusterer.outlier_scores_ > threshold, -1, 1)

        scores["cluster_n"] = len(np.unique(clusterer.labels_))
        scores["homogeneity"] = homogeneity_score(outlier_labels, cluster_pred)
        scores["completeness"] = completeness_score(outlier_labels,
                                                    cluster_pred)
        scores["v_measure"] = v_measure_score(outlier_labels, cluster_pred)

        scores = get_scores(scores, outlier_labels, outlier_pred)

        print(
            f"Homogeneity - {homogeneity_score(outlier_labels, cluster_pred)*100:.1f}  \
                cluster_n - {len(np.unique(clusterer.labels_))}")

        return scores, clusterer.outlier_scores_
Example #13
def run_vcl(hidden_size, no_epochs, data_gen, coreset_method, coreset_size=0, batch_size=None, single_head=True):
  
  in_dim, out_dim = data_gen.get_dims()
  x_coresets, y_coresets = [], []
  x_testsets, y_testsets = [], []
  
  all_acc = np.array([])
  
  for task_id in range(data_gen.max_iter):
    
    x_train, y_train, x_test, y_test = data_gen.next_task()
    x_testsets.append(x_test)
    y_testsets.append(y_test)
    
    head = 0 if single_head else task_id
    bsize = x_train.shape[0] if (batch_size is None) else batch_size
    
    if task_id == 0:
      
      ml_model = VCL(in_dim, hidden_size, output_size=10)
      ml_model.train(x_train, y_train, bsize, no_epochs, task_id)
      torch.save(ml_model.state_dict(), 'my_model.pth')
    
    else: 
    
      ml_model = VCL(in_dim, hidden_size, output_size=10)
      ml_model.load_state_dict(torch.load('my_model.pth'))
      ml_model.bfc3 = vcl_model.BayesLinear(hidden_size, 10)
      ml_model.train(x_train, y_train, bsize, no_epochs, task_id)
      torch.save(ml_model.state_dict(), 'my_model.pth')
    
    if coreset_size > 0:
      x_coresets, y_coresets, x_train, y_train = coreset_method(x_coresets, y_coresets, x_train, y_train, coreset_size)
    
    acc = utils.get_scores(ml_model, x_testsets, y_testsets, x_coresets, y_coresets, hidden_size, no_epochs, single_head, batch_size)
    all_acc = utils.concatenate_results(acc, all_acc)
    
  return all_acc
Example #14
def upload_model():
    ''' 
    Receives a base64 encoded joblib file to be saved as a new model.

    INPUT:
    - input: { model_name : string,
               model: base64 encoded joblib file }
    OUTPUT:
    - output: message
              status (successful or not)
    '''
    input_asDict = request.get_json()

    logger.info("Received [Upload Model Request] (0/3) ... ")

    # Sees if model name is available
    try:
        # Check if Model Name and Model Type are present
        if ("model_name" in input_asDict.keys()
                and input_asDict["model_name"]):
            model_name = input_asDict["model_name"]
        else:
            model_name = default_model_name
        filepath = "./models/" + model_name + ".joblib.dat"
        if file_exists(filepath):
            message = "Model with given name already exists. Choose another name."
            logger.error(message)
            return custom_response_http(message, 400)
    except Exception:
        message = "Internal Server Error"
        logger.info(message)
        return custom_response_http(message, 500)
    logger.info("[Upload Model Request] Model Name Set: [" + model_name +
                "] (1/3) ... ")

    # Decodes Model
    try:
        if ("model" in input_asDict.keys()):
            model_as64 = input_asDict["model"]
            logger.info(str(model_as64)[:10])
            decoded_model = base64.b64decode(model_as64)
            with open(filepath, 'wb') as fh:
                fh.write(decoded_model)
                with open(filepath, 'rb') as model_file:
                    # Load model
                    model = joblib.load(model_file)
        else:
            message = "Model does not exist in request."
            logger.error(message)
            return custom_response_http(message, 400)
    except Exception:
        message = "Internal Server Error"
        logger.error(message)
        return custom_response_http(message, 500)
    logger.info("[Upload Model Request] Model Decoded (2/3) ... ")

    # Validate Model
    try:
        # Validate with holdout Set
        data = pd.read_csv('./data/holdout.csv')
        x_cols = [x for x in data.columns if (x not in ['ID_code', 'target'])]

        X = data[x_cols]
        y = data['target']

        y_pred = model.predict(X)
        y_true = np.array(y)

        score = get_scores(y_true, y_pred)["f1"]
    except Exception:
        message = "Could not validate model."
        # remove(filepath)
        logger.error(message)
        return custom_response_http(message, 500)
    logger.info("[Upload Model Request] Model Validated (3/3) .")

    message = "Model Uploaded! Score: " + str(score)
    logger.info(str(custom_response_http(message, 200)))
    return custom_response_http(message, 200)
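For reference, a client for this endpoint would base64-encode a joblib dump and POST it as JSON. The payload shape (model_name plus a base64-encoded joblib file) comes from the docstring above; the host and route below are assumptions for illustration only:

import base64
import requests

with open('my_model.joblib.dat', 'rb') as f:
    payload = {
        'model_name': 'my_model',                      # name to save under
        'model': base64.b64encode(f.read()).decode(),  # joblib file as base64
    }

# hypothetical URL; the snippet does not show how the view is registered
resp = requests.post('http://localhost:5000/upload_model', json=payload)
print(resp.status_code, resp.text)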
Example #15
mongo.close()
mongo_ids = [post.pop('_id', None)
             for post in posts]  # exclude mongo generated ids
posts = d_to_df(posts)
posts['created_time'] = pd.to_datetime(posts['created_time'],
                                       format="%Y-%m-%dT%H:%M:%S+0000")
posts.set_index('created_time', inplace=True)

# Calculating post title and message sentiment
posts['article_title'].fillna('', inplace=True)
posts['article_title_sentiment'] = posts.article_title.apply(
    paragraph_sentiment)
posts['message_sentiment'] = posts.message.apply(paragraph_sentiment)

# Calculating sentiment
bing_scores = get_scores(docs['message'], bing)
afinn_scores = get_scores(docs['message'], afinn)
syuzhet_scores = get_scores(docs['message'], syuzhet)
nrc_scores = get_scores(docs['message'],
                        nrc)  # used version 2 of the nrc lexicon
vader_scores = docs.message.apply(paragraph_sentiment)
all_methods = pd.DataFrame(
    {
        'bing': bing_scores,
        'afinn': afinn_scores,
        'syuzhet': syuzhet_scores,
        'nrc': nrc_scores
    },
    index=docs.index).div(docs.n_sents, axis='index')
all_methods = all_methods.apply(lambda x: [normalize(v) for v in x])
all_methods['vader'] = vader_scores
Example #16
def main():
    standard_split = [
        ([0, 1, 2, 11], [3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15])]
    pairwise_split = list(permutations([[x] for x in range(0, 16)], 2))
    # %%
    param_combinations = product_dict(**dict(
        seed=[42, 43, 44],
        test_size=[0.2],
        labeled_data=[0.1, 0.3,  0.5, 0.8, 1.0],
        fixed_cont=[0.05, 0.1],
        n_oe=[0],
        use_nn=[True],
        pair=standard_split
    ))
    # how many samples per class are used for all tests
    n_class = 3000

    # split the outlier, inlier tuple pairs and print all parameters for run
    for d in param_combinations:
        d["inliers"], d["outliers"] = d["pair"]
        d.pop('pair', None)

    #data_path = "/home/philipp/projects/dad4td/data/processed/20_news_imdb_vec.pkl"
    data_path = "/home/philipp/projects/dad4td/data/raw/QS-OCR-Large/rvl_cdip.pkl"
    oe_path = "/home/philipp/projects/dad4td/data/processed/oe_data.pkl"
    res_path = next_path(
        "/home/philipp/projects/dad4td/reports/semisupervised/semisup_rvl_pw_%04d.tsv")

    doc2vec_model = Doc2VecModel("apnews", "apnews", 1.0,
                                 100, 1,
                                 "/home/philipp/projects/dad4td/models/apnews_dbow/doc2vec.bin")

    # load data and get the doc2vec vectors for all of the data used
    df_full = pd.read_pickle(data_path)

    # sample only a portion of the data
    df_full = df_full.groupby('target', group_keys=False).apply(
        lambda df: df.sample(n=n_class, random_state=42))

    # %%
    df_full["vecs"] = doc2vec_model.vectorize(df_full["text"])
    df_full["vecs"] = df_full["vecs"].apply(tuple)

    # %%
    result_df = pd.DataFrame()
    for i, params in enumerate(param_combinations):
        print(
            f"\n\n---------------------\n\nRun {i+1} out of {len(param_combinations)}\n\n{params}")

        df, df_test = prepare_data(df_full, **params)

        # UMAP Train
        docvecs, umap_model = umap_reduce(
            df["vecs"].to_list(), df["label"], None, **params)

        # Ivis
        docvecs, ivis_model = ivis_reduce(
            docvecs, df["label"], None, **params)

        # remove OE data, so it's not scored as well
        df["decision_scores"] = docvecs
        df = df.where(df.scorable == 1).dropna()

        # find outliers in 1D scores
        preds, iqr_model = score_out_preds(df["decision_scores"], None,
                                           contamination=df.outlier_label.value_counts(normalize=True)[-1])

        # score the predictions for outliers
        scores = get_scores(dict(), df["outlier_label"], preds)

        # %%
        #  write the scores to df and save
        scores.update(params)
        scores["data"] = "train"
        result_df = result_df.append(scores, ignore_index=True)
        result_df.to_csv(res_path, sep="\t")
        print(f"\nTraining scores:\n{pd.DataFrame([scores], index=[0])}")
        # %%
        # test UMAP and ivis
        docvecs_test, _ = umap_reduce(
            df_test["vecs"].to_list(), None, umap_model, **params)

        docvecs_test, _ = ivis_reduce(docvecs_test, None, ivis_model, **params)

        # remove OE data, so it's not scored as well
        df_test["decision_scores"] = docvecs_test
        df_test = df_test.where(df_test.scorable == 1).dropna()

        # find outliers in 1D scores
        preds = iqr_model.transform(
            df_test["decision_scores"], thresh_factor=1)

        # score the predictions for outliers
        scores = get_scores(dict(), df_test["outlier_label"], preds)

        # write the scores to df and save
        scores.update(params)
        scores["data"] = "test"
        result_df = result_df.append(scores, ignore_index=True)
        result_df.to_csv(res_path, sep="\t")
        print(f"\nTest scores:\n{pd.DataFrame([scores], index=[0])}")
Example #17
def run_vcl(hidden_size,
            num_epochs,
            data_generator,
            coreset_method,
            coreset_size=0,
            batch_size=None,
            single_head=True):
    """It runs the variational continual learning algorithm presented in "Variational Continual Learning" (2018) by
    Cuong V. Nguyen et al.

    :param hidden_size:
    :param num_epochs:
    :param data_generator:
    :param coreset_method:
    :param coreset_size:
    :param batch_size:
    :param single_head:
    :return:
    """
    in_dim, out_dim = data_generator.get_dims()

    # Coresets are small, representative subsets of each task's training data
    # that are replayed before prediction; the test sets below are held out
    # for evaluation only.
    x_coresets, y_coresets = [], []

    x_testsets, y_testsets = [], []

    all_acc = np.array([])

    # max_iter corresponds to the number of tasks
    for task_id in range(data_generator.max_iter):
        x_train, y_train, x_test, y_test = data_generator.next_task()

        x_testsets.append(x_test)
        y_testsets.append(y_test)

        # Set the readout head to train
        head = 0 if single_head else task_id

        bsize = x_train.shape[0] if (batch_size is None) else batch_size

        # Train network with maximum likelihood to initialize first model
        if task_id == 0:
            ml_model = VanillaNN(in_dim, hidden_size, out_dim,
                                 x_train.shape[0])

            ml_model.train(x_train, y_train, task_id, num_epochs, bsize)

            mf_weights = ml_model.get_weights()

            mf_variances = None

            ml_model.close_session()

        # Select coreset if needed
        if coreset_size > 0:
            x_coresets, y_coresets, x_train, y_train = coreset_method(
                x_coresets, y_coresets, x_train, y_train, coreset_size)

        # Train on non-coreset data
        mf_model = MeanFieldVINN(in_dim,
                                 hidden_size,
                                 out_dim,
                                 x_train.shape[0],
                                 prev_means=mf_weights,
                                 prev_log_variances=mf_variances)
        mf_model.train(x_train, y_train, head, num_epochs, bsize)
        mf_weights, mf_variances = mf_model.get_weights()

        # Incorporate coreset data and make prediction
        acc = utils.get_scores(mf_model, x_testsets, y_testsets, x_coresets,
                               y_coresets, hidden_size, num_epochs,
                               single_head, batch_size)

        all_acc = utils.concatenate_results(acc, all_acc)

        mf_model.close_session()

    return all_acc
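A hedged usage sketch for the function above; the data generator and coreset helper named here (PermutedMnistGenerator, random_coreset) are placeholders for whatever the surrounding project defines, not part of the snippet:

# hypothetical helpers, shown only to illustrate the call signature
data_generator = PermutedMnistGenerator(max_iter=5)
all_acc = run_vcl(hidden_size=[100, 100],
                  num_epochs=120,
                  data_generator=data_generator,
                  coreset_method=random_coreset,
                  coreset_size=200,
                  batch_size=256,
                  single_head=True)
print(all_acc)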
Example #18
def train_test(result_df, df_t, df_r, df_test):

    # data

    x_target = np.array(df_t.vecs.to_list())
    x_ref = np.array(df_r.vecs.to_list())
    y_ref = np.array(df_r.target.to_list())
    y_ref = to_categorical(y_ref)
    test_vecs = np.array(df_test.vecs.to_list())

    n_sup = 10000
    n_per_targ = 1000
    df_r_temp = df_r.groupby('target', group_keys=False).apply(
        lambda df: df.sample(n=min(df.shape[0], n_per_targ), random_state=42))

    x_tr = np.array(df_t.head(n_sup).append(df_r_temp).vecs.to_list())
    y_tr = np.array(df_t.head(n_sup).append(df_r_temp).label.to_list())

    #y_tr = to_categorical(y_tr)

    #print(f"{df.where(df.label == 0).dropna().target.value_counts()}")

    #print(f"x_target: {x_target.shape}\nx_ref: {x_ref.shape}\ny_ref: {y_ref.shape}\n")

    res_path = "/home/philipp/projects/dad4td/reports/one_class/all.tsv"
    classes = df_r.target.unique().shape[0]
    print(f"classes: {classes}")
    batchsize = 128
    epoch_num = 15
    epoch_report = 5
    feature_out = 64
    pred_mode = "nn"

    # get the loss for compactness
    original_loss = create_loss(classes, batchsize)

    # model creation
    model = create_model(loss="binary_crossentropy", n_in=x_target[0].shape[0])

    model_t = Model(inputs=model.input, outputs=model.output)

    model_r = Network(inputs=model_t.input,
                      outputs=model_t.output,
                      name="shared_layer")

    prediction = Dense(classes, activation='softmax')(model_t.output)
    model_r = Model(inputs=model_r.input, outputs=prediction)

    #latent_t = Dense(2, activation='relu')(model_t.output)
    #model_t = Model(inputs=model_t.input,outputs=latent_t)
    prediction_t = Dense(feature_out, activation='softmax')(model_t.output)
    model_t = Model(inputs=model_t.input, outputs=prediction_t)

    #optimizer = SGD(lr=5e-5, decay=0.00005)
    optimizer = Adam(learning_rate=5e-5)

    model_r.compile(optimizer=optimizer, loss="categorical_crossentropy")
    model_t.compile(optimizer=optimizer, loss=original_loss)

    model_t.summary()
    model_r.summary()

    ref_samples = np.arange(x_ref.shape[0])
    loss, loss_c = [], []
    epochs = []
    best_acc = 0
    print("training...")

    for epochnumber in range(epoch_num):
        x_r, y_r, lc, ld = [], [], [], []

        np.random.shuffle(x_target)

        np.random.shuffle(ref_samples)
        for i in range(len(x_ref)):
            x_r.append(x_ref[ref_samples[i]])
            y_r.append(y_ref[ref_samples[i]])
        x_r = np.array(x_r)
        y_r = np.array(y_r)

        for i in range(int(len(x_target) / batchsize)):
            batch_target = x_target[i * batchsize:i * batchsize + batchsize]
            batch_ref = x_r[i * batchsize:i * batchsize + batchsize]
            batch_y = y_r[i * batchsize:i * batchsize + batchsize]
            # target data
            lc.append(
                model_t.train_on_batch(batch_target,
                                       np.zeros((batchsize, feature_out))))

            # reference data
            ld.append(model_r.train_on_batch(batch_ref, batch_y))

        loss.append(np.mean(ld))
        loss_c.append(np.mean(lc))
        epochs.append(epochnumber)

        if epochnumber % epoch_report == 0 or epochnumber == epoch_num - 1:
            print(
                f"-----\n\nepoch: {epochnumber+1}, descriptive loss: {loss[-1]}, compact loss: {loss_c[-1]}"
            )

            model_t.save_weights(
                '/home/philipp/projects/dad4td/models/one_class/model_t_smd_{}.h5'
                .format(epochnumber))
            model_r.save_weights(
                '/home/philipp/projects/dad4td/models/one_class/model_r_smd_{}.h5'
                .format(epochnumber))
            #test_b = model_t.predict(test_vecs)

            #od = OCSVM()
            # od.fit(test_b)

            #decision_scores = od.labels_

            # decision_scores = decision_scores.astype(float)

            labels = df_test["label"].astype(int).values

            # threshold = 0.5
            # scores = get_scores(dict(),labels, np.where(decision_scores > threshold, 0, 1), outlabel=0)
            if pred_mode == "svm":
                x_tr_pred = model_t.predict(x_tr)
                clf = SVC()
                clf.fit(x_tr_pred, y_tr)

                preds = model_t.predict(test_vecs)
                preds = clf.predict(preds)
            elif pred_mode == "nn":
                y_tr = y_tr.astype(int)
                print(y_tr)
                x_tr_pred = model_t.predict(x_tr)
                clf = create_sup_model(n_in=feature_out)
                clf.summary()
                clf.fit(x_tr_pred,
                        y=y_tr,
                        epochs=15,
                        batch_size=64,
                        verbose=True)

                decision_scores = model_t.predict(test_vecs)
                decision_scores = clf.predict(decision_scores)
                preds = decision_scores.astype(float)

                _ = plt.hist(preds, bins=10)
                plt.show()

            else:
                raise Exception(f"{pred_mode} must be one of svm, nn, osvm")

            scores = get_scores(dict(), labels, preds, outlabel=0)
            print(f"\n\nTest scores:\n{pd.DataFrame([scores], index=[0])}")
            if scores["accuracy"] > best_acc:
                best_acc = scores["accuracy"]
                print(f"best_acc updated to: {best_acc}")
            normalize = "true"
            print(f"{confusion_matrix(labels, preds, normalize=normalize)}")
    result_df = result_df.append(dict(cclass=list(df_test.target.unique()),
                                      accuracy=best_acc),
                                 ignore_index=True)
    result_df.to_csv(res_path, sep="\t")
    return result_df
Example #19
if __name__ == "__main__":

    parser = OptionParser()
    parser.add_option("-i",
                      dest="in_dir", type="string", default="data/gifs/test",
                      help="Load images from this directory")
    parser.add_option("-o",
                      dest="out_file", type="string", default="test.csv",
                      help="Save images to this file")
    (options, args) = parser.parse_args()


    face_cascade = cv2.CascadeClassifier('haar_data/haarcascade_frontalface_default.xml')


    scores = get_scores('data/json/', 'multi_hot')
    scores = vectorize_scores(scores)

    f = open(options.out_file, 'w')
    f.write('emotion,pixels\n')

    gifs = os.listdir(options.in_dir)

    for gif in gifs:
        frames = os.listdir(options.in_dir + '/' + gif)

        score = scores[gif]

        # Sample 10 equally-spaced frames if we have >10 frames
        lf = len(frames)
        if lf > 10:
Example #20
def run_vcl(hidden_size,
            no_epochs,
            data_gen,
            coreset_method,
            coreset_size=0,
            batch_size=None,
            single_head=True,
            sd=0,
            lr=0.001):
    print("seed ", sd)
    in_dim, out_dim = data_gen.get_dims()
    print('in dim , out ', in_dim, out_dim)
    x_coresets, y_coresets = [], []
    x_testsets, y_testsets = [], []
    path_folder_result = create_path_file_result(lr, sd)

    all_acc = np.array([])
    print("max iter ", data_gen.max_iter)

    for task_id in range(data_gen.max_iter):
        x_train, y_train, x_test, y_test = data_gen.next_task()
        x_testsets.append(x_test)
        y_testsets.append(y_test)

        # Set the readout head to train
        head = 0 if single_head else task_id
        bsize = x_train.shape[0] if (batch_size is None) else batch_size

        # Train network with maximum likelihood to initialize first model
        if task_id == 0:
            ml_model = Vanilla_NN(in_dim, hidden_size, out_dim,
                                  x_train.shape[0])
            ml_model.train(x_train, y_train, task_id, no_epochs, bsize)
            mf_weights = ml_model.get_weights()
            mf_variances = None
            ml_model.close_session()

        # Select coreset if needed
        if coreset_size > 0:
            x_coresets, y_coresets, x_train, y_train = coreset_method(
                x_coresets, y_coresets, x_train, y_train, coreset_size)

        # Train on non-coreset data
        s_time = time.time()
        print("batch size ", bsize)
        mf_model = MFVI_NN(in_dim,
                           hidden_size,
                           out_dim,
                           x_train.shape[0],
                           prev_means=mf_weights,
                           prev_log_variances=mf_variances,
                           learning_rate=lr)
        mf_model.train(x_train, y_train, head, no_epochs, bsize)
        e_time = time.time()
        print("time train ", e_time - s_time)
        mf_weights, mf_variances = mf_model.get_weights()

        # Incorporate coreset data and make prediction
        acc = utils.get_scores(mf_model, x_testsets, y_testsets, x_coresets,
                               y_coresets, hidden_size, no_epochs, single_head,
                               batch_size)
        all_acc = utils.concatenate_results(acc, all_acc)
        print(all_acc)
        write_data_to_file(
            all_acc,
            path_folder_result + "/result_vcl_split_seed" + str(sd) + ".csv")

        mf_model.close_session()

    return all_acc
Example #21
            #####
            # test UMAP and neural net
            docvecs_test, _ = umap_reduce(df_test["vecs"].to_list(), None,
                                          umap_model, **params)

            docvecs_test, _ = neuralnet(docvecs_test,
                                        None,
                                        nnet,
                                        n_out=1,
                                        loss="binary_crossentropy",
                                        **params)

            # get prediction scores
            threshold = 0.5
            scores = get_scores(dict(),
                                df_test["label"].astype(int).values,
                                np.where(docvecs_test > threshold, 1, 0),
                                outlabel=0)

            ####
            # write scores
            ####
            # write the scores to df and save
            scores.update(params)
            scores["n_class"] = n_class
            scores["data"] = "test"
            scores["threshold"] = threshold
            scores["doc2vec_model"] = doc2vec_model.model_name
            result_df = result_df.append(scores, ignore_index=True)
            result_df.to_csv(res_path, sep="\t")
            print(f"\nTest scores:\n{pd.DataFrame([scores], index=[0])}")
Example #22
        
            targets = captions[:, 1:] # removing the start token 

            targets = pack_padded_sequence(targets, [len(tar) - 1 for tar in targets], batch_first=True)[0]
            packed_preds = pack_padded_sequence(preds, [len(pred) - 1 for pred in preds], batch_first=True)[0]

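            # doubly stochastic attention regularization (as in "Show, Attend
            # and Tell"): encourages attention weights to sum to ~1 over time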
            att_regularization = args.alpha_c * ((1 - alphas.sum(1))**2).mean()

            loss = cross_entropy_loss(packed_preds, targets)
            loss += att_regularization
            loss.backward()
            optimizer.step()

            total_caption_length = calculate_caption_lengths(word_dict, captions)
            losses.update(loss.item(), total_caption_length)
            if batch_idx % args.log_interval == 0:
                print('Train Batch: [{0}/{1}]\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                        batch_idx, len(train_loader), loss=losses))
            if debug:
                break
        
        # x = get_scores(model,train_loader)
        y = get_scores(model, val_loader, word_dict, idx_dict, device, debug)
        z = get_scores(model, test_loader, word_dict, idx_dict, device, debug)
        torch.save(model.state_dict(), Path(args.result_dir) / f"{epoch}.pth")
        print(f"epoch = {epoch} Val : {y} Test : {z}")



Example #23
ground_truth_targets_val = np.load(join(input_path, 'targets_validation.npy'))
ground_truth_targets_tst = np.load(join(input_path, 'targets_test.npy'))

# Get model variables.
models_env = os.environ['models_exp']
seed_ensemble_env = os.environ['seed_ensemble_exp']
models = np.array(models_env.split(':')).astype(int)
seed_ensemble = np.array(seed_ensemble_env.split(':')).astype(int)

# Load the results of the ensemble in the validation set. 
predictions_mean_val, _ = load_results(models, 
	seed_ensemble, model_path, data='val')

# Find the best epoch in the validation set.
ensemble_mean_val = get_ensemble_mean(predictions_mean_val)
r2_val = get_scores(ensemble_mean_val, ground_truth_targets_val, data='val')
best_epoch = early_stopping(r2_val) 

# Load the results of the ensemble in the test set and compute mean and variance. 
predictions_mean_tst, predictions_variance_tst = load_results(models, 
	    seed_ensemble, model_path, data='tst', n_epoch=best_epoch)
ensemble_mean_tst = get_ensemble_mean(predictions_mean_tst)
ensemble_variance_tst = get_ensemble_variance(predictions_mean_tst, 
        predictions_variance_tst)
r2_tst, rmse_tst, mae_tst, xfold2_tst = get_scores(ensemble_mean_tst, 
        ground_truth_targets_tst, data='tst')

# Save table containing performance metrics in the test set.
with open(join(output_path, 'prediction_performance.txt'), 'w') as f_out:
    f_out.write('data\tr2\trmse\tmae\t%-within-2-fold\n')
    f_out.write('test set\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\n'.format(
        r2_tst, rmse_tst, mae_tst, xfold2_tst))
Example #24
def run_vcl(hidden_size,
            no_epochs,
            data_gen,
            coreset_method,
            coreset_size=0,
            batch_size=None,
            single_head=True):
    in_dim, out_dim = data_gen.get_dims()
    x_coresets, y_coresets = [], []
    x_testsets, y_testsets = [], []

    all_acc = np.array([])

    for task_id in list(range(data_gen.max_iter)):
        x_train, y_train, x_test, y_test = data_gen.next_task()
        x_testsets.append(x_test)
        y_testsets.append(y_test)

        # Set the readout head to train
        head = 0 if single_head else task_id
        bsize = x_train.shape[0] if (batch_size is None) else batch_size

        # Train network with maximum likelihood to initialize first model
        if task_id == 0:
            mf_variances = None
            mf_weights = None

        # Select coreset if needed
        if coreset_size > 0:
            x_coresets, y_coresets, x_train, y_train = coreset_method(
                x_coresets, y_coresets, x_train, y_train, coreset_size)

        # Train on non-coreset data
        mf_model = CVI_NN(in_dim,
                          hidden_size,
                          out_dim,
                          x_train.shape[0],
                          prev_means=mf_weights,
                          prev_log_variances=mf_variances)
        no_epochs = 0 if task_id == 1 else 10
        mf_model.train(x_train, y_train, head, no_epochs, bsize)
        mf_weights, mf_variances = mf_model.create_weights()
        prev_mf_weights, prev_mf_variances = mf_weights, mf_variances

        # Incorporate coreset data and make prediction
        acc = utils.get_scores(mf_model, x_testsets, y_testsets, x_coresets,
                               y_coresets, hidden_size, no_epochs, single_head,
                               batch_size)
        all_acc = utils.concatenate_results(acc, all_acc)
        print(acc)
        mf_model.close_session()

    return all_acc
Example #25
def run_vcl(hidden_size, no_epochs, data_gen, coreset_method, coreset_size=0, batch_size=None, single_head=True, train_info = None):
    in_dim, out_dim = data_gen.get_dims()
    x_coresets, y_coresets = [], []
    x_testsets, y_testsets = [], []

    all_acc = np.array([])
    all_acc_for_save = np.zeros((data_gen.max_iter, data_gen.max_iter), dtype=np.float32)
    
    for task_id in range(data_gen.max_iter):
        x_train, y_train, x_test, y_test = data_gen.next_task()
        x_testsets.append(x_test)
        y_testsets.append(y_test)

        # Set the readout head to train
        head = 0 if single_head else task_id
        bsize = x_train.shape[0] if (batch_size is None) else batch_size

        # Train network with maximum likelihood to initialize first model
        if task_id == 0:
            print('Vanilla NN train for task 0!')
            ml_model = Vanilla_NN(in_dim, hidden_size, out_dim, x_train.shape[0])
            ml_model.train(x_train, y_train, task_id, no_epochs, bsize)
            mf_weights = ml_model.get_weights()
            mf_variances = None
            ml_model.close_session()

        # Select coreset if needed
        if coreset_size > 0:
            x_coresets, y_coresets, x_train, y_train = coreset_method(x_coresets, y_coresets, x_train, y_train, coreset_size)
        print('Current task : {}'.format(task_id))
        # Train on non-coreset data
        mf_model = MFVI_NN(in_dim, hidden_size, out_dim, x_train.shape[0], prev_means=mf_weights, prev_log_variances=mf_variances)
        mf_model.train(x_train, y_train, head, no_epochs, bsize)
        mf_weights, mf_variances = mf_model.get_weights()

        # Incorporate coreset data and make prediction
        acc = utils.get_scores(mf_model, x_testsets, y_testsets, x_coresets, y_coresets, hidden_size, no_epochs, single_head, batch_size)
        all_acc = utils.concatenate_results(acc, all_acc)
        
        
        for u in range(task_id + 1):

            print('>>> Test on task {:2d} : acc={:5.1f}% <<<'.format(u, 100 * acc[u]))
            all_acc_for_save[task_id, u] = acc[u]

        # Save

        log_name = '{}_{}_{}_{}epochs_batch{}_{}_{}coreset_{}'.format(train_info['date'], train_info['experiment'], train_info['tasknum'], no_epochs, train_info['batch'], train_info['coreset_method'], coreset_size, train_info['trial'])
        
        if single_head:
            log_name += '_single'
        
        save_path = './results/' + log_name + '.txt'
        print('Save at ' + save_path)
        np.savetxt(save_path, all_acc_for_save, '%.4f')
        
        
        
        mf_model.close_session()

    return all_acc
Example #26
def get_scores():
    return utils.get_scores(mongo.db)