Example 1
def test():
    # testing the output of the vision model
    ids_fn_sem = ddir.sp_testSem_ids_fn
    ids_fn = ddir.sp_test_ids_fn
    with open(ids_fn, "r") as fo:
        ids = map(lambda x: x.strip('\n'), fo.readlines())
    with open(ids_fn_sem, "r") as fo:
        ids_sem = map(lambda x: x.strip('\n'), fo.readlines())
    value_bow, gtKwDict, captionsDict = data_io.get_semValues(
        ddir.labels_csv, ddir.keywords_test)
    count_bow = data_io.get_semCounts(ddir.counts_csv, ddir.keywords_test)
    vis_multi_sem = np.zeros([len(ids_sem), len(mapping)])
    vis_multi_exact, gt_multi = [], []
    for i in range(len(ids)):
        # caption = ' '.join(captions_dict[ids[i]])
        idx = [ids[i]]
        vis_vec = np.stack(map(lambda x: vision_bow_vec[x[4:-2]], idx), axis=0)
        caption_vec = np.stack(
            map(lambda x: caption_bow_vec1[x],
                idx), axis=0)  # GT for evaluating exact match kw pred metrics
        vis_vec_mapped = vis_vec[0][mapping]
        vis_multi_exact.append(vis_vec)
        gt_multi.append(caption_vec)
        z = idx[0].split("_")
        del z[0]
        idxnew = "_".join(z)
        if (idxnew in ids_sem):
            vis_multi_sem[ids_sem.index(idxnew)] = vis_vec_mapped

    vis_multi_exact, gt_multi = np.concatenate(
        vis_multi_exact, axis=0), np.concatenate(gt_multi, axis=0)

    eer, ap, prec10, precN = utils.get_metrics(vis_multi_exact.T, gt_multi.T)
    pcont = "Overall ratings: EER: %f, Average precision: %f, Precision@10: %f, Precision@N: %f" % (
        eer, ap, prec10, precN)
    print(pcont)
    with open("vis_exact.csv", "a+") as fo:
        fo.write(
            str(prec10 * 100) + ',' + str(precN * 100) + ',' + str(eer * 100) +
            ',' + str(ap * 100) + '\n')

    eer, ap, spearman, prec10, precN = utils.get_metrics(
        vis_multi_sem, value_bow, count_bow)
    pcont = "Subjective ratings: EER: %f, Average precision: %f, Precision@10: %f, Precision@N: %f, Spearman's rho: %f" % (
        eer, ap, prec10, precN, spearman)
    print(pcont)
    with open("vision_sem.csv", "a+") as fo:
        fo.write(
            str(spearman) + ',' + str(prec10 * 100) + ',' + str(precN * 100) +
            ',' + str(eer * 100) + ',' + str(ap * 100) + '\n')
Example 2
def eval_model(args):
    with tf.Session() as sess:
        iterator = BigEarthNet(args['test_tf_record_files'],
                               args['batch_size'], 1, 0,
                               args['label_type']).batch_iterator
        nb_iteration = int(
            np.ceil(float(args['test_size']) / args['batch_size']))
        iterator_ins = iterator.get_next()

        model = importlib.import_module('models.' +
                                        args['model_name']).DNN_model(
                                            args['label_type'],
                                            args['modality'])
        model.create_network()

        variables_to_restore = tf.global_variables()
        metric_names, metric_means, metric_update_ops = get_metrics(
            model.multi_hot_label, model.predictions, model.probabilities)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        model_saver = tf.train.Saver(max_to_keep=0,
                                     var_list=variables_to_restore)
        model_file = args['model_file']
        model_saver.restore(sess, model_file)

        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(
            os.path.join(args['out_dir'], 'logs', 'test'), sess.graph)

        iteration_idx = 0

        progress_bar = tf.contrib.keras.utils.Progbar(target=nb_iteration)
        eval_res = {}
        while True:
            try:
                batch_dict = sess.run(iterator_ins)
                iteration_idx += 1
                progress_bar.update(iteration_idx)
            except tf.errors.OutOfRangeError:
                print()
                means = sess.run(metric_means[0])
                for idx, name in enumerate(metric_names[0]):
                    eval_res[name] = str(means[idx])
                    print(name, means[idx])
                break

            sess_res = sess.run([metric_update_ops, summary_op] +
                                metric_means[1],
                                feed_dict=model.feed_dict(batch_dict))
            summary_writer.add_summary(sess_res[1], iteration_idx)
            metric_means_res = sess_res[2:]

        for idx, name in enumerate(metric_names[1]):
            eval_res[name] = str(metric_means_res[idx])
            print(name, metric_means_res[idx])

        with open(os.path.join(args['out_dir'], 'eval_result.json'),
                  'w') as f:
            json.dump(eval_res, f)
Example 3
    def collect(self):
        # Request data from ambari Collect Host API
        # Request exactly the System level information we need from node
        # beans returns a type of 'List'
        try:
            count = 0
            # If the key metrics are not yet present in the JMX response, sleep and retry a few times
            while count < 5:
                beans = utils.get_metrics(self._url)
                if 'init_total_count_tables' not in beans:
                    count += 1
                    time.sleep(1)
                    continue
                else:
                    break
        except:
            logger.info("Can't scrape metrics from url: {0}".format(self._url))
        else:
            pass
        finally:
            # set up all metrics with labels and descriptions.
            self._setup_labels(beans)

            # add metric value to every metric.
            self._get_metrics(beans)

            # update hiveserver2 metrics with common metrics
            common_metrics = common_metrics_info(self._cluster, beans, "hive",
                                                 "hiveserver2")
            self._hadoop_hiveserver2_metrics.update(common_metrics())

            for i in range(len(self._merge_list)):
                service = self._merge_list[i]
                for metric in self._hadoop_hiveserver2_metrics[service]:
                    yield self._hadoop_hiveserver2_metrics[service][metric]
Example 4
def main():
    cluster = "cluster_indata"
    beans = utils.get_metrics("http://10.110.13.164:50070/jmx")
    component = "hdfs"
    service = "namenode"
    common_metrics = common_metrics_info(cluster, beans, component, service)
    print common_metrics()
Example 5
def run_cv_pred(X, y, clf, n_folds):
    """
    Run n-fold cross validation returning a prediction for every row of X
    :param X: A scipy sparse feature matrix
    :param y: The target labels corresponding to rows of X
    :param clf: The scikit-learn classifier to fit on each fold
    :param n_folds: The number of cross-validation folds
    :return: an array of predictions, one per row of X
    """
    # Construct a kfolds object
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True)
    splits = skf.split(X, y)
    y_pred = y.copy()

    # Iterate through folds
    for idx, (train_index, test_index) in enumerate(splits):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        macro, micro = utils.get_metrics(preds, y[test_index])
        print 'run ', idx
        print 'macro: ', macro
        print 'micro: ', micro
        y_pred[test_index] = preds

    return y_pred
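Note: the `utils.get_metrics` helper used above is not included in the example. A minimal stand-in that returns (macro-F1, micro-F1) for hard label predictions, written here as an assumption with scikit-learn rather than the repository's actual code, might look like this:

from sklearn.metrics import f1_score

def get_metrics(preds, y_true):
    # Hypothetical stand-in for utils.get_metrics (not the original implementation):
    # macro- and micro-averaged F1 for hard label predictions.
    macro = f1_score(y_true, preds, average='macro')
    micro = f1_score(y_true, preds, average='micro')
    return macro, micro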
Example 6
def testSem():
    network.eval()
    ids_fn_sem = ddir.sp_testSem_ids_fn
    ids_fn = ddir.sp_test_ids_fn
    with open(ids_fn, "r") as fo:
        ids = map(lambda x: x.strip('\n'), fo.readlines())
    with open(ids_fn_sem, "r") as fo:
        ids_sem = map(lambda x: x.strip('\n'), fo.readlines())

    mapping = data_io.get_mapping(ddir.flickr8k_keywords, ddir.keywords_test)
    value_bow = data_io.get_semValues(ddir.labels_csv, ddir.keywords_test)
    count_bow = data_io.get_semCounts(ddir.counts_csv, ddir.keywords_test)
    pred_multi = np.zeros([len(ids_sem), len(mapping)])
    for i in range(len(ids)):
        idx = [ids[i]]
        z = idx[0].split("_")
        del z[0]
        idxnew = "_".join(z)
        if (idxnew in ids_sem):
            Xs, _ = data_io.load_mfcc(ddir.mfcc_dir, idx, args.n_pad)
            Xs = np.transpose(Xs, (0, 2, 1))
            Ys = np.stack(map(lambda x: vision_bow_vec[x[4:-2]], idx), axis=0)
            caption_Ys = np.stack(map(lambda x: caption_bow_vec[x], idx),
                                  axis=0)
            pred = getKWprob(Xs)
            predMapped = pred[0][mapping]
            pred_multi[ids_sem.index(idxnew)] = predMapped
    eer, ap, spearman, prec10, precN = utils.get_metrics(
        pred_multi, value_bow, count_bow)
    pcont = "Subjective ratings: EER: %f, Average precision: %f, Precision@10: %f, Precision@N: %f, Spearman's rho: %f" % (
        eer, ap, prec10, precN, spearman)
    print(pcont)
    with open(saveLog, "a+") as fo:
        fo.write(pcont + "\n")
Example 7
def evaluate_test_sample(X, y, clf, nreps, name, results, train_pct):
    """
    Calculate results for this clf at various train / test split percentages
    :param X: features
    :param y: targets
    :param clf: detector
    :param nreps: number of random repetitions
    :param name: name of the detector
    :param results: A tuple of Pandas DataFrames containing (macro, micro) F1 results
    :param train_pct: The percentage of the data used for training
    :return: A tuple of Pandas DataFrames containing (macro, micro) F1 results
    """
    seed = 0
    for rep in range(nreps):
        # setting a random seed will cause the same sample to be generated each time
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_pct, random_state=seed, stratify=y)
        seed += 1
        clf.fit(X_train, y_train)
        try:  # Gradient boosted trees do not accept sparse matrices in the predict function currently
            preds = clf.predict(X_test)
        except TypeError:
            preds = clf.predict(X_test.todense())
        macro, micro = utils.get_metrics(y_test, preds, auc=False)
        results[0].loc[name, rep] = macro
        results[1].loc[name, rep] = micro
    return results
Example 8
    def train(self, ep: int):
        '''Train the model for one epoch

        Args:
            ep(int): the current epoch
        '''
        self.model.train()
        size = len(self.train_loader)
        # run one epoch of training
        for step, (X, y, _) in tqdm(enumerate(self.train_loader),
                                    desc='Epoch {:3d}'.format(ep),
                                    total=size):
            X = X.to(self.device)  # type: torch.Tensor
            y = y.to(self.device)  # type: torch.Tensor
            self.optimizer.zero_grad()
            y_ = self.model(X)  # type: torch.Tensor
            loss = self.criterion(y_, y)  # type: torch.Tensor
            loss.backward()
            self.optimizer.step()

            y_ = y_.argmax(dim=1).cpu().numpy()
            y = y.cpu().numpy()

            # compute running metrics
            miou, _, pacc = get_metrics(y, y_)
            # write to TensorBoard
            n_iter = ep * size + step
            self.writer.add_scalar('train/pacc', pacc, n_iter)
            self.writer.add_scalar('train/mIoU', miou, n_iter)
            self.writer.add_scalar('train/loss', loss.item(), n_iter)
Example 9
    def validate(self, ep: int):
        '''Validate the model

        Args:
            ep(int): the current epoch
        '''
        mious, paccs = [], []
        total_loss = 0
        self.model.eval()

        with torch.no_grad():
            for X, y, _ in tqdm(self.val_loader, desc='Validating'):
                X, y = X.to(self.device), y.to(self.device)
                y_ = self.model(X)
                loss = self.criterion(y_, y)
                total_loss += loss.item()
                y_ = y_.argmax(dim=1)
                y_gd = y.cpu().numpy()
                y_pred = y_.cpu().numpy()
                miou, _, pacc = get_metrics(y_gd, y_pred)
                mious.append(miou)
                paccs.append(pacc)

        avg_loss = total_loss / len(self.val_loader)
        miou = np.average(mious)
        pacc = np.average(paccs)

        print(ep, miou, pacc)

        # log the results
        self.writer.add_scalar('test/pacc', pacc, ep)
        self.writer.add_scalar('test/mIoU', miou, ep)
        self.writer.add_scalar('test/avg_loss', avg_loss, ep)
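The `get_metrics(y_true, y_pred)` used by `train` and `validate` above returns (mIoU, per-class IoUs, pixel accuracy) for segmentation masks; its implementation is not shown. A minimal sketch of such a helper, assuming integer class masks and a hypothetical `num_classes` argument:

import numpy as np

def get_metrics(y_true: np.ndarray, y_pred: np.ndarray, num_classes: int = 4):
    # Hypothetical stand-in (not the original implementation): mean IoU,
    # per-class IoUs, and pixel accuracy for integer-labelled masks.
    ious = []
    for c in range(num_classes):
        inter = np.logical_and(y_true == c, y_pred == c).sum()
        union = np.logical_or(y_true == c, y_pred == c).sum()
        ious.append(inter / union if union > 0 else float('nan'))
    miou = float(np.nanmean(ious))
    pacc = float((y_true == y_pred).mean())
    return miou, ious, pacc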
Example 10
    def collect(self):
        self._clear_init()
        # Send an HTTP request to the JMX URL to fetch the metric data.
        # The response is parsed into the list of JMX beans (a JSON array).
        try:
            # fetch the JMX JSON data over HTTP
            beans = utils.get_metrics(self._url)
        except:
            logger.info("Can't scrape metrics from url: {0}".format(self._url))
            pass
        else:
            # set up every MBean we monitor, with labels and descriptions for each metric
            self._setup_metrics_labels(beans)

            # assign a value to every metric
            self._get_metrics(beans)

            # merge the common metrics into the NameNode metrics
            common_metrics = common_metrics_info(self._cluster, beans, "hdfs",
                                                 "namenode")
            self._hadoop_namenode_metrics.update(common_metrics())

            # iterate over every metric group (NameNode-specific and common)
            # and yield each metric with its labels
            for i in range(len(self._merge_list)):
                service = self._merge_list[i]
                for metric in self._hadoop_namenode_metrics[service]:
                    yield self._hadoop_namenode_metrics[service][metric]
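These `collect()` methods follow the prometheus_client custom-collector pattern: the exporter registers the collector once, and `collect()` is invoked on every scrape. A minimal wiring sketch; the collector class name, constructor arguments, and port are assumptions, not taken from the examples:

import time
from prometheus_client import start_http_server
from prometheus_client.core import REGISTRY

# Hypothetical registration of a custom collector such as the one above;
# the class name and constructor arguments are assumptions.
# REGISTRY.register(NameNodeMetricCollector(cluster="cluster_indata",
#                                           url="http://10.110.13.164:50070/jmx"))

if __name__ == "__main__":
    start_http_server(9131)   # expose /metrics; collect() runs on each scrape
    while True:
        time.sleep(60)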
Example 11
def initialised_embedding_scenario():
    emd1 = pd.read_csv('../../local_results/tf0.emd',
                       header=None,
                       index_col=0,
                       skiprows=1,
                       sep=" ")
    del emd1.index.name
    emd2 = pd.read_csv('../../local_results/tf_1in10000_init.emd',
                       header=None,
                       index_col=0,
                       skiprows=1,
                       sep=" ")
    del emd2.index.name
    feature_path = '../../local_resources/features_1in10000.tsv'
    rf_features = pd.read_csv(feature_path, sep='\t', index_col=0)
    temp1 = emd1.join(rf_features, how='left')
    y_train = temp1['target_churned'].values.astype(int)
    print 'training counts', pd.Series(y_train).value_counts()
    test_emd = utils.subtract_intersection(emd2, emd1)
    # temp2 = test_emd.join(rf_features, how='left')
    temp2 = emd2.join(rf_features, how='left')
    y_test = temp2['target_churned'].values.astype(int)
    print 'test counts', pd.Series(y_test).value_counts()

    for clf in classifiers:
        clf.fit(emd1.values, y_train)
        preds = clf.predict_proba(emd2.values)[:, 1]
        # preds = clf.predict_proba(test_emd.values)[:, 1]
        print len(preds), preds.sum()
        macro, micro = utils.get_metrics(y_test, preds)
        print macro, micro
Example 12
    def collect(self):
        # Request data from ambari Collect Host API
        # Request exactly the System level information we need from node
        # beans returns a type of 'List'
        try:
            beans = utils.get_metrics(self._url)
        except:
            logger.info("Can't scrape metrics from url: {0}".format(self._url))
            pass
        else:
            # set up all metrics with labels and descriptions.
            self._setup_labels(beans)

            # add metric value to every metric.
            self._get_metrics(beans)

            # update regionserver metrics with common metrics
            common_metrics = common_metrics_info(self._cluster, beans, "hbase",
                                                 "regionserver")
            self._hadoop_regionserver_metrics.update(common_metrics())

            for i in range(len(self._merge_list)):
                service = self._merge_list[i]
                for metric in self._hadoop_regionserver_metrics[service]:
                    yield self._hadoop_regionserver_metrics[service][metric]
Example 13
def run_repetitions(X, y, names, clf, reps, train_pct=0.8):
    """
    Run repeated experiments on random train test splits of the data
    :param X: an iterable of numpy arrays
    :param y: a numpy array of target variables
    :param clf: a scikit-learn classifier
    :param names: the names of the data sets. Size should match data
    :param reps: the number of repetitions to run for each dataset
    :param train_pct: the percentage of the data to use for training. The rest will be held out for the test set.
    :return:
    """
    results = np.zeros(shape=(len(X), reps))
    min_split = min(train_pct, 1 - train_pct)
    assert len(
        y
    ) * min_split > 1, 'Only {} data points is not enough for a train split of {}'.format(
        len(y), train_pct)
    for rep in range(reps):
        for idx, dataset in enumerate(X):
            try:
                X_train, X_test, y_train, y_test = train_test_split(
                    dataset, y, train_size=train_pct, stratify=y)
            except ValueError:
                print 'could not stratify as too many classes for train percentage {}'.format(
                    train_pct)
                print 'performing unstratified train test split instead'
                X_train, X_test, y_train, y_test = train_test_split(
                    dataset, y, train_size=train_pct)
            clf.fit(X_train, y_train)
            probs = clf.predict_proba(X_test)
            macro, micro = utils.get_metrics(y_test, probs, auc=False)
            results[idx, rep] = macro
    train = []
    std_error = sem(results, axis=1)
    mean = results.mean(axis=1)
    for idx, dataset in enumerate(X):
        clf.fit(dataset, y)
        probs = clf.predict_proba(dataset)
        macro, micro = utils.get_metrics(y, probs, auc=False)
        train.append(macro)

    df = pd.DataFrame(data=results, index=names)
    df['mean'] = mean
    df['train'] = train
    df['sde'] = std_error

    return df
Example 14
def main(argv):
    config_file = argv[1]

    global env

    env = Env()

    env.read_env(_get_env_path(config_file))

    yaml.add_implicit_resolver('!envvar',
                               envvar_matcher,
                               Loader=yaml.FullLoader)
    yaml.add_constructor('!envvar', envvar_constructor, Loader=yaml.FullLoader)
    config = yaml.load(open(config_file), Loader=yaml.FullLoader)

    if argv[2] == "upload":
        return upload(config, argv)

    ffrom = datetime.date(*map(int, argv[2].split("-")))
    tto = datetime.date(*map(int, argv[3].split("-")))
    extra_args = argv[4:]

    skip_metrics = "--skip-metrics" in extra_args

    js_metrics = codecs.open(config["js_metrics"], "w", encoding="utf-8")

    js_metrics.write("metrics = [\n")

    for metric in get_metrics(config):
        metric_class = plugins.get_metric_class(metric["type"])
        if not metric_class:
            print("Error no existe la métrica de tipo %s" % metric["type"],
                  file=sys.stderr)
            sys.exit(3)
        metric_obj = metric_class(config, metric, metric["name"])
        if not skip_metrics:
            print("Generating metric %s" % metric["name"])
            data = metric_obj.generate(ffrom, tto)

            save_data(data, metric["name"], config)

        js_metrics.write(metric_obj.js_line())

    js_metrics.write("];\n")
    js_metrics.write("\n")
    js_metrics.write("\n")

    js_metrics.write("pills = [\n")
    for pill in config["pills"]:
        js_metrics.write("jQuery.parseJSON('%s'),\n" % json.dumps(pill))
    js_metrics.write("];\n")
    js_metrics.write("\n")

    print("Saving js_metrics file")
    js_metrics.write("periodChoices = [\n")
    for period_choice in config["period_choices"]:
        js_metrics.write('    {value: %(value)s, name: "%(name)s"},\n' %
                         period_choice)
    js_metrics.write("];\n")
Example 15
def test_index_factor_return():
    print("test index factor return")
    files = [f for f in os.listdir(const.INDEX_FACTOR_DIR)]
    pnl = utils.get_all_panel(const.INDEX_FACTOR_DIR, files)
    # compute daily returns
    pnl.ix[:, :, 'return'] = pnl.minor_xs('close').pct_change()
    # compute the momentum factor return
    k = 60
    df = pnl.minor_xs('return').rolling(window=k).mean()
    return_df = pnl.minor_xs('return')
    daily_return = analysis.factor_return(df, return_df, threshold=0.2)
    daily_return = daily_return[daily_return != 0]
    # print daily_return
    utils.get_metrics(daily_return)
    acc_ret = utils.get_accumulated_return(daily_return)
    acc_ret.plot()
    plt.show()
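Here `utils.get_metrics` is applied to a daily return series rather than to predictions; its implementation is not included. A plausible sketch of such a performance helper (annualised return, Sharpe ratio, max drawdown), offered purely as an assumption:

import numpy as np
import pandas as pd

def get_metrics(daily_return: pd.Series, periods_per_year: int = 252):
    # Hypothetical stand-in for utils.get_metrics on a daily return series
    # (not the original implementation).
    cumulative = (1 + daily_return).cumprod()
    annual_return = cumulative.iloc[-1] ** (periods_per_year / len(daily_return)) - 1
    sharpe = np.sqrt(periods_per_year) * daily_return.mean() / daily_return.std()
    max_drawdown = (cumulative / cumulative.cummax() - 1).min()
    print("annual return: %.4f, sharpe: %.2f, max drawdown: %.4f"
          % (annual_return, sharpe, max_drawdown))
    return annual_return, sharpe, max_drawdown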
Example 16
def test(epoch, subset):
    network.eval()
    pcont4 = " "
    ids_fn = ddir.sp_dev_ids_fn if subset == "dev" else ddir.sp_test_ids_fn
    with open(ids_fn, "r") as fo:
        ids = map(lambda x: x.strip(), fo.readlines())
    pred_multi, grt_multi, pred_multiBoW, grt_multiBoW, vis_multi = [], [], [], [], []
    for i in range(0, len(ids), args.test_batch_size):
        idx = ids[i:i + args.test_batch_size]
        Xs, _ = data_io.load_mfcc(ddir.mfcc_dir, idx, args.n_pad)
        Xs = np.transpose(Xs, (0, 2, 1))
        vision_Ys = np.stack(map(lambda x: vision_bow_vec[x[4:-2]], idx),
                             axis=0)  # GT from vision model
        caption_Ys1 = np.stack(
            map(lambda x: caption_bow_vec1[x],
                idx), axis=0)  # GT for evaluating exact match kw pred metrics
        caption_Ys2 = np.stack(map(lambda x: caption_bow_vec2[x], idx),
                               axis=0)  # GT for bow loss
        if (args.mt):
            l, lBoW, pred, predBoW = run_net(Xs, vision_Ys, caption_Ys2)
        else:
            l, pred = run_net(Xs, vision_Ys)
        pred_multi.append(pred)
        grt_multi.append(caption_Ys1)
        if (args.mt):
            pred_multiBoW.append(predBoW)
            grt_multiBoW.append(caption_Ys2)

    if (args.mt):
        pred_multiBoW, grt_multiBoW = np.concatenate(
            pred_multiBoW, axis=0), np.concatenate(grt_multiBoW, axis=0)
        pred_multiBoW = np.concatenate(
            (pred_multiBoW,
             np.zeros((pred_multiBoW.shape[0], grt_multiBoW.shape[1] -
                       pred_multiBoW.shape[1])).astype(np.float32)),
            axis=1)

    if (subset == 'test'):  # On keyword spotting
        # precisionBoW, recallBoW, fscoreBoW = utils.get_fscore(pred_multiBoW >= args.threshold, grt_multiBoW)
        # pcont3 = "Threshold = %.1f: precision BoW: %.3f, recall BoW: %.3f, fscore BoW: %.3f" % (args.threshold, precisionBoW, recallBoW, fscoreBoW)
        eer, ap, prec10, precN = utils.get_metrics(pred_multiBoW.T,
                                                   grt_multiBoW.T)
        pcont5 = "Overall ratings (on BoW): EER: %f, Average precision: %f, Precision@10: %f, Precision@N: %f" % (
            eer, ap, prec10, precN)
        with open("aux_exact.csv", "a+") as fo:
            fo.write(args.mtType + ',' + str(args.alpha) + ',' +
                     str(args.n_bow2) + ',' + str(prec10 * 100) + ',' +
                     str(precN * 100) + ',' + str(eer * 100) + ',' +
                     str(ap * 100) + '\n')
        # print(pcont3+"\n")
        print(pcont5 + "\n")
        # with open(saveLog, "a+") as fo:
        # fo.write("\n"+pcont5+"\n")

    return 0
Example 17
def train_fg(model,
             optim,
             loss,
             features,
             labels,
             train_g,
             test_g,
             test_mask,
             device,
             n_epochs,
             thresh,
             compute_metrics=True):
    """
    A full graph version of RGCN training
    """

    duration = []
    for epoch in range(n_epochs):
        tic = time.time()
        loss_val = 0.

        pred = model(train_g, features.to(device))

        l = loss(pred, labels)

        optim.zero_grad()
        l.backward()
        optim.step()

        loss_val += l

        duration.append(time.time() - tic)
        metric = evaluate(model, train_g, features, labels, device)
        print(
            "Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | f1 {:.4f} ".format(
                epoch, np.mean(duration), loss_val, metric))

    class_preds, pred_proba = get_model_class_predictions(model,
                                                          test_g,
                                                          features,
                                                          labels,
                                                          device,
                                                          threshold=thresh)

    if compute_metrics:
        acc, f1, p, r, roc, pr, ap, cm = get_metrics(class_preds, pred_proba,
                                                     labels.numpy(),
                                                     test_mask.numpy(), './')
        print("Metrics")
        print("""Confusion Matrix:
                                {}
                                f1: {:.4f}, precision: {:.4f}, recall: {:.4f}, acc: {:.4f}, roc: {:.4f}, pr: {:.4f}, ap: {:.4f}
                             """.format(cm, f1, p, r, acc, roc, pr, ap))

    return model, class_preds, pred_proba
Example 18
def run_cv_pred(X, y, clf, n_folds, name, results, debug=True):
    """
    Run n-fold cross validation returning a prediction for every row of X
    :param X: A scipy sparse feature matrix
    :param y: The target labels corresponding to rows of X
    :param clf: The scikit-learn classifier to fit on each fold
    :param n_folds: The number of cross-validation folds
    :param name: the row label under which to record results
    :param results: A tuple of Pandas DataFrames containing (macro, micro) F1 results
    :return: a tuple of (class-probability predictions for every row of X, updated results)
    """
    # Construct a kfolds object
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True)
    splits = skf.split(X, y)
    y_pred = np.zeros(shape=(len(y), 2))

    # Iterate through folds
    for idx, (train_index, test_index) in enumerate(splits):
        X_train, X_test = X[train_index, :], X[test_index, :]
        assert len(set(train_index).intersection(test_index)) == 0
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf.fit(X_train, y_train)
        try:  # Gradient boosted trees do not accept sparse matrices in the predict function currently
            preds = clf.predict_proba(X_test)
        except TypeError:
            preds = clf.predict_proba(X_test.todense())
        macro, micro = utils.get_metrics(y[test_index], preds)
        results[0].loc[name, idx] = macro
        results[1].loc[name, idx] = micro
        y_pred[test_index, :] = preds

    # add on training results
    clf.fit(X, y)
    try:  # Gradient boosted trees do not accept sparse matrices in the predict function currently
        preds = clf.predict_proba(X)
    except TypeError:
        preds = clf.predict_proba(X.todense())
    macro, micro = utils.get_metrics(y, preds)
    results[0].loc[name, n_folds] = macro
    results[1].loc[name, n_folds] = micro
    # y_pred[test_index] = preds

    return y_pred, results
Example 19
 def collect_metric(counter):
     counter += 1
     timer = threading.Timer(_period, collect_metric, (counter, ))
     timer.start()
     if counter >= count:
         timer.cancel()
     try:
         data = utils.get_metrics(self.db_info)
         internal_metrics.append(data)
     except Exception as err:
         print "[GET Metrics]Exception:", err
Example 20
 def collect_metric(counter):
     counter += 1
     timer = threading.Timer(_period, collect_metric, (counter, ))
     timer.start()
     if counter >= count:
         timer.cancel()
     try:
         data = utils.get_metrics(self.db_info)
         internal_metrics.append(data)
     except MySQLdb.Error as e:
         print("[GET Metrics]Exception:%s" % e.message)
Example 21
def test_accuracy(x, y, testx, testy, dist='Multinomial'):
    py = []
    if dist in ('Multinomial', 'MultivariateBernoulli'):
        probs = get_class_conditional_probs(x, y, dist=dist)
        for q in testx:
            py.append(classify(x, y, q, dist=dist, probs=probs))
    elif dist == 'Normal':
        mean, var = get_class_conditional_probs(x, y, dist=dist)
        for q in testx:
            py.append(classify(x, y, q, dist=dist, mean=mean, var=var))
    return utils.get_metrics(testy, np.array(py))
Example 22
def draw_output(img_path: str, imgs: dict, show_output: bool = True):
    '''Draw the original image and the ground truth into a single output file

    Args:
        img_path(str): path of the original image, used to derive the label path
        imgs(dict<str: np.ndarray>): the output images to draw
    '''
    assert 'Predict Image' in imgs, 'a predicted image is required'
    output_path = './output'
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    gd_path = img_path.replace('img', 'label')
    gd_array = np.array(
        Image.open(gd_path)) if os.path.exists(gd_path) else None

    # add the ground truth to the images to draw
    if gd_array is not None:
        imgs['Ground Truth'] = gd_array
        pred_img = imgs['Predict Image']

    # draw the legend
    legend_array = np.zeros((100, 800), dtype=np.uint8)
    for i in range(config.num_classes):
        legend_array[:, i * 200:(i + 1) * 200] = i
    legend_array = legend_array / (config.num_classes - 1) * 255
    imgs['Legend'] = legend_array

    fig_w, fig_h = 15, int(6 * np.ceil(len(imgs) / 3))  # width fixed at 15, height a multiple of 6
    fig = plt.figure(figsize=(fig_w, fig_h))
    # draw each image
    for i, (title, img) in enumerate(imgs.items()):
        ax = fig.add_subplot(31 + i + np.ceil(len(imgs) / 3) * 100)
        ax.set_title(title)
        if len(img.shape) == 2:
            ax.imshow(img / 3 * 255, cmap='bone')
        else:
            ax.imshow(img)
    # compute the metrics
    if gd_array is not None:
        miou, ious, acc = get_metrics(gd_array, pred_img)
        fig.suptitle('$mIoU={:.2f}, acc={:.2f}$\n$IoUs={}$'.format(
            miou, acc, ['%.2f' % x for x in ious]))
    # get the original file name and derive the output directory from it
    filename = os.path.basename(img_path)
    _, _, _, parent_img = extract_info_from_filename(filename)
    output_path = os.path.join(output_path, parent_img.replace('.png',
                                                               ''))  # group by parent image name
    output_filename = os.path.join(output_path, filename)
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    fig.savefig(output_filename)
    if show_output:
        print('Output has been saved to {}.'.format(output_filename))
Example 23
def predict(config, args):
    gpu_manage(args)
    dataset = Dataset(args.test_dir)
    data_loader = DataLoader(dataset=dataset,
                             num_workers=config.threads,
                             batch_size=1,
                             shuffle=False)

    gen = UNet(in_ch=config.in_ch, out_ch=config.out_ch, gpu_ids=args.gpu_ids)

    param = torch.load(args.pretrained)
    gen.load_state_dict(param)
    criterionMSE = nn.MSELoss()

    if args.cuda:
        gen = gen.cuda(0)
        criterionMSE = criterionMSE.cuda(0)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    avg_mse = 0
    avg_psnr = 0
    avg_ssim = 0

    with torch.no_grad():
        for i, batch in enumerate(tqdm(data_loader)):
            input_, ground_truth = Variable(batch[0]), Variable(batch[1])
            filename = batch[2][0]
            input_ = F.interpolate(input_, size=256).to(device)
            ground_truth = F.interpolate(ground_truth, size=256).to(device)

            output = gen(input_)

            save_image_from_tensors(input_, output, ground_truth,
                                    config.out_dir, i, 0, filename)
            mse, psnr, ssim = get_metrics(output, ground_truth, criterionMSE)
            print(filename)
            print('MSE: {:.4f}'.format(mse))
            print('PSNR: {:.4f} dB'.format(psnr))
            print('SSIM: {:.4f}'.format(ssim))

            avg_mse += mse
            avg_psnr += psnr
            avg_ssim += ssim

    avg_mse = avg_mse / len(data_loader)
    avg_psnr = avg_psnr / len(data_loader)
    avg_ssim = avg_ssim / len(data_loader)

    print('Average MSE: {:.4f}'.format(avg_mse))
    print('Average PSNR: {:.4f} dB'.format(avg_psnr))
    print('Average SSIM: {:.4f}'.format(avg_ssim))
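The `get_metrics(output, ground_truth, criterionMSE)` used here returns (MSE, PSNR, SSIM); its implementation is not included in the example. A sketch of such a helper, assuming single-image batches scaled to [0, 1] and scikit-image >= 0.19 for the SSIM call:

import numpy as np
import torch
from skimage.metrics import structural_similarity

def get_metrics(output: torch.Tensor, target: torch.Tensor, criterionMSE):
    # Hypothetical stand-in (not the original implementation), assuming
    # batch size 1 and pixel values in [0, 1].
    mse = criterionMSE(output, target).item()
    psnr = 10 * np.log10(1.0 / mse)
    out_np = output.squeeze(0).cpu().numpy().transpose(1, 2, 0)
    tgt_np = target.squeeze(0).cpu().numpy().transpose(1, 2, 0)
    ssim = structural_similarity(out_np, tgt_np, channel_axis=-1, data_range=1.0)
    return mse, psnr, ssim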
Example 24
def trainval(data_path, transformer_checkpoint_path, model_checkpoint_path):
    data = pd.read_csv(data_path)
    data["text"] = data["text"].apply(preprocess_text)
    
    #train
    transformer = TfidfVectorizer(ngram_range=(1, 2), max_features=100000)
    X_train = transformer.fit_transform(data[data["split"] == "train"]["text"].values)

    model = LogisticRegression(C=5e1, solver='lbfgs', random_state=42, n_jobs=8)
    model.fit(X_train, data[data["split"] == "train"]["sentiment"])


    #validation
    X_val = transformer.transform(data[data["split"] == "val"]["text"].values)
    group_val = data[data["split"] == "val"]["source"].values 
    y_val = data[data["split"] == "val"]["sentiment"].values
    preds = model.predict(X_val)
    
    get_metrics(y_val, preds, group_val)
    
    joblib.dump(transformer, transformer_checkpoint_path)
    joblib.dump(model, model_checkpoint_path)
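The `get_metrics(y_val, preds, group_val)` call above reports validation metrics broken down by source; the helper itself is not shown. A minimal per-group sketch, assuming accuracy and macro-F1 are the quantities of interest:

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def get_metrics(y_true, y_pred, groups):
    # Hypothetical stand-in (not the original implementation): overall and
    # per-source accuracy / macro-F1.
    y_true, y_pred, groups = map(np.asarray, (y_true, y_pred, groups))
    print("overall: acc=%.3f macro-F1=%.3f"
          % (accuracy_score(y_true, y_pred),
             f1_score(y_true, y_pred, average='macro')))
    for g in np.unique(groups):
        m = groups == g
        print("%s: acc=%.3f macro-F1=%.3f"
              % (g, accuracy_score(y_true[m], y_pred[m]),
                 f1_score(y_true[m], y_pred[m], average='macro')))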
Example 25
def testSem():
    network.eval()
    ids_fn_sem = ddir.sp_testSem_ids_fn
    ids_fn = ddir.sp_test_ids_fn
    with open(ids_fn, "r") as fo:
        ids = map(lambda x: x.strip('\n'), fo.readlines())
    with open(ids_fn_sem, "r") as fo:
        ids_sem = map(lambda x: x.strip('\n'), fo.readlines())
    # with open(os.path.join(ddir.flickr8k_dir, "word_ids/captions_dict.pkl"),'rb') as f:
    #     captions_dict = pkl.load(f)
    predKwList = []
    value_bow, gtKwDict, captionsDict = data_io.get_semValues(
        ddir.labels_csv, ddir.keywords_test)
    count_bow = data_io.get_semCounts(ddir.counts_csv, ddir.keywords_test)
    pred_multi, pred_multiBoW = np.zeros([len(ids_sem),
                                          len(mapping)]), np.zeros(
                                              [len(ids_sem),
                                               len(mapping)])
    vis_multi = np.zeros([len(ids_sem), len(mapping)])
    for i in range(len(ids)):
        # caption = ' '.join(captions_dict[ids[i]])
        idx = [ids[i]]
        z = idx[0].split("_")
        del z[0]
        idxnew = "_".join(z)
        if (idxnew in ids_sem):
            Xs, _ = data_io.load_mfcc(ddir.mfcc_dir, idx, args.n_pad)
            Xs = np.transpose(Xs, (0, 2, 1))
            Ys = np.stack(map(lambda x: vision_bow_vec[x[4:-2]], idx), axis=0)
            visMapped = Ys[0][mapping]
            if (args.mt):
                pred, predBoW = getKWprob(Xs)
                predBoWMapped = predBoW[0][mapping]
            else:
                pred = getKWprob(Xs)
            predMapped = pred[0][mapping]
            pred_multi[ids_sem.index(idxnew)] = predMapped
            if (args.mt):
                pred_multiBoW[ids_sem.index(idxnew)] = predBoWMapped
            vis_multi[ids_sem.index(idxnew)] = visMapped

    eer, ap, spearman, prec10, precN = utils.get_metrics(
        pred_multiBoW, value_bow, count_bow)
    pcont = "Subjective ratings: EER: %f, Average precision: %f, Precision@10: %f, Precision@N: %f, Spearman's rho: %f" % (
        eer, ap, prec10, precN, spearman)
    print(pcont)
    with open("aux_sem.csv", "a+") as fo:
        fo.write(args.mtType + ',' + str(args.alpha) + ',' + str(args.n_bow2) +
                 ',' + str(spearman) + ',' + str(prec10 * 100) + ',' +
                 str(precN * 100) + ',' + str(eer * 100) + ',' +
                 str(ap * 100) + '\n')
Example 26
def run_repetitions(data, target, clf, names, reps, train_pct=0.8):
    """
    Run repeated experiments on random train test splits of the data
    :param data: an iterable of numpy arrays
    :param target: a numpy array of target variables
    :param clf: a scikit-learn classifier
    :param names: the names of the data sets. Size should match data
    :param reps: the number of repetitions to run for each dataset
    :param train_pct: the percentage of the data to use for training. The rest will be held out for the test set.
    :return:
    """
    results = np.zeros(shape=(len(data), reps))
    for rep in range(reps):
        msk = np.random.rand(len(target)) < train_pct
        y_train = target[msk]
        y_test = target[~msk]
        for idx, dataset in enumerate(data):
            X_train = dataset[msk, :]
            X_test = dataset[~msk, :]
            clf.fit(X_train, y_train)
            probs = clf.predict_proba(X_test)
            res = utils.get_metrics(y_test, probs)[0]
            print 'rep{0} '.format(idx), res
            results[idx, rep] = res
    train = []
    mean = results.mean(axis=1)
    for idx, dataset in enumerate(data):
        clf.fit(dataset, target)
        probs = clf.predict_proba(dataset)
        res = utils.get_metrics(target, probs)[0]
        train.append(res)

    df = pd.DataFrame(data=results, index=names)
    df['mean'] = mean
    df['train'] = train

    return df
Example 27
def test_accuracy(x,
                  y,
                  testx,
                  testy,
                  b,
                  r,
                  hashing_type='hamming',
                  bucket_width=None):
    bands = init_bands(x,
                       y,
                       b,
                       r,
                       hashing_type=hashing_type,
                       bucket_width=bucket_width)
    missed_points = 0
    py = []
    for q in testx:
        res = classify(q, y, bands)
        if not res:
            if missed_points == 0:
                print('Warning: Some of the points might get missed because '
                      'their hash doesn\'t match with hash of any other '
                      'points in training data.')
            missed_points += 1
            py.append(-10)
            continue
        py.append(res)

    if missed_points > 0:
        print('Total %d points were missed during classification' %
              (missed_points))
        indices = np.where(np.array(py) != -10)
        return utils.get_metrics(
            np.array(testy)[indices],
            np.array(py)[indices])

    return utils.get_metrics(testy, py)
Example 28
def evaluate(epoch, val_loader, model, loss_fn, log_writer=None):
    model.eval()
    avg_loss = 0.0
    avg_preci = 0.0
    avg_recall = 0.0
    all_labels = []
    all_preds = []
    for batch_id, data in enumerate(val_loader()):
        xd, yd = data
        xd = xd.unsqueeze(1)
        label = yd
        logits = model(xd)
        loss_val = loss_fn(logits, label)

        pred = F.softmax(logits)
        all_labels += [label.numpy()]
        all_preds += [pred.numpy()]

        preci, recall = get_metrics(label, pred)
        avg_preci = (avg_preci * batch_id + preci) / (1 + batch_id)
        avg_recall = (avg_recall * batch_id + recall) / (1 + batch_id)
        avg_loss = (avg_loss * batch_id + loss_val.numpy()[0]) / (1 + batch_id)

        msg = f'eval epoch:{epoch}, batch:{batch_id}'
        msg += f'|{len(val_loader)}'
        msg += f',loss:{avg_loss:.3}'
        msg += f',recall:{avg_recall:.3}'
        msg += f',preci:{avg_preci:.3}'
        if batch_id % 20 == 0:
            logger.info(msg)
            if log_writer is not None:
                log_writer.add_scalar(tag="eval loss",
                                      step=batch_id,
                                      value=avg_loss)
                log_writer.add_scalar(tag="eval preci",
                                      step=batch_id,
                                      value=avg_preci)
                log_writer.add_scalar(tag="eval recall",
                                      step=batch_id,
                                      value=avg_recall)

    all_preds = np.concatenate(all_preds, 0)
    all_labels = np.concatenate(all_labels, 0)
    mAP_scores = average_precision_score(all_labels, all_preds, average=None)

    return avg_loss, avg_preci, avg_recall, mAP_scores
Example 29
    def train(self, x1, x2, x3):

        """
        The triplet loss is calculated from the three inputs and minimized with the Adam optimizer. Returns the optimizer, summary, and metric operations.
        :param x1:
        :param x2:
        :param x3:
        :return: train_op, summary_op, metrics_update_op, loss_op, mean_loss_op
        """
        with tf.name_scope("train"):
            loss_op = self.triplet_loss(x1, x2, x3)
            train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(loss_op) 

            summary_op, mean_loss_op, metrics_update_op = utils.get_metrics(loss_op)

            tf.summary.scalar('loss', loss_op)

        return train_op, summary_op, metrics_update_op, loss_op, mean_loss_op
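The `utils.get_metrics(loss_op)` used above returns a summary op plus a streaming-mean op and its update op; the helper is not shown. A TF1-style sketch of such a function, offered as an assumption rather than the project's actual code:

import tensorflow as tf

def get_metrics(loss_op):
    # Hypothetical stand-in (not the original implementation): a streaming mean
    # of the loss and a merged summary op, in TF1 style.
    mean_loss_op, metrics_update_op = tf.metrics.mean(loss_op, name='mean_loss')
    tf.summary.scalar('mean_loss', mean_loss_op)
    summary_op = tf.summary.merge_all()
    return summary_op, mean_loss_op, metrics_update_op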
Example 30
        def collect_metric(counter):
            counter += 1
            timer = threading.Timer(_period, collect_metric, (counter,))
            timer.start()
            db = database(self.db_info["host"], self.db_info["port"],
                          self.db_info["user"], self.db_info["password"],
                          "sbtest")
            if counter >= count:
                timer.cancel()
            try:
                data = utils.get_metrics(db)
                internal_metrics.append(data)
            except Exception as err:
                logger.info("[GET Metrics]Exception:" ,err) 
Example 31
def train_save_evaluate(
    params, kb, train_set, dev_set, ind2emoji, embeddings_array, dataset_name
):

    # If the minibatch is larger than the number of emojis we have, we can't fill train/test batches
    if params.mb > len(ind2emoji):
        print(
            str.format(
                "Skipping: k={}, batch={}, epochs={}, ratio={}, dropout={}",
                params.out_dim,
                params.pos_ex,
                params.max_epochs,
                params.neg_ratio,
                params.dropout,
            )
        )
        print("Can't have an mb > len(ind2emoji)")
        return "N/A"
    else:
        print(
            str.format(
                "Training: k={}, batch={}, epochs={}, ratio={}, dropout={}",
                params.out_dim,
                params.pos_ex,
                params.max_epochs,
                params.neg_ratio,
                params.dropout,
            )
        )

    model_folder = params.model_folder(dataset_name=dataset_name)
    model_path = model_folder + "/model.pt"

    dsets = {"train": train_set, "dev": dev_set}
    predictions = dict()
    results = dict()

    if os.path.exists(model_path):
        predictions = pk.load(open(model_folder + "/results.p", "rb"))

    else:

        model = Emoji2Vec(
            model_params=params,
            num_emojis=kb.dim_size(0),
            embeddings_array=embeddings_array,
        )
        model.train(
            kb=kb, epochs=params.max_epochs, learning_rate=params.learning_rate
        )
        os.makedirs(model_folder)
        torch.save(model.nn, model_folder + "/model.pt")
        e2v = model.create_gensim_files(
            model_folder=model_folder,
            ind2emoj=ind2emoji,
            out_dim=params.out_dim,
        )
        if params.in_dim != params.out_dim:
            embeddings_array = model.nn.project_embeddings(embeddings_array)
        for dset_name in dsets:
            _, pred_values, _, true_values = generate_predictions(
                e2v=e2v,
                dset=dsets[dset_name],
                phr_embeddings=embeddings_array,
                ind2emoji=ind2emoji,
                threshold=params.class_threshold,
            )
            predictions[dset_name] = {
                "y_true": true_values,
                "y_pred": pred_values,
            }

        pk.dump(predictions, open(model_folder + "/results.p", "wb"))

    for dset_name in dsets:
        true_labels = [bool(x) for x in predictions[dset_name]["y_true"]]
        pred_labels = [
            x >= params.class_threshold
            for x in predictions[dset_name]["y_pred"]
        ]
        true_values = predictions[dset_name]["y_true"]
        pred_values = predictions[dset_name]["y_pred"]
        # Calculate metrics
        acc, f1, auc = get_metrics(
            pred_labels, pred_values, true_labels, true_values
        )
        print(
            str.format(
                "{}: Accuracy(>{}): {}, f1: {}, auc: {}",
                dset_name,
                params.class_threshold,
                acc,
                f1,
                auc,
            )
        )
        results[dset_name] = {"accuracy": acc, "f1": f1, "auc": auc}

    return results["dev"]