Example No. 1
    def fit(self):
        x1, x2 = self.__resampeling()
        try:
            err1 = f1(
                self.Y1,
                cross_val_predict(self.model1_init, self.X1, self.Y1, cv=10))
        except:
            err1 = 0
        try:
            err2 = f1(
                self.Y2,
                cross_val_predict(self.model2_init, self.X2, self.Y2, cv=10))
        except:
            err2 = 0
        self.err.append([err1, err2])
        try:
            self.model1_init.fit(self.X1, self.Y1)
        except:
            self.model1_init.fit(self.X1)
        try:
            self.model2_init.fit(self.X2, self.Y2)
        except:
            self.model2_init.fit(self.X2)

        y1 = self.model1_init.predict(x1)
        if set(np.unique(y1)) == {-1., 1.}:
            # map {-1, 1} predictions to {1, 0}; flip the +1 labels first so the
            # newly assigned 1s (from -1) are not overwritten
            y1[y1 == 1.] = 0
            y1[y1 == -1.] = 1
        y2 = self.model2_init.predict(x2)
        if set(np.unique(y2)) == {-1., 1.}:
            y2[y2 == 1.] = 0
            y2[y2 == -1.] = 1

        self.X1 = self.X1.append(x2).reset_index(drop=True)
        self.X2 = self.X2.append(x1).reset_index(drop=True)
        self.Y1 = self.Y1.append(pd.Series(y2)).reset_index(drop=True)
        self.Y2 = self.Y2.append(pd.Series(y1)).reset_index(drop=True)

        for i in range(self.n_iter - 1):
            x1, x2 = self.__resampeling()

            err1 = f1(self.Y1,
                      cross_val_predict(self.model1, self.X1, self.Y1, cv=10))
            err2 = f1(self.Y2,
                      cross_val_predict(self.model2, self.X2, self.Y2, cv=10))
            self.err.append([err1, err2])

            self.model1.fit(self.X1, self.Y1)
            self.model2.fit(self.X2, self.Y2)

            y1 = self.model1.predict(x1)
            y2 = self.model2.predict(x2)

            self.X1 = self.X1.append(x2).reset_index(drop=True)
            self.X2 = self.X2.append(x1).reset_index(drop=True)
            self.Y1 = self.Y1.append(pd.Series(y2)).reset_index(drop=True)
            self.Y2 = self.Y2.append(pd.Series(y1)).reset_index(drop=True)
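None of these examples show their imports; a minimal sketch of the aliases this first snippet appears to rely on (the aliases are an assumption inferred from the call signatures, not shown in the source):

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score as f1
from sklearn.model_selection import cross_val_predict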
Example No. 2
def runSpacy(file, out):
    nlp = spacy.load("en_core_web_md")

    df = baseDF(file)
    xx = df["clean_text"]
    yy = df["userid"]
    xx, yy = shuffle(xx, yy)

    trainX, testX, trainY, testY = train_test_split(xx, yy, test_size=0.2)

    start = time.time()

    docs = {}
    for user in tqdm(trainY.unique().tolist()):
        tweets = trainX.loc[trainY == user].tolist()
        tweetDoc = ". ".join(tweets).replace("..", ".")
        docs[user] = nlp(tweetDoc)

    def getSimilarity(base, target):
        return docs[target].similarity(base)

    def getConfidence(row):
        if row["Best"] == 0:
            return 0
        diff = (row["Best"] - row["Second"]) / row["Best"]
        return diff

    evaluate = pd.concat([testX, testY], axis=1)
    evaluate["DOC"] = evaluate["clean_text"].progress_apply(lambda x: nlp(x))

    for col in tqdm(trainY.unique()):
        evaluate[col] = evaluate["DOC"].apply(getSimilarity, target=col)
    evaluate.drop(["DOC"], axis=1, inplace=True)
    evaluate["Guess"] = evaluate.loc[:, trainY.unique()].idxmax(axis=1)

    # Unweighted (no per-sample confidence weights)
    score = f1(evaluate["userid"], evaluate["Guess"], average='weighted')
    checkPrint("Unweighted F1: {}".format(score), out)
    mid = time.time()
    checkPrint("Time Taken: {} seconds".format(mid - start), out)

    # Weighted by prediction confidence
    evaluate["Best"] = evaluate.loc[:, trainY.unique()].max(axis=1)
    evaluate["Second"] = evaluate.loc[:, trainY.unique()].apply(
        lambda row: row.nlargest(2).values[-1], axis=1)
    evaluate["Confidence"] = evaluate.apply(getConfidence, axis=1)
    score = f1(evaluate["userid"],
               evaluate["Guess"],
               average='weighted',
               sample_weight=evaluate["Confidence"])
    checkPrint("Confidence Weighted F1: {}".format(score), out)

    end = time.time()
    checkPrint("Time Taken: {} seconds".format(end - start), out)
Example No. 3
def acc(loader):
    accuracy = 0
    num_batches = 0
    act = np.array([])
    pred = np.array([])
    for batch in loader:
        gpu = batch.question_text.to(device).long()
        preds = bid_lstm_cnn(gpu)
        target = batch.target.numpy()
        preds = preds.cpu().detach().numpy()
        preds = np.array([np.argmax(row) for row in preds])
        total_correct = sum(target == preds)

        act = np.concatenate((act, target))
        pred = np.concatenate((pred, preds))

        accuracy += total_correct
        num_batches += 1
    ass = accuracy / (num_batches * batch_size)
    print(ass)
    formula1 = f1(act, pred)
    print(formula1)
    tn, fp, fn, tp = cm(act, pred).ravel()
    print(
        'True positives -> {}\nFalse positives -> {}\nTrue negatives -> {}\nFalse negatives -> {}\n'
        .format(tp, fp, tn, fn))
    return ass, formula1
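A self-contained toy check of the tn/fp/fn/tp unpacking used above, assuming cm and f1 alias sklearn.metrics.confusion_matrix and f1_score (those imports are not shown in the source):

import numpy as np
from sklearn.metrics import confusion_matrix as cm, f1_score as f1

act = np.array([0, 1, 1, 0, 1])
pred = np.array([0, 1, 0, 0, 1])
# ravel() flattens the binary confusion matrix [[tn, fp], [fn, tp]]
tn, fp, fn, tp = cm(act, pred).ravel()
print(tp, fp, tn, fn)  # 2 0 2 1
print(f1(act, pred))   # 2*tp / (2*tp + fp + fn) = 0.8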
Example No. 4
def cm_f1_test(model, test_data, test_labels):

    test_pred = model.predict(test_data)
    scores = f1(test_labels, test_pred, average=None)
    argSort = scores.argsort()
    scores = scores[argSort]
    return cm(test_labels, test_pred), (argSort[:2], scores[:2])
Example No. 5
 def compute(ground_truth, predictiveDistribution):
     class_predictions = np.round(predictiveDistribution.get_all_means())
     # count the number of classes occurring in the ground truth
     classes_present = len(np.unique(ground_truth))
     score = f1(ground_truth, class_predictions, average=None)
     score = np.sum(score) / classes_present
     return score
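A toy sketch of the averaging above (assuming f1 is sklearn.metrics.f1_score): with average=None the per-class scores are summed and divided by the number of classes present in the ground truth, which coincides with the macro average whenever every class appears there:

import numpy as np
from sklearn.metrics import f1_score as f1

ground_truth = np.array([0, 0, 1, 1])
class_predictions = np.array([0, 1, 1, 1])
per_class = f1(ground_truth, class_predictions, average=None)  # [0.667, 0.8]
score = np.sum(per_class) / len(np.unique(ground_truth))       # 0.733..., same as macro here
print(per_class, score)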
Example No. 6
def get_metrics(prediction, y_test):
    '''
    Computes accuracy, precision, recall, ROC-AUC, and F1 metrics for
    predictions produced by an ML model against the actual values of the
    dependent variable.
    Inputs:
        - prediction: an array with predictions.
        - y_test: an array with actual values.
    Returns a dictionary with metrics of the ML model.
    '''
    Accuracy = accuracy(prediction, y_test)
    Precision = precision(prediction, y_test)
    Recall = recall(prediction, y_test)
    try:
        AUC = roc_auc(prediction, y_test)
    except ValueError:
        AUC = 0
    F1 = f1(prediction, y_test)

    metrics_dict = {
        'Accuracy': Accuracy,
        'Precision': Precision,
        'Recall': Recall,
        'AUC': AUC,
        'F1': F1
    }
    return metrics_dict
Example No. 7
def evaluate(model,
             iterator_function,
             _batch_count,
             cuda_device,
             output_buffer=sys.stderr):
    if output_buffer is not None:
        print(_batch_count, file=output_buffer)
    model.eval()
    with torch.no_grad():
        predictions = []
        expectations = []
        batch_generator = range(_batch_count)
        if output_buffer is not None:
            batch_generator = tqdm(batch_generator)
        for _ in batch_generator:
            features, targets = iterator_function()
            if cuda_device != -1:
                features = features.cuda(device=cuda_device)
            probs, _, _ = model(example_batch=features)
            batch_pred = np.argmax(probs.detach().cpu().numpy(),
                                   axis=-1).tolist()
            batch_tgt = targets.detach().cpu().numpy().tolist()
            predictions.extend(batch_pred)
            expectations.extend(batch_tgt)
        model.train()
        return acc(expectations, predictions) * 100, \
               pr(expectations, predictions) * 100, \
               rc(expectations, predictions) * 100, \
               f1(expectations, predictions) * 100,
Example No. 8
def evaluate(encoder, loc='./'):

    print('Preparing data...')
    traintext, testtext, labels = load_data(loc)

    print('Computing training skipthoughts...')
    trainA = encoder.encode(traintext[0])
    trainB = encoder.encode(traintext[1])

    C = 4

    print('Computing testing skipthoughts...')
    testA = encoder.encode(testtext[0])
    testB = encoder.encode(testtext[1])

    train_features = np.c_[np.abs(trainA - trainB), trainA * trainB,
                           feats(traintext[0], traintext[1])]
    test_features = np.c_[np.abs(testA - testB), testA * testB,
                          feats(testtext[0], testtext[1])]

    print('Evaluating...')
    clf = LogisticRegression(C=C)
    clf.fit(train_features, labels[0])
    yhat = clf.predict(test_features)

    print('Test accuracy: ', str(clf.score(test_features, labels[1])))
    print('Test F1: ', str(f1(labels[1], yhat)))
Example No. 9
def present_results_simp(y_test, predictions):
    results_list = []
    for k, v in predictions.items():
        inter_list = [
            k,
            accuracy(v, y_test),
            precision(v, y_test),
            precision_top(v, y_test, 0.01),
            precision_top(v, y_test, 0.02),
            precision_top(v, y_test, 0.05),
            precision_top(v, y_test, 0.1),
            precision_top(v, y_test, 0.2),
            precision_top(v, y_test, 0.3),
            precision_top(v, y_test, 0.5),
            recall(v, y_test),
            recall_top(v, y_test, 0.01),
            recall_top(v, y_test, 0.02),
            recall_top(v, y_test, 0.05),
            recall_top(v, y_test, 0.1),
            recall_top(v, y_test, 0.2),
            recall_top(v, y_test, 0.3),
            recall_top(v, y_test, 0.5),
            f1(v, y_test)
        ]
        results_list.append(inter_list)
    df = pd.DataFrame(results_list)
    df.columns = [
        'Model', 'Accuracy', 'Precision', 'Precision top 1%',
        'Precision top 2%', 'Precision top 5%', 'Precision top 10%',
        'Precision top 20%', 'Precision top 30%', 'Precision top 50%',
        'Recall', 'Recall top 1%', 'Recall top 2%', 'Recall top 5%',
        'Recall top 10%', 'Recall top 20%', 'Recall top 30%', 'Recall top 50%',
        'F 1'
    ]
    return df
Example No. 10
def evaluate(encoder,
             k=10,
             seed=1234,
             evalcv=True,
             evaltest=False,
             use_feats=True,
             loc='./data/'):
    """
    Run experiment
    k: number of CV folds
    test: whether to evaluate on test set
    """
    print('Preparing data...')
    traintext, testtext, labels = load_data(loc)

    print('Computing training skipthoughts...')
    trainA = encoder.encode(traintext[0])
    trainB = encoder.encode(traintext[1])

    if evalcv:
        print('Running cross-validation...')
        C = eval_kfold(trainA,
                       trainB,
                       traintext,
                       labels[0],
                       shuffle=True,
                       k=10,
                       seed=1234,
                       use_feats=use_feats)

    if evaltest:
        if not evalcv:
            C = 4  # Best parameter found from CV (combine-skip with use_feats=True)

        print('Computing testing skipthoughts...')
        testA = encoder.encode(testtext[0])
        testB = encoder.encode(testtext[1])

        if use_feats:
            train_features = np.c_[np.abs(trainA - trainB), trainA * trainB,
                                   feats(traintext[0], traintext[1])]
            test_features = np.c_[np.abs(testA - testB), testA * testB,
                                  feats(testtext[0], testtext[1])]
        else:
            train_features = np.c_[np.abs(trainA - trainB), trainA * trainB]
            test_features = np.c_[np.abs(testA - testB), testA * testB]

        print('Evaluating...')
        clf = LogisticRegression(C=C)
        clf.fit(train_features, labels[0])
        yhat = clf.predict(test_features)
        print('Test accuracy: ' + str(clf.score(test_features, labels[1])))
        print('Test F1: ' + str(f1(labels[1], yhat)))
        vis_data = TSNE(n_components=2).fit_transform(train_features)
        vis_x = vis_data[:, 0]
        vis_y = vis_data[:, 1]
        plt.scatter(vis_x, vis_y,
                    c=labels[0])  #, cmap=plt.cm.get_cmap('jet',2))
        plt.savefig('tsne_msrp.png')
Example No. 11
def evaluatesMLPredictions(y_true, y_pred):

    esal = sal(y_true, y_pred)
    ehl = hl(y_true, y_pred)
    ma = 1 - f1(y_true, y_pred, average='macro')
    mi = 1 - f1(y_true, y_pred, average='micro')
    if1 = 1 - instanceF1(y_true, y_pred)
    eji = 1 - ji(y_true, y_pred)
    mapre = 1 - precision_score(y_true, y_pred, average='macro')
    marec = 1 - recall_score(y_true, y_pred, average='macro')
    mipre = 1 - precision_score(y_true, y_pred, average='micro')
    mirec = 1 - recall_score(y_true, y_pred, average='micro')

    # probability metrics
    cov = coverage_error(y_true, y_pred)
    erl = rl(y_true, y_pred)

    return esal, ehl, ma, mi, if1, eji, mapre, marec, mipre, mirec, cov, erl
Example No. 12
def build_classifier_and_test(train_X,
                              train_y,
                              test_X,
                              test_y,
                              clf,
                              print_train_result=True):
    clf.fit(train_X, train_y)
    if print_train_result == True:
        p_tr = clf.predict(train_X)
        print("Train Accuracy:\t", acc(train_y, p_tr))
        print("Train Precision:\t", pr(train_y, p_tr))
        print("Train Recall_score:\t", rc(train_y, p_tr))
        print("Train F-score:\t", f1(train_y, p_tr))
    predicted = clf.predict(test_X)
    print("Accuracy:\t", acc(test_y, predicted))
    print("Precision:\t", pr(test_y, predicted))
    print("Recall_score:\t", rc(test_y, predicted))
    print("F-score:\t", f1(test_y, predicted))
Example No. 13
 def update_metrics(gt, pre, f1_m, p_m, r_m, acc_m):
     f1_value = f1(gt, pre, average="micro")
     f1_m.update(f1_value)
     p_value = precision(gt, pre, average="micro", zero_division=0)
     p_m.update(p_value)
     r_value = recall(gt, pre, average="micro")
     r_m.update(r_value)
     acc_value = accuracy(gt, pre)
     acc_m.update(acc_value)
Example No. 14
def instanceF1(y_true, y_pred):
    """
    y_true : 2d array-like, of size n x q
    y_pred : 2d array-like, of size n x q
    """
    n, q = y_true.shape
    if1 = 0
    for i in np.arange(n):
        if1 = if1 + f1(y_true[i, :], y_pred[i, :])
    return if1 / n
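A toy usage sketch of the instance-wise F1 above; rows are instances, columns are labels, and f1 is assumed to alias sklearn.metrics.f1_score:

import numpy as np
from sklearn.metrics import f1_score as f1  # alias assumed by instanceF1

y_true = np.array([[1, 0, 1],
                   [0, 1, 1]])
y_pred = np.array([[1, 0, 0],
                   [0, 1, 1]])
# per-instance F1 is 2/3 for the first row and 1.0 for the second -> mean ~0.833
print(instanceF1(y_true, y_pred))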
Example No. 15
def clone_analysis(data_paths):
    code = []
    labels = []
    positives = 0
    for file_name in data_paths:
        data = json.load(open(file_name))
        for example in data:
            code.append(example['tokenized'])
            l = 0
            if 'label' in example.keys():
                l = int(example['label'])
            elif 'lebel' in example.keys():
                l = int(example['lebel'])
            elif 'leble' in example.keys():
                l = int(example['leble'])
            elif 'lable' in example.keys():
                l = int(example['lable'])
            if l > 1:
                l = 1
            positives += l
            labels.append(l)
    print(len(code), len(labels), positives, len(labels) - positives)
    vectorizer = TfidfVectorizer(lowercase=False, ngram_range=(1, 3))
    X = vectorizer.fit_transform(code)
    model = KMeans(n_clusters=10, max_iter=100)
    model.fit(X)
    y = model.predict(X)
    cluster_to_positive = [0] * 10
    cluster_to_negative = [0] * 10
    for pred, label in zip(y, labels):
        if label == 1:
            cluster_to_positive[pred] += 1
        else:
            cluster_to_negative[pred] += 1
    print(cluster_to_positive)
    print(cluster_to_negative)
    percentages = [
        float(p) / (p + n)
        for p, n in zip(cluster_to_positive, cluster_to_negative)
    ]
    for p in percentages:
        print(p)
    for _ in range(5):
        XTrain, XTest, YTrain, YTest = train_test_split(X,
                                                        labels,
                                                        test_size=0.2)
        model = RandomForestClassifier()
        model.fit(XTrain, YTrain)
        predicted = model.predict(XTest)
        print('%.3f\t%.3f\t%.3f\t%.3f' %
              (acc(YTest, predicted) * 100, pr(YTest, predicted) * 100,
               rc(YTest, predicted) * 100, f1(YTest, predicted) * 100))
    pass
Example No. 16
def eval_kfold(A,
               B,
               train,
               labels,
               shuffle=True,
               k=10,
               seed=1234,
               use_feats=False):
    """
    Perform k-fold cross validation
    """
    # features
    labels = np.array(labels)
    if use_feats:
        features = np.c_[np.abs(A - B), A * B, feats(train[0], train[1])]
    else:
        features = np.c_[np.abs(A - B), A * B]

    scan = [2**t for t in range(0, 9, 1)]
    npts = len(features)
    kf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    scores = []

    for s in scan:

        scanscores = []

        for train, test in kf.split(features, labels):

            # Split data
            X_train = features[train]
            y_train = labels[train]
            X_test = features[test]
            y_test = labels[test]

            # Train classifier
            clf = LogisticRegression(C=s)
            clf.fit(X_train, y_train)
            yhat = clf.predict(X_test)
            fscore = f1(y_test, yhat)
            scanscores.append(fscore)
            print((s, fscore))

        # Append mean score
        scores.append(np.mean(scanscores))
        print(scores)

    # Get the index of the best score
    s_ind = np.argmax(scores)
    s = scan[s_ind]
    print(scores)
    print(s)
    return s
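The same C-grid scan reduced to a self-contained sketch on synthetic data; f1 is assumed to alias sklearn.metrics.f1_score, and make_classification stands in for the encoder features and the feats() helper:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score as f1
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=200, random_state=1234)
scan = [2 ** t for t in range(0, 9, 1)]
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
scores = []
for s in scan:
    scanscores = []
    for train, test in kf.split(X, y):
        clf = LogisticRegression(C=s, max_iter=1000)
        clf.fit(X[train], y[train])
        scanscores.append(f1(y[test], clf.predict(X[test])))
    scores.append(np.mean(scanscores))
print(scan[int(np.argmax(scores))])  # the selected C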
Example No. 17
def evaluate(encoder,
             k=10,
             seed=1234,
             evalcv=True,
             evaltest=False,
             use_feats=True,
             loc='./data/'):
    """
    Run experiment
    k: number of CV folds
    test: whether to evaluate on test set
    """
    print('Preparing data...')
    traintext, testtext, labels = load_data(loc)

    print('Computing training skipthoughts...')
    trainA = encoder.encode(traintext[0], verbose=False)
    trainB = encoder.encode(traintext[1], verbose=False)

    if evalcv:
        print('Running cross-validation...')
        C = eval_kfold(trainA,
                       trainB,
                       traintext,
                       labels[0],
                       shuffle=True,
                       k=10,
                       seed=1234,
                       use_feats=use_feats)

    if evaltest:
        if not evalcv:
            C = 4  # Best parameter found from CV (combine-skip with use_feats=True)

        print('Computing testing skipthoughts...')
        testA = encoder.encode(testtext[0], verbose=False)
        testB = encoder.encode(testtext[1], verbose=False)

        if use_feats:
            train_features = np.c_[np.abs(trainA - trainB), trainA * trainB,
                                   feats(traintext[0], traintext[1])]
            test_features = np.c_[np.abs(testA - testB), testA * testB,
                                  feats(testtext[0], testtext[1])]
        else:
            train_features = np.c_[np.abs(trainA - trainB), trainA * trainB]
            test_features = np.c_[np.abs(testA - testB), testA * testB]

        print('Evaluating...')
        clf = LogisticRegression(C=C)
        clf.fit(train_features, labels[0])
        yhat = clf.predict(test_features)
        print('Test accuracy: ' + str(clf.score(test_features, labels[1])))
        print('Test F1: ' + str(f1(labels[1], yhat)))
Example No. 18
def evaluate(encoder,
             k=10,
             seed=1234,
             evalcv=False,
             evaltest=True,
             use_feats=True):
    """
  Run experiment
  k: number of CV folds
  test: whether to evaluate on test set
  """
    traintext, testtext, labels = load_data()

    trainA = encoder.encode(traintext[0], verbose=False, norm=True)
    trainB = encoder.encode(traintext[1], verbose=False, norm=True)

    if evalcv:
        print 'Running cross-validation...'
        C = eval_kfold(trainA,
                       trainB,
                       traintext,
                       labels[0],
                       shuffle=True,
                       k=k,
                       seed=seed,
                       use_feats=use_feats)
    else:
        C = 4

    if evaltest:
        print 'Computing testing skipthoughts...'
        testA = encoder.encode(testtext[0], verbose=False, norm=True)
        testB = encoder.encode(testtext[1], verbose=False, norm=True)

        if use_feats:
            train_features = np.c_[np.abs(trainA - trainB), trainA * trainB,
                                   feats(traintext[0], traintext[1])]
            test_features = np.c_[np.abs(testA - testB), testA * testB,
                                  feats(testtext[0], testtext[1])]
        else:
            train_features = np.c_[np.abs(trainA - trainB), trainA * trainB]
            test_features = np.c_[np.abs(testA - testB), testA * testB]

        print 'Evaluating...'
        clf = LogisticRegression(C=C)
        clf.fit(train_features, labels[0])
        yhat = clf.predict(test_features)
        acc = clf.score(test_features, labels[1])
        f1_score = f1(labels[1], yhat)
        print 'Test accuracy: ' + str(acc)
        print 'Test F1: ' + str(f1_score)
        return acc, f1_score
Example No. 19
    def on_epoch_end(self, epoch, logs={}):
        label_true = []
        label_pred = []
        for i in range(len(self.seq)):
            x_true, y_true = self.seq[i]
            lengths = self.get_lengths(y_true)
            y_pred = self.model.predict_on_batch(x_true)

            y_true = self.p.inverse_transform(y_true, lengths)
            y_pred = self.p.inverse_transform(y_pred, lengths)

            label_true.extend(y_true)
            label_pred.extend(y_pred)

        score = f1_score(label_true, label_pred)
        print(' - f1: {:04.2f}'.format(score * 100))

        print(classification_report(label_true, label_pred))

        label_true = [item for sublist in label_true for item in sublist]
        label_pred = [item for sublist in label_pred for item in sublist]
        classes = np.unique(label_true)
        # classes2 = np.unique(label_pred)
        #
        # print('Classes: ', classes, classes2)

        bacc = balanced_accuracy_score(label_true, label_pred)
        print(' - BACC: {:04.2f}'.format(bacc * 100))

        score = f1(label_true, label_pred, average='micro')
        print(' - f1: {:04.2f}'.format(score * 100))

        print(clsrep(label_true, label_pred, labels=classes))
        self.history.append(bacc)
        self.model_checkpoint(bacc)
        # self.reduce_lr_on_plateau()
        # self.early_stopping_check()
        logs['f1'] = score

        if epoch == self.swa_epoch:
            self.swa_weights = self.model.get_weights()

        if epoch > self.swa_epoch and bacc > self.best_bacc:
            self.swa_control += 1
            for i in range(len(self.swa_weights)):
                self.swa_weights[i] = (self.swa_weights[i] * self.swa_control +
                                       self.model.get_weights()[i]) / (
                                           self.swa_control + 1)

        else:
            pass
Example No. 20
    def on_train_end(self, epoch, logs={}):
        x, y = next(self.val_gen)
        preds = self.model.predict(x, batch_size=self.batch_size)

        y = self.flat_and_binary(y)
        preds = self.flat_and_binary(preds)
        jac = jaccard(y, preds)
        dice = f1(y, preds, average='micro')

        logs['dice'] = dice
        logs['jac'] = jac
        logs['fold'] = self.fold

        print('\nTesting jac: {}, dice: {}\n'.format(jac, dice))
Example No. 21
def computesMetric(y_true, y_pred, metric='HL'):

    if metric == 'HL':
        r = hl(y_true, y_pred)
    elif metric == 'SA':
        r = sal(y_true, y_pred)
    elif metric == 'Ma':
        r = 1 - f1(y_true, y_pred, average='macro')
    elif metric == 'Mi':
        r = 1 - f1(y_true, y_pred, average='micro')
    elif metric == 'IF1':
        r = instanceF1(y_true, y_pred)
    elif metric == 'IJ':
        r = 1 - ji(y_true, y_pred)
    elif metric == 'MaP':
        r = 1 - precision_score(y_true, y_pred, average='macro')
    elif metric == 'MiP':
        r = 1 - precision_score(y_true, y_pred, average='micro')
    elif metric == 'MaR':
        r = 1 - recall_score(y_true, y_pred, average='macro')
    elif metric == 'MiR':
        r = 1 - recall_score(y_true, y_pred, average='micro')
    return r
Example No. 22
 def validate(self, loader, model):
     act = np.array([])
     pred = np.array([])
     for batch in loader:
         gpu = batch.question_text.to(self.device).long()
         preds = model(gpu)
         target = batch.target.numpy()
         preds = preds.cpu().detach().numpy()
         preds = np.array([np.argmax(row) for row in preds])
         act = np.concatenate((act, target))
         pred = np.concatenate((pred, preds))
     formula1 = f1(act, pred)
     print(model.ID, 'val f1 ->', formula1)
     return formula1
Example No. 23
def reportStats(weight, current_iteration, X_train, y_train, X_test, y_test):

    y_train[y_train < 0] = 0
    y_test[y_test < 0] = 0

    ypred_is = predict_all(X_train, weight)
    ypred_oos = predict_all(X_test, weight)

    np_err_handling = np.seterr(invalid='ignore')

    is_acc = acc(y_train, ypred_is)
    is_mcc = mcc(y_train, ypred_is)
    is_f1 = f1(y_train, ypred_is)
    is_mse = mse(y_train, ypred_is)

    oos_acc = acc(y_test, ypred_oos)
    oos_mcc = mcc(y_test, ypred_oos)
    oos_f1 = f1(y_test, ypred_oos)
    oos_mse = mse(y_test, ypred_oos)

    is_tn, is_fp, is_fn, is_tp = confusion_matrix(y_train, ypred_is).ravel()
    oos_tn, oos_fp, oos_fn, oos_tp = confusion_matrix(y_test,
                                                      ypred_oos).ravel()
    is_auprc = auprc(y_train, ypred_is)
    oos_auprc = auprc(y_test, ypred_oos)

    np.seterr(**np_err_handling)

    print(
        f"Consensus {current_iteration}: IS acc {is_acc:0.5f}.  IS MCC {is_mcc:0.5f}.  IS F1 {is_f1:0.5f}.  IS MSE {is_mse:0.5f}.  OOS acc {oos_acc:0.5f}.  OOS MCC {oos_mcc:0.5f}.  OOS F1 {oos_f1:0.5f}.  OOS MSE {oos_mse:0.5f}."
    )
    print(
        f"Confusion {current_iteration}: IS TP: {is_tp}, IS FP: {is_fp}, IS TN: {is_tn}, IS FN: {is_fn}, IS AUPRC: {is_auprc:0.5f}.  OOS TP: {oos_tp}, OOS FP: {oos_fp}, OOS TN: {oos_tn}, OOS FN: {oos_fn}, OOS AUPRC: {oos_auprc:0.5f}."
    )

    return is_acc, is_mcc, is_f1, is_mse, is_auprc, oos_acc, oos_mcc, oos_f1, oos_mse, oos_auprc
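A sketch of the metric aliases this snippet appears to assume; the mapping is a guess based on the names (predict_all is the project's own helper and is not reproduced here):

from sklearn.metrics import (accuracy_score as acc, matthews_corrcoef as mcc,
                             f1_score as f1, mean_squared_error as mse,
                             average_precision_score as auprc, confusion_matrix)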
Example No. 24
def best_f1(y_true, y_score):
    best_f1_score = -1000
    sorted_scores = sorted(y_score)
    for threshold in sorted_scores:
        temp_y_predict = []
        for actual_score in y_score:
            if actual_score < threshold:
                temp_y_predict.append(1)
            else:
                temp_y_predict.append(0)
        now_f1 = f1(y_true,temp_y_predict)

        if (now_f1>best_f1_score):
            best_f1_score = now_f1


    return best_f1_score
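A toy usage sketch (f1 assumed to be sklearn.metrics.f1_score): note that lower scores are treated as the positive class, and every observed score is tried as a cut-off:

from sklearn.metrics import f1_score as f1  # alias assumed by best_f1

y_true = [1, 1, 0, 0]
y_score = [0.1, 0.2, 0.8, 0.9]
# the threshold 0.8 labels the two low scores as 1 and matches y_true exactly
print(best_f1(y_true, y_score))  # 1.0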
Example No. 25
def eval_kfold(A, B, train, labels, shuffle=True, k=10, seed=1234, use_feats=False):
    """
    Perform k-fold cross validation
    """
    # features
    labels = np.array(labels)
    if use_feats:
        features = np.c_[np.abs(A - B), A * B, feats(train[0], train[1])]
    else:
        features = np.c_[np.abs(A - B), A * B]

    scan = [2**t for t in range(0,9,1)]
    npts = len(features)
    kf = KFold(npts, n_folds=k, shuffle=shuffle, random_state=seed)
    scores = []

    for s in scan:

        scanscores = []

        for train, test in kf:

            # Split data
            X_train = features[train]
            y_train = labels[train]
            X_test = features[test]
            y_test = labels[test]

            # Train classifier
            clf = LogisticRegression(C=s)
            clf.fit(X_train, y_train)
            yhat = clf.predict(X_test)
            fscore = f1(y_test, yhat)
            scanscores.append(fscore)
            print (s, fscore)

        # Append mean score
        scores.append(np.mean(scanscores))
        print scores

    # Get the index of the best score
    s_ind = np.argmax(scores)
    s = scan[s_ind]
    print scores
    print s
    return s
Example No. 26
def evaluate(encoder,
             k=10,
             seed=3456,
             evalcv=True,
             evaltest=False,
             loc='./data/'):
    print 'Load Data...'
    traintext, testtext, labels = load_data(loc)

    print 'Convert to sentence embeddings...'

    trainA = encoder.encode(traintext[0], verbose=False)
    trainB = encoder.encode(traintext[1], verbose=False)

    if evalcv:
        print 'Perform cross-validation...'
        C = eval_kfold(trainA,
                       trainB,
                       traintext,
                       labels[0],
                       shuffle=True,
                       k=10,
                       seed=3456)
    #print("Size of sentences: ",trainA.shape)
    if evaltest:
        if not evalcv:
            C = 4

        print 'Convert test data to skipthought vectors...'
        testA = encoder.encode(testtext[0], verbose=False)
        testB = encoder.encode(testtext[1], verbose=False)

        #u.v and u-v features concatenation
        train_features = np.c_[np.abs(trainA - trainB), trainA * trainB]
        test_features = np.c_[np.abs(testA - testB), testA * testB]

        print 'Evaluate logistic regression...'
        clf = LogisticRegression(C=C)
        #fit model
        clf.fit(train_features, labels[0])
        #get prediction
        ypred = clf.predict(test_features)
        print 'Test accuracy: ' + str(clf.score(test_features, labels[1]))
        #get f1 score, label 1 is true value
        print 'Test F1: ' + str(f1(labels[1], ypred))
Example No. 27
def get_best_f1_threshold(y_true, y_score):
    best_f1_score = -1000
    best_f1_threshold = .0
    sorted_scores = sorted(y_score)
    for threshold in sorted_scores:
        if threshold == 1000:
            continue
        temp_y_predict = []
        for actual_score in y_score:
            if actual_score < threshold:
                temp_y_predict.append(1)
            else:
                temp_y_predict.append(0)
        now_f1 = f1(y_true,temp_y_predict)

        if (now_f1>best_f1_score):
            best_f1_score = now_f1
            best_f1_threshold = threshold


    return best_f1_threshold,best_f1_score
Example No. 28
def eval_kfold(A, B, train, labels, shuffle=True, k=10, seed=3456):
    # features
    labels = np.array(labels)
    features = np.c_[np.abs(A - B), A * B]

    scan = [2**t for t in range(0, 9, 1)]
    npts = len(features)
    kf = KFold(npts, n_folds=k, shuffle=shuffle, random_state=seed)
    scores = []

    for s in scan:

        scanscores = []

        for train, test in kf:

            # Split data
            X_train = features[train]
            y_train = labels[train]
            X_test = features[test]
            y_test = labels[test]

            # Train classifier
            clf = LogisticRegression(C=s)
            clf.fit(X_train, y_train)
            yhat = clf.predict(X_test)
            fscore = f1(y_test, yhat)
            scanscores.append(fscore)
            print(s, fscore)

        # Append mean score
        scores.append(np.mean(scanscores))
        print scores

    # Get the index of the best score
    s_ind = np.argmax(scores)
    s = scan[s_ind]
    print scores
    print s
    return s
Example No. 29
def test_classifiers(X,y,n=7,rname="results.txt"):        
    clfs={
#        "Bagging KNN": [BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5),[],[],[],[]],
        "NN (kNN k=1)": [KNeighborsClassifier(n_neighbors=1),[],[],[],[],[]],
        #"NN (kNN k=3)": [KNeighborsClassifier(n_neighbors=3),[],[],[],[],[]],
        "NN (kNN k=3 w)": [KNeighborsClassifier(n_neighbors=3, weights='distance'),[],[],[],[],[]],
        "NN (kNN k=5 w)": [KNeighborsClassifier(n_neighbors=5, weights='distance'),[],[],[],[],[]],
        #"NN (kNN k=7 w)": [KNeighborsClassifier(n_neighbors=7, weights='distance'),[],[],[],[]],
        #"SVM - Linear kernel": [svm.SVC(kernel="rbf",probability=True),[],[],[],[]],
 #       "Naive Bayes": [GaussianNB(),[],[],[],[]],
#        "SVM Sigmoide": [svm.SVC(kernel="sigmoid"),[],[],[],[]],
        #"ANN":[MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1),[],[],[],[]],
    }
    #V=["Voting KNN",[None,[],[],[],[]]]
    skf=kfold(y, n_iter=n, random_state=None,  train_size=0.7)
    output=open(rname,"w")
    for train,test in skf:
        Xt,Yt=X[train],y[train]
        Xv,Yv=X[test],y[test]
        votes=[]
        for (k,v)  in  clfs.items():
            v[0].fit(Xt,Yt)
            #print(clfs[k])
            Yr=v[0].predict(Xv)
            #print(accs(Yv,Yr))
            v[1].append(accs(Yv,Yr))
            v[2].append(f1(Yv,Yr,average="macro"))
            v[3].append(recall(Yv,Yr,average="macro"))
            v[4].append(precision(Yv,Yr))
            v[5].append(kappa(Yv,Yr))
            #votes.append(Yr)
        #Yp=predict(votes)
    for k,v in clfs.items():
        fm="%s | %s| %s | %s | %s\n"
        output.write(fm %(k,"Accuracy",np.mean(v[1]),min(v[1]),max(v[1])))
        #output.write(fm  %(k,"Kappa",np.mean(v[5]),min(v[5]),max(v[5])))
        output.write(fm %(k,"F1",np.mean(v[2]),min(v[2]),max(v[2])))
        output.write(fm %(k,"Recall",np.mean(v[3]),min(v[3]),max(v[3])))
        output.write(fm %(k,"Precision",np.mean(v[4]),min(v[4]),max(v[4])))
Example No. 30
def present_results(y_test, predictions):
    results_list = []
    for k, v in predictions.items():
        inter_list = [
            k,
            accuracy(v, y_test),
            precision(v, y_test),
            precision_top(v, y_test, 0.01),
            precision_top(v, y_test, 0.02),
            precision_top(v, y_test, 0.05),
            precision_top(v, y_test, 0.1),
            precision_top(v, y_test, 0.2),
            precision_top(v, y_test, 0.3),
            precision_top(v, y_test, 0.5),
            recall(v, y_test),
            recall_top(v, y_test, 0.01),
            recall_top(v, y_test, 0.02),
            recall_top(v, y_test, 0.05),
            recall_top(v, y_test, 0.1),
            recall_top(v, y_test, 0.2),
            recall_top(v, y_test, 0.3),
            recall_top(v, y_test, 0.5),
            f1(v, y_test)
        ]
        if k[:6] != 'd_tree':
            inter_list.append(roc_auc(v, y_test))
        else:
            inter_list.append('ND')
        results_list.append(inter_list)
    df = pd.DataFrame(results_list)
    df.columns = [
        'Model', 'Accuracy', 'Precision', 'Precision top 1%',
        'Precision top 2%', 'Precision top 5%', 'Precision top 10%',
        'Precision top 20%', 'Precision top 30%', 'Precision top 50%',
        'Recall', 'Recall top 1%', 'Recall top 2%', 'Recall top 5%',
        'Recall top 10%', 'Recall top 20%', 'Recall top 30%', 'Recall top 50%',
        'F 1', 'ROC AUC'
    ]
    return df
Example No. 31
def prec_rec_f1score(y_true, x_test, model):

    bce = tf.keras.losses.BinaryCrossentropy()
    y_hat = model.predict(x_test)
    y_pred = (np.greater_equal(y_hat, 0.51)).astype(int)
    pr_re_f1score_perclass = precision_recall_fscore_support(y_true,
                                                             y_pred,
                                                             average=None)
    pr_re_f1score_average = precision_recall_fscore_support(y_true,
                                                            y_pred,
                                                            average='micro')
    precision = precision_score(y_true, y_pred, average=None)
    recall = recall_score(y_true, y_pred, average=None)
    accuracy = accuracy_score(y_true, y_pred)
    f1_score = f1(y_true, y_pred)
    #per class
    precision_true = pr_re_f1score_perclass[0][1]
    precision_fake = pr_re_f1score_perclass[0][0]
    recall_true = pr_re_f1score_perclass[1][1]
    recall_fake = pr_re_f1score_perclass[1][0]
    f1score_true = pr_re_f1score_perclass[2][1]
    f1score_fake = pr_re_f1score_perclass[2][0]
    metrices_name = [
        'accuracy', 'precision_true', 'precision_fake', 'recall_true',
        'recall_fake', 'f1score_true', 'f1score_fake'
    ]
    metrices_value = [
        accuracy, precision_true, precision_fake, recall_true, recall_fake,
        f1score_true, f1score_fake
    ]
    i = 0
    for item in metrices_name:
        print(item + ':', metrices_value[i])
        i += 1
    binary_loss = bce(y_true, y_hat).numpy()
    print('Binary_loss', binary_loss)

    return accuracy, precision_true, precision_fake, recall_true, recall_fake, f1score_true, f1score_fake, binary_loss
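A toy sketch of the per-class indexing used above: with average=None, precision_recall_fscore_support returns one value per class in sorted label order, so index 0 is the 'fake' (label 0) class and index 1 the 'true' (label 1) class:

from sklearn.metrics import precision_recall_fscore_support

y_true = [0, 0, 1, 1]
y_pred = [0, 1, 1, 1]
prec, rec, fscore, support = precision_recall_fscore_support(y_true, y_pred, average=None)
print(prec[1], rec[1], fscore[1])  # class 1 ("true")
print(prec[0], rec[0], fscore[0])  # class 0 ("fake")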
Example No. 32
def evaluate(encoder, k=10, seed=1234, evalcv=True, evaltest=False, use_feats=True, loc='./data/'):
    """
    Run experiment
    k: number of CV folds
    test: whether to evaluate on test set
    """
    print 'Preparing data...'
    traintext, testtext, labels = load_data(loc)

    print 'Computing training skipthoughts...'
    trainA = encoder.encode(traintext[0], verbose=False)
    trainB = encoder.encode(traintext[1], verbose=False)

    if evalcv:
        print 'Running cross-validation...'
        C = eval_kfold(trainA, trainB, traintext, labels[0], shuffle=True, k=10, seed=1234, use_feats=use_feats)

    if evaltest:
        if not evalcv:
            C = 4    # Best parameter found from CV (combine-skip with use_feats=True)

        print 'Computing testing skipthoughts...'
        testA = encoder.encode(testtext[0], verbose=False)
        testB = encoder.encode(testtext[1], verbose=False)

        if use_feats:
            train_features = np.c_[np.abs(trainA - trainB), trainA * trainB, feats(traintext[0], traintext[1])]
            test_features = np.c_[np.abs(testA - testB), testA * testB, feats(testtext[0], testtext[1])]
        else:
            train_features = np.c_[np.abs(trainA - trainB), trainA * trainB]
            test_features = np.c_[np.abs(testA - testB), testA * testB]

        print 'Evaluating...'
        clf = LogisticRegression(C=C)
        clf.fit(train_features, labels[0])
        yhat = clf.predict(test_features)
        print 'Test accuracy: ' + str(clf.score(test_features, labels[1]))
        print 'Test F1: ' + str(f1(labels[1], yhat))
Example No. 33
def compute_eval_stats(classifier, y_data, rankings, threshold):
    ''' Takes: classifier object, true target data, predicted score rankings, 
                ranking threshold cutoff
        Returns: accuracy, precision, recall of predictions of classifier on x for y
    '''

    predicted_test = np.where(rankings < threshold, 1, 0)

    # print(threshold)
    # print(predicted_test.sum())
    # print(predicted_test[0:10])
    # print("num unique ranks: ", pd.DataFrame(pred_scores)[0].unique().shape)
    # print("eval stats rankings are: ", rankings[0:10])

    stats = [
        accuracy(y_data, predicted_test),
        precision(y_data, predicted_test),
        recall(y_data, predicted_test),
        f1(y_data, predicted_test),
        roc(y_data, predicted_test)
    ]

    return stats
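For reference, a small self-contained sketch of the cut-off step above: everything ranked below the threshold is labelled 1, the rest 0 (the values here are made up):

import numpy as np

rankings = np.array([0.002, 0.4, 0.05, 0.9])  # smaller rank = higher predicted risk
predicted_test = np.where(rankings < 0.1, 1, 0)
print(predicted_test)  # [1 0 1 0]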
Example No. 34
    def train(self,
              data,
              dev,
              verbose=True,
              opter='adam',
              lr=0.01,
              epochs=100):
        trainloader = torch.utils.data.DataLoader(data, batch_size=5000)
        criterion = nn.CrossEntropyLoss()
        # create your optimizer
        if opter == 'adam':
            optimizer = optim.Adam(self.parameters(), lr=lr)
        elif opter == 'sgd':
            optimizer = optim.SGD(self.parameters(), lr=lr)
        for epoch in range(epochs):  # loop over the dataset multiple times
            running_loss = 0.0
            for i, data in enumerate(trainloader, 0):
                # get the inputs; data is a list of [inputs, labels]
                inputs, labels = data

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                outputs = self.forward(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                # print statistics
                running_loss += loss.item()
                if verbose and (i % 1 == 0):  # print after every mini-batch

                    ys, y_stars = self.get_eval_data(dev)
                    print('[%d, %5d] loss: %.3f\tDev F1: %.3f' %
                          (epoch + 1, i + 1, running_loss, f1(ys, y_stars)))
                    running_loss = 0.0
Example No. 35
def prec_rec_f1score(y_true, x_test, model, item):
    bce = tf.keras.metrics.BinaryCrossentropy()
    # print(model.summary() )
    y_hat = model.predict(x_test)

    y_pred = (np.greater_equal(y_hat, 0.505)).astype(int)
    # for psuedo labelling and Vat technique
    # print(item+'********')
    if item == 'Psuedo_Label':  # or item=='VAT_regularization':
        y_pred = tf.argmax(y_hat, 1).numpy()
        # y_hat= np.max(y_hat,axis=1)# this one for calculating binary loss
    # print(y_hat)
    # print(y_pred)
    pr_re_f1score_perclass = precision_recall_fscore_support(y_true,
                                                             y_pred,
                                                             average=None)
    pr_re_f1score_average = precision_recall_fscore_support(y_true,
                                                            y_pred,
                                                            average='micro')
    precision = precision_score(y_true, y_pred, average=None)
    recall = recall_score(y_true, y_pred, average=None)
    accuracy = accuracy_score(y_true, y_pred)
    f1_score = f1(y_true, y_pred)
    # per class
    precision_true = pr_re_f1score_perclass[0][1]
    precision_fake = pr_re_f1score_perclass[0][0]
    recall_true = pr_re_f1score_perclass[1][1]
    recall_fake = pr_re_f1score_perclass[1][0]
    f1score_true = pr_re_f1score_perclass[2][1]
    f1score_fake = pr_re_f1score_perclass[2][0]

    fpr_, tpr_, _ = roc_curve(y_true, y_pred)
    if item == 'Psuedo_Label':  # or item == 'VAT_regularization' :
        y_true = tf.one_hot(y_true, 2).numpy()
    binary_loss = bce(y_true, y_hat).numpy()
    return accuracy, precision_true, precision_fake, recall_true, recall_fake, f1score_true, f1score_fake, binary_loss, fpr_, tpr_, y_pred
Example No. 36
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("top_data_dir")
    parser.add_argument("--balanced","-bl",action="store_true")
    parser.add_argument('--method','-m',type=int,default=0,choices=range(6),
        help=
        """chose methods from:
                0:linear_svc
                1:logistic regression
                2:naive bayes
                3:decision  tree
                4:ExtraTreesClassifier
                5:RandomForestClassifier
        """)
    args=parser.parse_args()
    print "load data from %s" %(args.top_data_dir)
    dataset = load_data(args.top_data_dir)
    clf = get_classifier(args.method)
    num_of_split = 10
    skf = StratifiedKFold(n_splits=num_of_split,shuffle=True)
    # print dataset.X
    # print dataset.y
    f1_average = .0
    f1_macro_average = .0
    for training_index, test_index in skf.split(dataset.X, dataset.y):
        training_X = []
        training_y = []
        testing_X = []
        testing_y = []
        metrics = {}
        # print "%d training %d testing" %(len(training_index),len(test_index))
        # print training_index
        # print  test_index
        for i in training_index:
            training_X.append( dataset.X[i])
            training_y.append( dataset.y[i])

        for j in test_index:
            testing_X.append( dataset.X[j])
            testing_y.append( dataset.y[j])

        # print training_X
        # print testing_X
        clf.fit(training_X,training_y)  
        predicted_y = clf.predict(testing_X)
        # print classification_report(testing_y, predicted_y)
        f1_macro_average += f1(testing_y, predicted_y,average="macro")/(1.0*num_of_split)
        f1_average += f1(testing_y, predicted_y)/(1.0*num_of_split)

    # f1_11_macro = f1(dataset_11.y, predicted_11,average="macro")
    # f1_11 = f1(dataset_11.y, predicted_11)
    # f1_average = (f1_1516+f1_11)/2.0
    # f1_macro_average = (f1_1516_macro+f1_11_macro)/2.0

    print "Positive f1: %f" %(f1_average)
    print "Average f1: %f" %(f1_macro_average)
    print "-"*20
Example No. 37
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--balanced","-bl",action="store_true")
    parser.add_argument('--method','-m',type=int,default=0,choices=range(6),
        help=
        """chose methods from:
                0:linear_svc
                1:logistic regression
                2:naive bayes
                3:decision  tree
                4:ExtraTreesClassifier
                5:RandomForestClassifier
        """)
    
    
    parser.add_argument("--use_result","-ur",action="store_true")
    parser.add_argument("--feature_size","-fs",default=12,type=int)
    parser.add_argument("--top_dest_dir","-td",default="/infolab/headnode2/lukuang/2016-rts/code/my_code/post_analysis/predictor_analysis/sday_prediction_data")
    parser.add_argument("--predictor_data_dir","-pd",default="/infolab/headnode2/lukuang/2016-rts/code/my_code/post_analysis/predictor_analysis/predictor_data")
    parser.add_argument("--result_expansion","-re",choices=list(map(int, Expansion)),default=0,type=int,
        help="""
            Choose the expansion:
                0:raw
                1:static:
                2:dynamic
        """)
    parser.add_argument("--retrieval_method","-rm",choices=list(map(int, RetrievalMethod)),default=0,type=int,
        help="""
            Choose the retrieval method:
                0:f2exp
                1:dirichlet
                2:pivoted
                3:bm25
        """)
    parser.add_argument("--designate_dest_dir","-dr")
    args=parser.parse_args()

    args.result_expansion = Expansion(args.result_expansion)
    args.retrieval_method = RetrievalMethod(args.retrieval_method)

    # get single term queries
    single_term_queries = {}

    for year in Year:
        
        qrel_file = QREL_FILE[year]
        judged_qids = get_judged_qid(qrel_file)
        query_dir = Q_DIR[year][args.result_expansion]
        single_term_queries[year] = get_single_term_qids(query_dir,judged_qids)

    print single_term_queries
    
    feature_descrption_list = [
        'average_idf:raw',
        'scq:raw',
        'var:raw',
        'max_pmi:raw',
        'avg_pmi:raw',
        'dev:raw',
        'ndev:raw',
        'nqc:raw',
        'wig:raw',
        'top_score:raw',
        'clarity:raw',
        'qf:raw']

    sub_feature_lists = []
    sub_feature_lists = itertools.combinations(feature_descrption_list, args.feature_size)
    # for length in  range(len(feature_descrption_list)):
    #     length += 1
    #     sub_feature_lists += itertools.combinations(feature_descrption_list, length)
    
    # best single term without result
    # sub_feature_lists = [ [ 'average_idf:raw', 'scq:raw', 'dev:raw', 'ndev:raw', 'nqc:raw', 'qf:raw' ]]

    # best single term with result
    sub_feature_lists = [ [ 'average_idf:raw', 'scq:raw', 'dev:raw' ]]

    max_average_f1 = -1
    best_sub_features = []
    for sub_feature_list in sub_feature_lists:
        data_preparor = DataPreparor(
                            args.predictor_data_dir, sub_feature_list,
                            args.use_result, args.result_expansion,args.top_dest_dir,args.retrieval_method,
                            args.designate_dest_dir)

        data_preparor.prepare_data()
        dataset_11, dataset_1516 = load_data(data_preparor._dest_dir)
        clf = get_classifier(args.method)


        

        # print "cross validation:"
        # training_predicted = cross_validation.cross_val_predict(clf,training_dataset.X,training_dataset.y,cv=5)
        # print classification_report(training_dataset.y, training_predicted)
        # print "-"*20

        # print "Test on 1516 data"
        # print "load data from %s" %(data_preparor._dest_dir)
        clf.fit(dataset_11.X,dataset_11.y)
        X_single_15, y_single_15 = prepare_single_term_query_data(dataset_1516, single_term_queries[Year.y2015],Year.y2015 )
        X_single_16, y_single_16 = prepare_single_term_query_data(dataset_1516, single_term_queries[Year.y2016],Year.y2016 )
        X_single_1516 = X_single_15 + X_single_16
        y_single_1516 = y_single_15 + y_single_16
        predicted_single_1516 = clf.predict(X_single_1516)

        # print classification_report(y_single_1516, predicted_single_1516)

        # print "Test on 11 data"
        
        clf.fit(dataset_1516.X,dataset_1516.y)
        X_single_11, y_single_11 = prepare_single_term_query_data(dataset_11, single_term_queries[Year.y2011],Year.y2011 )
        predicted_single_11 = clf.predict(X_single_11)
        # print y_single_11, predicted_single_11
        # print classification_report(y_single_11, predicted_single_11)

        f1_1516 = f1(y_single_1516, predicted_single_1516)
        f1_11 = f1(y_single_11,predicted_single_11)

        f1_average = (f1_1516 + f1_11)/2.0
        # print "Positive f1: %f" %(f1_average)
        if f1_average > max_average_f1:
            max_average_f1 = f1_average
            best_sub_features = sub_feature_list
        print sub_feature_list
        print f1_average

    print "-"*20
    print "Best Average F1:%f" %(max_average_f1)
    print "Best Sub Features:" 
    print best_sub_features
Example No. 38
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import f1_score as f1

from BagOfWords import BagOfWords
from CatsTransformer import CatsTransformer
from CatsLister import CatsLister
from Word2VecFeature import Word2VecFeature

df = pd.read_csv("../data/original.csv")

X = df[["keyword", "category_products_and_services"]].values
y = df["labels"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=1)
missing_words = []

cl = CatsLister(column=1)
ct = CatsTransformer(column=1)
bof = BagOfWords(column=0)
w2vf = Word2VecFeature(column=0, fname="../data/model", missing_words=missing_words)
features = FeatureUnion([('ct', ct), ('bof', bof), ('w2vf', w2vf)])
clf = LogisticRegression()
pipeline = Pipeline([('cl', cl), ('features', features), ('imp', Imputer(strategy="median")), ('clf', clf)]);

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print "The f1 score is for the three classes are: %.3f, %.3f, and %.3f." % tuple(f1(y_test, y_pred, average=None))
print "Found %d missing words." % len(missing_words)
Example No. 39
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--tree_estimator_directory","-td",default="/infolab/node4/lukuang/2015-RTS/src/my_code/post_analysis/predictor_analysis/disk4-5/predictor_data/post/tree_estimator")
    parser.add_argument("--number_of_iterations","-ni",type=int,default=50)
    parser.add_argument("--error_threshold","-et",type=int,default=30)
    parser.add_argument("--silent_query_info_file","-sf",default="/infolab/node4/lukuang/2015-RTS/disk4-5/eval/silent_query_info")
    parser.add_argument("--retrieval_method","-rm",choices=list(map(int, RetrievalMethod)),default=0,type=int,
        help="""
            Choose the retrieval method:
                0:f2exp
                1:dirichlet
                2:pivoted
                3:bm25
        """)
    parser.add_argument("--use_auc","-ua",action="store_true")
    parser.add_argument("--title_only","-to",action="store_true")
    parser.add_argument("--metric_string","-ms",default="P_10")
    parser.add_argument("tree_store_dir")
    args=parser.parse_args()

    index_type = IndexType.processed
    eval_data = EvalData(index_type,args.metric_string)
    args.retrieval_method = RetrievalMethod(args.retrieval_method)
    result_dir = R_DIR[index_type][args.retrieval_method]
    print "result dir %s" %(result_dir)
    result_files = get_result_files(result_dir)
    query_data_file = os.path.join(args.tree_estimator_directory,index_type.name,args.retrieval_method.name)
    query_data_file = os.path.join(query_data_file,"data")
    print "get value pair %s" %(query_data_file)
    values = json.load(open(query_data_file))

    all_metrics = {}
    for day in values:
        all_metrics[day] =  eval_data.get_metric(result_files[day])


    silent_query_info = json.load(open(args.silent_query_info_file))
    # print all_metrics
    title_query_data = []
    desc_query_data = []
    query_data = []
    silent_judgments = []

    silent_days = {}
    day = "10"
    for qid in values.values()[0].keys():
        # m = re.search("^(\d+)_",qid)
        # if m:
        #     q_num = int(m.group(1))
        #     if q_num > 650:
        #         continue
        # else:
        #     raise RuntimeError("Mal qid format %s" %(qid))
        day_qid = "10_%s" %(qid)
        # print day_qid
        
        
        # print results[day]
        if args.title_only:
            if "title" not in qid:
                continue
        if qid in all_metrics[day]:
            
            day_query_metric = all_metrics[day][qid]

            m = re.search("^(\d+)_",qid)
            if m:
                q_num = m.group(1)
            else:
                raise RuntimeError("Mal qid format %s" %(qid))
            
            if q_num in silent_query_info :
                silent_days[day_qid] = 1
            else:
                silent_days[day_qid] = 0
        
        else: 
            print "%s query has no metric!" %(qid)
            day_query_metric = .0
            silent_days[day_qid] = 1

        single_data = {}
        single_data["day_qid"] = day_qid
        single_data["metric"] = day_query_metric
        single_data["values"] = values[day][qid]

        if "title" in qid:
            title_query_data.append(single_data)
        else:
            desc_query_data.append(single_data)

        query_data.append(single_data)
        silent_judgments.append( silent_days[day_qid] )
        
    title_tree = load_tree(args.tree_store_dir,QueryPart.title,args.retrieval_method,args.metric_string)
    title_predicted = title_tree.output_result(title_query_data)
    if not args.title_only:
        desc_tree = load_tree(args.tree_store_dir,QueryPart.desc,args.retrieval_method,args.metric_string)
        desc_predicted = desc_tree.output_result(desc_query_data)


    # print "There are %d queries" %(len(query_data))
    # print "%d of them are silent" %(sum(silent_judgments))


    print "There are %d samples" %(len(query_data))
    # print thresholds
        
    num_of_split = 10
    f1_macro_average = .0
    f1_average = .0
    skf = StratifiedKFold(n_splits=num_of_split,shuffle=True)
    for training_index, test_index in skf.split(query_data, silent_judgments):
        all_training_data = []
        training_title_query_data = []
        training_desc_query_data = []


        # print "%d training %d testing" %(len(training_index),len(test_index))
        for i in training_index:
            single_data = deepcopy(query_data[i])
            day_qid = single_data["day_qid"]
                    
            all_training_data.append(single_data )
            if "title" in day_qid:
                training_title_query_data.append(single_data)
            else:
                if not args.title_only:
                    training_desc_query_data.append(single_data)
        
        train_title_predicted = title_tree.output_result(training_title_query_data)
        if not args.title_only:
            train_desc_predicted = desc_tree.output_result(training_desc_query_data)
        else:
            train_desc_predicted = {0:0}
        thresholds = get_threshold(train_title_predicted.values(),train_desc_predicted.values(),args.title_only)
        best_tree_threshold = {}
        best_f1_score = -1000
        best_f1_threshold = .0
        for threshold in thresholds:
            sub_training_data = []
            training_pre_y_true = []
            training_pre_y_score = []
            for single_data in all_training_data:
                day_qid = single_data["day_qid"]
                if "title" in day_qid:
                    if (title_predicted[day_qid] <= threshold["title"]):
                        
                        sub_training_data.append(single_data )
                    else:
                        training_pre_y_score.append(1000)
                        training_pre_y_true.append(silent_days[day_qid])
                else:
                    if not args.title_only:
                        if (desc_predicted[day_qid]  <= threshold["desc"]):
                            sub_training_data.append(single_data) 
                        else:
                            training_pre_y_score.append(1000)
                            training_pre_y_true.append(silent_days[day_qid])


            forest = Forest(sub_training_data,args.error_threshold,args.number_of_iterations)
            forest.start_training()

            training_predicted_values = forest.output_result(sub_training_data)
            training_y_true, training_y_score = make_score_prediction_lists(training_predicted_values,silent_days)
            training_y_true  = training_pre_y_true + training_y_true
            training_y_score  = training_pre_y_score + training_y_score
            threshold_best_f1_threshold, threshold_best_f1_score = get_best_f1_threshold(training_y_true, training_y_score)
            if threshold_best_f1_score > best_f1_score:
                best_tree_threshold = threshold
                best_f1_score = threshold_best_f1_score
                best_f1_threshold = threshold_best_f1_threshold
        
        print "best f1 threshold:%f, best f1 %f:" %(best_f1_threshold,best_f1_score)
        print best_tree_threshold

        testing_data = []
        testing_pre_y_true = []
        testing_pre_y_score = []

        for j in test_index:
            single_data = deepcopy(query_data[j])
            day_qid = single_data["day_qid"]
                    

            if "title" in day_qid:
                if (title_predicted[day_qid] <= best_tree_threshold["title"]):
                        
                    testing_data.append(single_data )
                else:
                    testing_pre_y_score.append(1000)
                    testing_pre_y_true.append(silent_days[day_qid])
            else:
                if not args.title_only:
                    if (desc_predicted[day_qid] <= best_tree_threshold["desc"]):
                            
                        testing_data.append(single_data )
                    else:
                        testing_pre_y_score.append(1000)
                        testing_pre_y_true.append(silent_days[day_qid])

        # test_forest = Forest(testing_data,args.error_threshold,args.number_of_iterations)
        # test_forest.start_training()

        test_predicted_values = forest.output_result(testing_data)
        testing_y_true, testing_y_score = make_score_prediction_lists(test_predicted_values,silent_days)
        testing_y_true  = testing_pre_y_true + testing_y_true
        testing_y_score  = testing_pre_y_score + testing_y_score
        test_y_predict = []
        for single_score in testing_y_score:
            if single_score < best_f1_threshold:
                test_y_predict.append(1)
            else:
                test_y_predict.append(0)
        f1_macro_average += f1(testing_y_true, test_y_predict,average="macro")/(1.0*num_of_split)
        f1_average += f1(testing_y_true, test_y_predict)/(1.0*num_of_split)

    

    print "Positive f1: %f" %(f1_average)
    print "Average f1: %f" %(f1_macro_average)
    print "-"*20