Example #1
import numpy as np
from skmultilearn.adaptation import MLkNN
# pad_sequences ships with Keras; adjust the import to your Keras/TF version.
from keras.preprocessing.sequence import pad_sequences

# `du` (dataset utilities), `tokenize_data`, and `MAX_NB_WORDS` are assumed
# to be defined elsewhere in the source module.
def get_cado_predictions():
    data_path = '../../datasets/cado/train.csv'
    test_path = '../../datasets/cado/test.csv'

    data = du.load_data(data_path)
    test = du.load_data(test_path)

    # Column 6 holds the document text; the 12 binary label columns start at 7
    text_index = 6
    label_start_index = 7
    X = [d[text_index] for d in data]
    labels = [d[label_start_index:label_start_index + 12] for d in data]

    X_test = [d[text_index] for d in test]
    labels_test = [d[label_start_index:label_start_index + 12] for d in test]

    Y = np.array(labels, dtype='int')
    y_test = np.array(labels_test, dtype='int')

    # Concatenate train and test so they share one tokenizer and padding pass;
    # `test_index` records where to split them apart again.
    test_index = len(X)

    X = X + X_test
    Y = np.vstack([Y, y_test])

    tokenizer = tokenize_data(X)
    word_index = tokenizer.word_index

    sequences = tokenizer.texts_to_sequences(X)

    X = pad_sequences(sequences,
                      maxlen=700,
                      padding="post",
                      truncating="post",
                      value=0)

    # 1-dimensional indicator "embedding" matrix; it is built here but never
    # used by the MLkNN classifier below.
    num_words = min(MAX_NB_WORDS, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, 1))

    for word, i in word_index.items():
        if i >= MAX_NB_WORDS:
            continue
        embedding_matrix[i] = 1

    # Split the concatenated matrices back into train and test
    X_train = X[0:test_index, :]
    Y_train = Y[0:test_index, :]
    x_test = X[test_index:len(X), :]
    y_test = Y[test_index:len(Y), :]

    classifier = MLkNN()
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(x_test)
    scores = classifier.predict_proba(x_test)
    y_pred = predictions.toarray()
    y_score = scores.toarray()

    return y_pred, y_score
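
For reference, MLkNN above is scikit-multilearn's adaptation of k-nearest neighbours to multi-label data. A minimal self-contained sketch of the fit/predict/predict_proba cycle used throughout these examples (synthetic data, shapes chosen arbitrarily):

import numpy as np
from skmultilearn.adaptation import MLkNN

# 100 samples, 20 features, 5 binary labels (all synthetic)
rng = np.random.RandomState(0)
X_demo = rng.rand(100, 20)
Y_demo = rng.randint(0, 2, size=(100, 5))

clf = MLkNN(k=3)
clf.fit(X_demo, Y_demo)

# Both calls return scipy sparse matrices, hence .toarray()
hard = clf.predict(X_demo[:10]).toarray()        # 0/1 label assignments
soft = clf.predict_proba(X_demo[:10]).toarray()  # per-label probabilities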
Example #2
    # Presumably a @staticmethod on the enclosing class in the source project.
    def RecommendByMLKNN(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """MLkNN algorithm: recommend labels by multi-label k-nearest neighbours."""
        classifier = MLkNN(k=train_data_y.shape[1])
        classifier.fit(train_data, train_data_y)

        # Convert the sparse probability matrix into a dense ndarray
        predictions = classifier.predict_proba(test_data).todense()
        predictions = numpy.asarray(predictions)

        recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                             recommendNum)
        answerList = test_data_y

        # Debug output
        print(predictions)
        print(test_data_y)
        print(recommendList)
        print(answerList)
        return [recommendList, answerList]
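
DataProcessUtils.getListFromProbable is project-specific; from its use here it appears to return, for each test row, the recommendNum labels (1-indexed) with the highest predicted probability. A hedged stand-in with that behaviour (name and semantics assumed, not taken from the source project):

import numpy

def get_list_from_probable(predictions, label_ids, recommend_num):
    # For each row, take the `recommend_num` highest-probability labels,
    # mapped through `label_ids` (1-indexed in the example above)
    label_ids = numpy.asarray(label_ids)
    top = numpy.argsort(-predictions, axis=1)[:, :recommend_num]
    return [label_ids[row].tolist() for row in top]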
Example #3
import os
import logging

import numpy as np
import pandas as pd
import torch
from skorch import NeuralNet
from skmultilearn.adaptation import MLkNN

# `constants`, `Dataset`, `Preprocessing`, `tab_printer`, `evaluation`, and
# the remaining helpers are assumed to be defined in the surrounding project.
def run():
    parser = get_arg_parser()
    cmd_args = parser.parse_args()

    if cmd_args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cmd_args.gpu)
        gpunum = os.getenv('CUDA_VISIBLE_DEVICES')
        logging.info("GPU has been set to {}".format(gpunum))

    logging.info("Model used for the regression network: {}"
                 .format(cmd_args.model_name))

    # 1. Dataset retrieval
    # --------------------

    tab_printer(constants.Dataset)
    dataset = Dataset(nrows=constants.Dataset.nrows,
                      augment_labels=constants.Dataset.augment_labels,
                      top_n=constants.Dataset.top_n)

    logging.info("Going to create vocabulary and fit a preprocessing pipeline"
                 "using {} samples. Settings will be listed below"
                 .format(len(dataset.X_train)))

    # 2. Preprocessing
    # -----------------

    tab_printer(constants.NLP)
    preprocessor = Preprocessing(dataset.X_train)

    # Preprocess documents
    X_train = preprocessor.transform_documents(dataset.X_train)
    X_test = preprocessor.transform_documents(dataset.X_test)

    # 3. Word embeddings with word2vec
    # --------------------------------

    # Train word2vec embeddings if the train_word2vec option is selected
    if cmd_args.train_word2vec:
        utils.embeddings.main()
    weights = get_embedding_tensor(preprocessor)

    # 4. Node embeddings with AttentionWalk
    # -------------------------------------
    args = _generate_deepwalk_parameters(dataset.y_train_graph)
    if cmd_args.train_attentionwalk:
        train_attention_walk(args)

    graph_embeddings = pd.read_csv(args.embedding_path).iloc[:, 1:].values

    # Get document representations using node embeddings
    y_embedded = _get_label_embeddings(dataset.y_train, graph_embeddings)
    y_test_embedded = _get_label_embeddings(dataset.y_test, graph_embeddings)

    # 5. Regressor Training
    # ---------------------

    # Note: CUDA_VISIBLE_DEVICES must have been set (e.g. via --gpu above),
    # otherwise this builds the invalid device string 'cuda:None'.
    device = 'cuda:' + str(os.getenv("CUDA_VISIBLE_DEVICES")) \
        if torch.cuda.is_available() else 'cpu'

    regressor_nn = NeuralNet(
        get_network_class(cmd_args.model_name),
        max_epochs=constants.NeuralNetworkTraining.epochs,
        lr=constants.NeuralNetworkTraining.learning_rate,
        batch_size=constants.NeuralNetworkTraining.batch_size,
        optimizer=torch.optim.Adam,
        criterion=torch.nn.MSELoss,

        module__output_dim=args.dimensions,
        module__embedding=weights,
        module__embedding_dim=constants.NLP.embedding_size,

        device=device,
        train_split=None,
    )

    # Train the regressor neural network
    regressor_nn.fit(X_train, y_embedded.astype(np.float32))

    # 6. Train Multi-label KNN algorithm
    # ----------------------------------

    tab_printer(constants.MLKNN)

    # Train multi-label KNN to turn label embeddings into label predictions
    classifier = MLkNN(k=constants.MLKNN.k, s=constants.MLKNN.s)
    classifier.fit(y_embedded, dataset.y_train)

    # 7. Evaluation
    # -------------

    # Label prediction with documents
    y_test_pred = regressor_nn.predict(X_test)
    preds = classifier.predict(y_test_pred)
    preds_raw = classifier.predict_proba(y_test_pred)

    # Label prediction with label embeddings
    preds_w_labels = classifier.predict(y_test_embedded)
    preds_w_labels_raw = classifier.predict_proba(y_test_embedded)

    # Log evaluation result with label embeddings
    eval_metrics_w_labels = evaluation \
        .all_metrics(preds_w_labels.toarray(),
                     dataset.y_test,
                     yhat_raw=preds_w_labels_raw.toarray())

    logging.info(str(eval_metrics_w_labels))

    # Log evaluation result with documents
    report_evaluation(preds.toarray(),
                      dataset.y_test,
                      yhat_raw=preds_raw.toarray())
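
The pipeline in Example #3 decodes regressed label embeddings back into discrete labels with MLkNN. Stripped of the project-specific plumbing, the two-stage pattern is roughly this (all names and shapes below are illustrative, not from the source):

import numpy as np
from skmultilearn.adaptation import MLkNN

# Pretend we already have label-embedding vectors for the training set
# (one row per document) and the true binary label matrix.
train_label_embeddings = np.random.rand(200, 64)   # e.g. AttentionWalk output
y_train = np.random.randint(0, 2, size=(200, 10))

# Stage 1 (not shown): a regressor maps documents -> embedding space.
# Stage 2: MLkNN maps points in embedding space -> label sets.
decoder = MLkNN(k=10, s=1.0)
decoder.fit(train_label_embeddings, y_train)

predicted_embeddings = np.random.rand(50, 64)      # regressor output on test docs
y_pred = decoder.predict(predicted_embeddings).toarray()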
Example #4
import datetime
from time import time

import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from skmultilearn.adaptation import MLkNN
# MultilabelStratifiedKFold comes from the iterative-stratification package.
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

def mlknn_train_pred(k_list,
                     df_train_x,
                     df_train_y,
                     df_test_x,
                     df_test_y,
                     target_cols,
                     NFOLDS=5):
    """
    This function z-score normalizes the train and test data, split the train data in K-folds and run the 
    Multilabel KNN on the folds to choose the best "K", thereafter predicting on the K-fold train data and
    test set using the Best K, averaging out the predictions across all folds for the test set.
    
    Args:
            k_list: A list of "K" nearest neighbours to perform gridsearch on
            df_train_x: train data with only phenotypic/morphological features - pandas dataframe.
            df_train_y: train data with only the MOA (Mechanism of actions) target labels - pandas dataframe.
            df_test_x: test data with only phenotypic/morphological features - pandas dataframe.
            df_test_y: test data with only the MOA (Mechanism of actions) target labels- pandas dataframe.
            target_cols: A list of MOA (Mechanism of actions) target labels
            NFOLDS: A value that represent number of K-subset/cross-validation we want to perform
    
    Returns:
            oof_preds: Train out-of-fold predictions - pandas dataframe.
            test_preds: Test predictions - pandas dataframe.

    """

    sc = StandardScaler()
    df_train_x_scaled = pd.DataFrame(sc.fit_transform(df_train_x),
                                     columns=df_train_x.columns)
    df_test_x_scaled = pd.DataFrame(sc.transform(df_test_x),
                                    columns=df_test_x.columns)

    acc_losses = []
    oof_preds = pd.DataFrame(np.zeros(shape=(df_train_y.shape)),
                             columns=target_cols)
    test_preds = pd.DataFrame(np.zeros(shape=(df_test_y.shape)),
                              columns=target_cols)
    skf = MultilabelStratifiedKFold(n_splits=NFOLDS,
                                    shuffle=True,
                                    random_state=133)

    print('Execution time | Fold number | logloss | Best K |')
    for fn, (trn_idx, val_idx) in enumerate(
            skf.split(df_train_x_scaled, df_train_y)):
        start_time = time()
        X_train, X_val = (df_train_x_scaled.loc[trn_idx, :],
                          df_train_x_scaled.loc[val_idx, :])
        y_train, y_val = (df_train_y.iloc[trn_idx, :],
                          df_train_y.iloc[val_idx, :])

        # Grid-search k on this fold, keeping the out-of-fold predictions
        # from the best-scoring k
        best_k = 0
        best_loss = np.inf
        for k_item in k_list:
            classifier = MLkNN(k=k_item)
            classifier.fit(X_train.values, y_train.values)
            val_preds = classifier.predict_proba(X_val.values)
            loss = log_loss(np.ravel(y_val), np.ravel(val_preds.toarray()))
            if loss < best_loss:
                best_loss = loss
                best_k = k_item
                oof_preds.iloc[val_idx, :] = val_preds.toarray()

        # Refit with the best k and average this fold's test predictions in
        classifier = MLkNN(k=best_k)
        classifier.fit(X_train.values, y_train.values)
        acc_losses.append(best_loss)
        preds = classifier.predict_proba(df_test_x_scaled.values)
        test_preds += preds.toarray() / NFOLDS
        print('{}\t\t{}\t\t{:.5f}\t\t{}'.format(
            str(datetime.timedelta(seconds=time() - start_time))[:7], fn,
            best_loss, best_k))

    return oof_preds, test_preds
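
A hypothetical call to mlknn_train_pred, assuming the feature and label dataframes have already been built (the dataframe variables below are placeholders matching the parameter names):

# Grid of neighbour counts to search over
k_list = [5, 7, 9, 11]

oof_preds, test_preds = mlknn_train_pred(k_list,
                                         df_train_x,   # train features
                                         df_train_y,   # train MOA labels
                                         df_test_x,    # test features
                                         df_test_y,    # test MOA labels
                                         target_cols,  # label column names
                                         NFOLDS=5)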
# `predicts` is the sparse MLkNN output and `ss` the sample-submission
# dataframe from earlier in the notebook.
k = pd.DataFrame(predicts.todense())
ss[TARGET_COLS] = k
ss.to_csv(r"C:\Users\Sheeja Ayoob\Desktop\hacklive_NLP_sub7.csv", index=False)
--------------------------------------------------------------------------------------------

# Optimal per-label thresholds
def get_best_thresholds(true, preds):
  # Search thresholds 0.00-0.99 for each label and keep the one that
  # maximises that label's F1 score
  thresholds = [i / 100 for i in range(100)]
  best_thresholds = []
  for idx in range(true.shape[1]):  # one threshold per label (25 here)
    f1_scores = [f1_score(true[:, idx], (preds[:, idx] > thresh) * 1) for thresh in thresholds]
    best_thresh = thresholds[np.argmax(f1_scores)]
    best_thresholds.append(best_thresh)
  return best_thresholds

val_preds = mlknn_classifier.predict_proba(X_val_tfidf)
val_preds = val_preds.toarray()

best_thresholds = get_best_thresholds(y_val, val_preds)

# Binarize the validation probabilities with the per-label thresholds
for i, thresh in enumerate(best_thresholds):
  val_preds[:, i] = (val_preds[:, i] > thresh) * 1

f1_score(y_val, val_preds, average='micro')


preds_test = mlknn_classifier.predict_proba(X_test1_tfidf)
# predict_proba returns a sparse matrix; densify before thresholding
preds_test = preds_test.toarray()

for i, thresh in enumerate(best_thresholds):
  preds_test[:, i] = (preds_test[:, i] > thresh) * 1