Ejemplo n.º 1
0
def results_from_folder(folder_name, out_obj_folder, file_keyword, num_classes,
                        line_keyword):
    """Extract and save a feature matrix for every matching file in a folder.

    Every entry returned by ``list_files(folder_name)`` whose name contains
    ``file_keyword`` (names starting with '.' are skipped) is handed to
    ``results_from_file``; the returned matrix is wrapped in a one-element
    list and written with ``save_obj``.

    Args:
        folder_name: Input folder. It is concatenated directly with the file
            name, so it has to end with a path separator.
        out_obj_folder: Prefix that is concatenated with the output file name.
        file_keyword: Substring a file name must contain to be processed.
        num_classes: Not used by this function; kept so existing callers
            keep working.
        line_keyword: Forwarded unchanged to ``results_from_file``.

    Returns:
        None. Results are only printed and saved.
    """
    for file_name in list_files(folder_name):
        if file_name.startswith('.') or file_keyword not in file_name:
            continue
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(file_name)
        feature_matrix = results_from_file(folder_name + file_name,
                                           line_keyword)
        print(feature_matrix.shape)
        # The output name is the part of the input name before the first dot.
        out_obj_file = file_name.split('.')[0] + "_top15.out"
        save_obj([feature_matrix], out_obj_folder + out_obj_file)
Ejemplo n.º 2
0
def run_dcpc_main(data_folder,
                  class_column,
                  num_classes,
                  obj_folder,
                  threshold,
                  logger=None):
    """Compute one DCPC object per class for every training file in a folder.

    For each file in ``data_folder`` whose name contains ``"train_"`` the data
    is read with ``file_read_split``, split by class label and passed to
    ``computeDCPC``. The per-class results are collected in a dict keyed by
    label and saved (wrapped in a one-element list) with ``save_obj`` under
    ``obj_folder``, using the training file name with ``.txt`` replaced by
    ``_dcpc.obj``.

    Args:
        data_folder: Input folder; concatenated directly with the file name.
        class_column: Not used by this function; kept for compatibility.
        num_classes: Not used by this function; the label range is taken
            from the labels found in each file.
        obj_folder: Prefix concatenated with the output object file name.
        threshold: Forwarded unchanged to ``computeDCPC``.
        logger: Logger to report progress to. When ``None`` a logger is
            created with ``init_logging('')``.

    Returns:
        None. Results are only logged and saved.
    """
    if logger is None:
        logger = init_logging('')

    for train_file in list_files(data_folder):
        if "train_" not in train_file:
            continue
        logger.info(train_file)
        out_obj_file = train_file.replace('.txt', '_dcpc.obj')

        x_matrix, y_vector = file_read_split(data_folder + train_file)
        min_class = min(y_vector)
        max_class = max(y_vector) + 1
        logger.info("x matrix tran after shape: " + str(x_matrix.shape))

        # A fresh dict per file: with a single shared dict, a class that only
        # occurs in an earlier file would leak into this file's saved object.
        out_obj_dict = {}
        for label in range(min_class, max_class):
            label_index = np.where(y_vector == label)[0]
            # x_matrix is indexed with three axes; rows are the samples.
            label_x_matrix = x_matrix[label_index, :, :]
            logger.info("class: " + str(label))
            print("class: " + str(label))
            logger.info("x matrix tran before shape: " +
                        str(label_x_matrix.shape))
            label_dcpc = computeDCPC(label_x_matrix, threshold)
            logger.info("class: " + str(label) + " dcpc shape: " +
                        str(label_dcpc.shape))
            out_obj_dict[label] = label_dcpc
        logger.info("dcpc out obj: " + str(obj_folder + out_obj_file))
        save_obj([out_obj_dict], obj_folder + out_obj_file)
Ejemplo n.º 3
0
def run_dcpc_processing(dcpc_folder, num_classes, method=0, logger=None):
    """Rank attributes per class from saved DCPC objects and save the scores.

    Every object file listed in ``dcpc_folder`` is loaded with ``load_obj``
    and its first element is used as a per-class container. With
    ``method == 0`` each class ``0 .. num_classes - 1`` is scored with
    ``clever_rank``; the attributes are ordered by descending score. For every
    input file a two-element list ``[ranked attributes per class, score dict
    per class]`` is saved under the score folder with the same file name.
    Any other ``method`` value loads the objects but produces no output.

    Args:
        dcpc_folder: Folder with the DCPC objects. Its last character is
            dropped to build the ``..._score/`` output folder name, so it is
            expected to end with a path separator.
        num_classes: Number of class labels to process, starting at 0.
        method: Scoring method selector; only ``0`` is implemented.
        logger: Logger for progress output. When ``None`` a standard-library
            logger is used instead of failing on the first log call.

    Returns:
        The score folder as returned by ``init_folder``.
    """
    if logger is None:
        # Previously a missing logger crashed on the first logger.info call.
        import logging
        logger = logging.getLogger(__name__)

    logger.info('obj folder:' + dcpc_folder)
    dcpc_list = list_files(dcpc_folder)
    logger.info(dcpc_list)
    score_folder = init_folder(dcpc_folder[:-1] + "_score/")

    for dcpc_obj in dcpc_list:
        dcpc = load_obj(dcpc_folder + dcpc_obj)[0]
        if method != 0:
            continue

        out_label_array = []
        out_label_dict = {}
        for label in range(0, num_classes):
            logger.info('class: ' + str(label))
            label_dcpc = dcpc[label]
            logger.info("dcpc shape: " + str(label_dcpc.shape))
            attr_score = clever_rank(label_dcpc, logger)
            logger.info(attr_score)
            # Highest score first; only the attribute keys are kept.
            ranked_items = sorted(attr_score.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
            sorted_attr = [item[0] for item in ranked_items]
            out_label_array.append(sorted_attr)
            out_label_dict[label] = attr_score
            logger.info(sorted_attr)
            logger.info(attr_score)

        save_obj([out_label_array, out_label_dict], score_folder + dcpc_obj)
        logger.info("score obj: " + score_folder + dcpc_obj)

    return score_folder
Ejemplo n.º 4
0
def global_cnn_lda_feature_main(parameter_file, method):
    """Derive a global feature ranking from saved CNN layer outputs via LDA.

    Settings are read with ``read_global_feature_generation_parameter``. For
    the first file in the pickle folder whose name contains ``"train"`` the
    saved layer output is loaded, squeezed, flattened to two dimensions and
    passed with the training labels to ``gene_global_lda_feature``. The
    collected per-file feature vectors are combined with
    ``majority_vote_index`` and the resulting index is saved to
    ``<out_obj_folder><method>_global_feature.pckl``.

    Args:
        parameter_file: Path handed to the parameter reader.
        method: Method label; used in the log file name, the log output and
            the output object file name.

    Returns:
        Tuple ``(ret_feature_index, overall_time)`` where ``overall_time`` is
        the summed run time (seconds) reported by the feature generation plus
        the time spent in the majority vote.
    """
    (data_keyword, data_folder, attr_num, attr_len, class_column, start_class,
     num_classes, pckl_folder, log_folder, log_postfix,
     out_obj_folder) = read_global_feature_generation_parameter(parameter_file)

    log_file = log_folder + data_keyword + '_' + method + log_postfix
    logger = init_logging(log_file)
    header_entries = (
        ('METHOD: ', method),
        ('DATA KEYWORD: ', data_keyword),
        ('ATTRIBUTE NUMBER: ', attr_num),
        ('ATTRIBUTE LENGTH: ', attr_len),
        ('CLASS NUMBER: ', num_classes),
        ('CLASS COLUMN: ', class_column),
        ('START CLASS: ', start_class),
        ('PCKL FOLDER: ', pckl_folder),
        ('LOG FOLDER: ', log_folder),
        ('LOG POSTFIX: ', log_postfix),
        ('OUTPUT FOLDER: ', out_obj_folder),
    )
    for caption, value in header_entries:
        logger.info(caption + str(value))

    overall_time = 0
    ret_feature_array = []
    file_count = 0
    for train_file_pckl in listFiles(pckl_folder):
        if "train" not in train_file_pckl:
            continue
        # The data file name is the pickle name cut right after its first
        # '.txt'; a pickle name without '.txt' raises ValueError here.
        train_file = train_file_pckl[0:train_file_pckl.index('.txt')] + '.txt'
        logger.info("PCKL FILE: " + train_file_pckl)
        logger.info("DATA FILE: " + train_file)

        # Only the labels of the data file are needed.
        _, train_y_vector = readFile(data_folder + train_file)

        out_matrix, weight_matrix, bias_vector = load_obj(pckl_folder +
                                                          train_file_pckl)
        out_matrix = np.squeeze(out_matrix)
        # The squeezed output must be 3-D; attr_num/attr_len from the
        # parameter file are replaced by the dimensions found in the data.
        row_num, attr_num, attr_len = out_matrix.shape
        out_matrix = out_matrix.reshape(row_num, attr_num * attr_len)
        if file_count == 0:
            logger.info('layer out matrix shape: ' + str(out_matrix.shape))
            logger.info('weight matrix shape: ' + str(weight_matrix.shape))
            logger.info('bias vector shape: ' + str(bias_vector.shape))

        feature_index_vector, run_time = gene_global_lda_feature(
            out_matrix, train_y_vector, attr_num, logger)
        overall_time = overall_time + run_time
        logger.info(feature_index_vector.shape)
        ret_feature_array.append(feature_index_vector)

        file_count = file_count + 1
        # NOTE(review): only the first training file is processed because of
        # this unconditional break - confirm that this is intended.
        break

    ret_feature_array = np.matrix(ret_feature_array)
    logger.info(ret_feature_array.shape)
    logger.info("return feature array samples:")
    logger.info("\n" + str(ret_feature_array[0:4, 0:6]))

    start_time = time.time()
    ret_feature_index, ret_feature_value = majority_vote_index(
        ret_feature_array, -1)
    overall_time = overall_time + time.time() - start_time
    logger.info("\n" + str(ret_feature_index[0:6]))
    logger.info(method + " global feature run time (sec): " +
                str(overall_time))
    obj_file = out_obj_folder + method + "_global_feature.pckl"
    save_obj(ret_feature_index, obj_file)
    return ret_feature_index, overall_time
Ejemplo n.º 5
0
def global_lda_pca_feature_main(parameter_file, method):
    """Derive a global feature ranking from the raw data via LDA or PCA.

    Settings are read with ``read_global_feature_generation_parameter``. For
    every file in the data folder whose name contains ``"train"`` the data is
    read with ``readFile`` and passed to ``gene_global_lda_feature`` or
    ``gene_global_pca_feature``. The per-file feature vectors are combined
    with ``majority_vote_index`` and the resulting index is saved to
    ``<out_obj_folder>_<method>_global_feature.pckl``.

    Args:
        parameter_file: Path handed to the parameter reader.
        method: ``'lda'`` or ``'pca'``. Also used in the log file name and
            the output object file name.

    Returns:
        Tuple ``(ret_feature_index, overall_time)`` where ``overall_time`` is
        the summed run time (seconds) reported by the feature generation plus
        the time spent in the majority vote.

    Raises:
        ValueError: If a training file is found and ``method`` is neither
            ``'lda'`` nor ``'pca'``.
    """
    (data_keyword, data_folder, attr_num, attr_len, class_column, start_class,
     num_classes, pckl_folder, log_folder, log_postfix,
     out_obj_folder) = read_global_feature_generation_parameter(parameter_file)

    log_file = log_folder + data_keyword + '_' + method + log_postfix
    logger = init_logging(log_file)
    header_entries = (
        ('METHOD: ', method),
        ('DATA KEYWORD: ', data_keyword),
        ('ATTRIBUTE NUMBER: ', attr_num),
        ('ATTRIBUTE LENGTH: ', attr_len),
        ('CLASS NUMBER: ', num_classes),
        ('CLASS COLUMN: ', class_column),
        ('START CLASS: ', start_class),
        ('PCKL FOLDER: ', pckl_folder),
        ('LOG FOLDER: ', log_folder),
        ('LOG POSTFIX: ', log_postfix),
        ('OUTPUT FOLDER: ', out_obj_folder),
    )
    for caption, value in header_entries:
        logger.info(caption + str(value))

    overall_time = 0
    ret_feature_array = []
    file_count = 0
    for train_file in listFiles(data_folder):
        if "train" not in train_file:
            continue

        logger.info(train_file)
        x_matrix, y_vector = readFile(data_folder + train_file, class_column)

        if file_count == 0:
            logger.info("x data matrix shape: " + str(x_matrix.shape))
            logger.info("y vector shape: " + str(y_vector.shape))

        if method == 'lda':
            feature_index_vector, run_time = gene_global_lda_feature(
                x_matrix, y_vector, attr_num, logger)
        elif method == 'pca':
            feature_index_vector, run_time = gene_global_pca_feature(
                x_matrix, attr_num, logger)
        else:
            # Without this branch an unknown method failed a few lines later
            # with an UnboundLocalError on run_time.
            raise ValueError("unsupported method %r: expected 'lda' or 'pca'"
                             % (method, ))
        overall_time = overall_time + run_time
        ret_feature_array.append(feature_index_vector)

        file_count = file_count + 1

    ret_feature_array = np.matrix(ret_feature_array)
    logger.info(ret_feature_array.shape)
    logger.info("ret_feature_array samples:")
    logger.info("\n" + str(ret_feature_array[0:4, :]))

    start_time = time.time()
    ret_feature_index, ret_feature_value = majority_vote_index(
        ret_feature_array, -1)
    overall_time = overall_time + time.time() - start_time
    logger.info("\n" + str(ret_feature_index[0:6]))
    logger.info("global feature run time (sec): " + str(overall_time))
    obj_file = out_obj_folder + '_' + method + "_global_feature.pckl"
    logger.info("global feature saved to: " + str(obj_file))
    save_obj(ret_feature_index, obj_file)
    return ret_feature_index, overall_time
Ejemplo n.º 6
0
def multi_proj_feature_classification(
        parameter_file,
        file_keyword,
        function_keyword="multi_proj_feature_classification"):
    """Run projected-feature classification for the configured method.

    The settings are read with ``read_feature_classification`` and the log
    folder is passed through ``init_folder``. Only the ``'cnn'`` method is
    supported: it is delegated to ``projected_cnn_classification_main``.

    Args:
        parameter_file: Path handed to the parameter reader and, for the
            ``'cnn'`` method, to ``projected_cnn_classification_main``.
        file_keyword: Forwarded to ``projected_cnn_classification_main``.
        function_keyword: Section keyword handed to the parameter reader.

    Returns:
        The result of ``projected_cnn_classification_main`` when the
        configured method is ``'cnn'``; ``False`` for every other method.
    """
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, obj_folder, top_k, method, log_folder,
     cnn_obj_folder, cnn_temp_folder,
     cnn_setting_file) = read_feature_classification(parameter_file,
                                                     function_keyword)
    # Called for its side effect only; the returned folder is not needed here.
    init_folder(log_folder)

    if method == 'cnn':
        return projected_cnn_classification_main(parameter_file, file_keyword)

    # TODO: non-CNN methods are not supported yet. The generic per-file
    # classification code that used to follow this point could never run
    # (both branches above it returned) and has been removed.
    return False
Ejemplo n.º 7
0
def run_cnn_projected_feature_analysis(feature_folder,
                                       class_id,
                                       data_folder,
                                       data_file_keyword,
                                       method="rf_lda",
                                       log_folder='./'):
    """Analyse projected CNN features per class and save attribute rankings.

    For every data file whose name contains ``data_file_keyword`` the labels
    are read, and for each selected class every feature file whose name
    contains both the data key and ``_class<label>_`` is loaded and analysed
    with the function selected by ``method``. The per-feature-file results
    are stacked into one array and saved (wrapped in a one-element list) in
    the output folder.

    Args:
        feature_folder: Folder with the feature objects. Its last character
            is dropped to build the output folder name, so it is expected to
            end with a path separator.
        class_id: Class to analyse; a negative value selects every class
            between the smallest and largest label of the data file.
        data_folder: Folder with the data files.
        data_file_keyword: Substring a data file name must contain.
        method: One of ``"rf_lda_sum"``, ``"rf"``, ``"lda"`` or ``"cpca"``.
            NOTE(review): the default ``"rf_lda"`` is not one of the
            supported values - confirm the intended default with the caller.
        log_folder: Prefix concatenated with the per-file log name.

    Returns:
        None. Results are only logged and saved.

    Raises:
        ValueError: If a matching feature file is found and ``method`` is not
            one of the supported values.
    """
    data_file_list = list_files(data_folder)
    feature_file_list = list_files(feature_folder)
    out_obj_folder = init_folder(feature_folder[:-1] + "_" + method)
    class_column = 0

    for train_file in data_file_list:
        if data_file_keyword not in train_file:
            continue
        data_key = train_file.replace('.txt', '')
        data_matrix, _ = file_reading(data_folder + train_file)
        train_x_matrix, train_y_vector = x_y_spliting(data_matrix,
                                                      class_column)
        if class_id < 0:
            min_class = min(train_y_vector)
            max_class = max(train_y_vector) + 1
        else:
            min_class = class_id
            max_class = min_class + 1

        # Log file and object file share the same base name.
        run_key = data_key + "_" + method + "_min" + str(
            min_class) + "_max" + str(max_class)
        logger = setup_logger(log_folder + run_key + ".log")
        logger.info('data file: ' + train_file)
        out_obj_file = run_key + ".obj"

        out_obj_matrix = []
        for label in range(min_class, max_class):
            logger.info("class: " + str(label))
            feature_key = "_class" + str(label) + "_"
            for feature_file in feature_file_list:
                if data_key not in feature_file or feature_key not in feature_file:
                    continue
                logger.info("feature file: " + feature_file)
                feature_obj = load_obj(feature_folder + feature_file)
                train_feature = obj_processing(feature_obj[0])
                logger.info("train feature shape: " + str(train_feature.shape))
                # One-vs-rest labels: 1 for the current class, 0 otherwise.
                class_train_y = np.where(train_y_vector == label, 1, 0)
                logger.info("feature method: " + str(method))
                if method == "rf_lda_sum":
                    class_attr_imp_matrix, class_run_time = project_cnn_feature_combined_rf_lda_analysis(
                        train_feature, class_train_y, logger)
                elif method == "rf":
                    class_attr_imp_matrix, class_run_time = project_cnn_feature_combined_rf_analysis(
                        train_feature, class_train_y, logger)
                elif method == "lda":
                    class_attr_imp_matrix, class_run_time = project_cnn_feature_combined_lda_analysis(
                        train_feature, class_train_y, logger)
                elif method == "cpca":
                    class_attr_imp_matrix, class_run_time = project_cnn_feature_combined_cpca_analysis(
                        train_feature, class_train_y, logger)
                else:
                    # Previously this case fell through and failed with an
                    # UnboundLocalError on class_attr_imp_matrix.
                    raise ValueError(
                        "unsupported method %r: expected 'rf_lda_sum', "
                        "'rf', 'lda' or 'cpca'" % (method, ))

                if method == "cpca":
                    # The cpca result is used directly as the attribute list.
                    class_attr_list = class_attr_imp_matrix
                else:
                    logger.info("class attr imp matrix shape: " +
                                str(class_attr_imp_matrix.shape))
                    class_attr_list = map_attr_imp_analysis(
                        class_attr_imp_matrix, logger)
                logger.info(class_attr_list)
                out_obj_matrix.append(class_attr_list)

        out_obj_matrix = np.array(out_obj_matrix)
        logger.info("out obj to: " + out_obj_folder + out_obj_file)
        logger.info(out_obj_matrix.shape)
        save_obj([out_obj_matrix], out_obj_folder + out_obj_file)
Ejemplo n.º 8
0
def cnn_train(train_x_matrix,
              train_y_matrix,
              test_x_matrix,
              test_y_matrix,
              num_classes,
              cnn_setting,
              input_x_placeholder,
              output_y_placeholder,
              logits_out,
              keep_prob,
              keeped_feature_list,
              saver_file="./",
              logger=None):
    """Train an already-built CNN graph and optionally export its features.

    The function builds the loss and evaluation ops on top of ``logits_out``,
    trains with Adam in mini-batches, evaluates on the test data every 100
    steps, checkpoints the best model so far and finally returns the
    prediction probabilities for the test data.

    Args:
        train_x_matrix: Training inputs; indexed with four axes, samples
            along the first one.
        train_y_matrix: Training labels, one row per sample; the class is
            obtained with ``argmax`` over axis 1.
        test_x_matrix: Test inputs fed for evaluation and prediction.
        test_y_matrix: Test labels fed for evaluation.
        num_classes: Number of classes. Replaced by ``max label + 1`` of the
            training labels when ``cnn_setting.eval_method == 'f1'``.
        cnn_setting: Settings object; the attributes ``eval_method``,
            ``batch_size``, ``stop_threshold``, ``max_iter``,
            ``feature_method``, ``out_obj_folder`` and ``out_model_folder``
            are read.
        input_x_placeholder: Placeholder fed with the input batches.
        output_y_placeholder: Placeholder fed with the label batches.
        logits_out: Logits tensor of the network.
        keep_prob: Placeholder that is fed with 1 in every run of this
            function (training included).
        keeped_feature_list: Tensors evaluated after training to export
            features; only the first one is evaluated on the training data.
        saver_file: File name appended to ``cnn_setting.out_model_folder``
            (checkpoint) and ``cnn_setting.out_obj_folder`` (feature object).
        logger: Logger for progress output; created with
            ``init_logging('')`` when ``None``.

    Returns:
        Tuple ``(best_eval_value, train_run_time, test_run_time,
        cnn_predict_proba, saver_file, feature_obj_file)``. The last element
        is ``''`` when ``cnn_setting.feature_method == 'none'``.
    """
    if logger is None:
        logger = init_logging('')
    min_class = 0
    eval_method = cnn_setting.eval_method
    batch_size = cnn_setting.batch_size
    stop_threshold = cnn_setting.stop_threshold
    max_iter = cnn_setting.max_iter
    feature_method = cnn_setting.feature_method
    # The incoming saver_file is used as a bare file name for both outputs.
    feature_obj_file = cnn_setting.out_obj_folder + saver_file
    saver_file = cnn_setting.out_model_folder + saver_file
    predict_y_proba = tf.nn.softmax(logits_out)
    prediction = tf.argmax(predict_y_proba, 1)
    actual = tf.argmax(output_y_placeholder, 1)
    correct_prediction = tf.equal(prediction, actual)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    if eval_method == 'f1':
        train_y_vector = np.argmax(train_y_matrix, axis=1)
        train_class_index_dict, train_min_length, train_max_length = class_label_vector_checking(
            train_y_vector)
        min_class = 0
        max_class = max(train_y_vector)
        num_classes = max_class + 1
        if max_class == 1:
            # Two classes (labels 0 and 1): F1 is built from the confusion
            # counts of the predicted vs. actual class indices.
            TP = tf.count_nonzero(prediction * actual, dtype=tf.float32)
            TN = tf.count_nonzero((prediction - 1) * (actual - 1),
                                  dtype=tf.float32)
            FP = tf.count_nonzero(prediction * (actual - 1), dtype=tf.float32)
            FN = tf.count_nonzero((prediction - 1) * actual, dtype=tf.float32)
            precision = (TP) / (TP + FP)
            recall = (TP) / (TP + FN)
            f1 = (2 * precision * recall) / (precision + recall)
            eval_method_value = f1
            eval_method_keyword = "f1"
        else:
            # More than two classes: accuracy is used even in 'f1' mode.
            eval_method_value = accuracy
            eval_method_keyword = "acc with batch"
        # Per-class weights, recomputed for every batch in the training loop.
        coefficient_placeholder = tf.placeholder(tf.float32,
                                                 shape=[num_classes])
        cross_entropy = tf.reduce_mean(
            tf.nn.weighted_cross_entropy_with_logits(
                targets=output_y_placeholder,
                logits=logits_out,
                pos_weight=coefficient_placeholder))
    else:
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=output_y_placeholder, logits=logits_out))
        eval_method_value = accuracy
        eval_method_keyword = "acc"
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    cnn_session = tf.InteractiveSession()
    cnn_session.run(tf.global_variables_initializer())

    test_eval_value = 0
    best_eval_value = 0
    i = 0
    start = 0
    epoch = 0
    end = batch_size
    batch_each_class = int(batch_size / num_classes)
    overall_len = len(train_y_matrix)
    saver = tf.train.Saver()
    train_run_time = 0
    # Seeding with the epoch number makes the shuffling reproducible.
    np.random.seed(epoch)
    batch_index = np.random.permutation(overall_len)
    logger.info("Random Epoch:" + str(epoch) + str(batch_index[0:5]))
    # Counts, per class, how often a batch contained no sample of that class.
    f1_unbalance_count = np.zeros(num_classes)
    second_chance = False
    re_init = False
    # test_eval_value is only refreshed every 100 steps (see below).
    while (test_eval_value < stop_threshold):
        if start >= overall_len:
            # All samples were consumed: start a new epoch with a new shuffle.
            start = 0
            end = start + batch_size
            epoch = epoch + 1
            np.random.seed(epoch)
            # NOTE(review): this logs batch_index before it is re-permuted,
            # i.e. the previous epoch's permutation - confirm intent.
            logger.info("Random Epoch:" + str(epoch) + str(batch_index[0:5]))
            batch_index = np.random.permutation(overall_len)
        elif end > overall_len:
            end = overall_len
        batch_x_matrix = train_x_matrix[batch_index[start:end], :, :, :]
        batch_y_matrix = train_y_matrix[batch_index[start:end], :]

        if eval_method == 'f1':
            if i == 0:
                logger.info("Batch controlled")
            # Batch-controlled weighting: the batch is topped up (or trimmed)
            # per class and the class weights are derived from the resulting
            # per-class sample counts.
            batch_y_vector = np.argmax(batch_y_matrix, axis=1)
            batch_class_index_dict, batch_min_length, batch_max_length = class_label_vector_checking(
                batch_y_vector)
            if i < 3:
                logger.info("class index before: ")
                logger.info(batch_class_index_dict)
            coefficients_vector = []
            batch_class_index_dict_keys = batch_class_index_dict.keys()
            for c_label in range(min_class, max_class + 1):
                if c_label not in batch_class_index_dict_keys:
                    # Class missing from the batch: append up to
                    # batch_each_class training samples of that class.
                    f1_unbalance_count[
                        c_label] = f1_unbalance_count[c_label] + 1
                    c_label_index = train_class_index_dict[c_label]
                    c_label_index_len = len(c_label_index)
                    add_index_vector_len = 0
                    if c_label_index_len > batch_each_class:
                        # Positions into c_label_index, drawn without
                        # replacement.
                        add_index_vector = np.random.choice(c_label_index_len,
                                                            batch_each_class,
                                                            replace=False)
                        if (i < 3):
                            logger.info("add index vector for c " +
                                        str(c_label))
                            logger.info(add_index_vector)
                        add_index_vector_len = len(add_index_vector)
                        batch_x_matrix = np.concatenate(
                            (batch_x_matrix, train_x_matrix[
                                c_label_index[add_index_vector], :, :, :]),
                            axis=0)
                        batch_y_matrix = np.concatenate(
                            (batch_y_matrix,
                             train_y_matrix[c_label_index[add_index_vector], :]
                             ),
                            axis=0)
                    else:
                        # Too few samples to draw from: append all of them.
                        batch_x_matrix = np.concatenate(
                            (batch_x_matrix,
                             train_x_matrix[c_label_index, :, :, :]),
                            axis=0)
                        batch_y_matrix = np.concatenate(
                            (batch_y_matrix, train_y_matrix[c_label_index, :]),
                            axis=0)
                        add_index_vector_len = c_label_index_len
                else:
                    batch_class_index = batch_class_index_dict[c_label]
                    add_index_vector_len = len(batch_class_index)
                    c_label_index = train_class_index_dict[c_label]
                    c_label_index_len = len(c_label_index)
                    if add_index_vector_len < batch_each_class:
                        # Under-represented class: top up to batch_each_class.
                        add_count = batch_each_class - add_index_vector_len
                        if c_label_index_len > add_count:
                            add_index_vector = np.random.choice(
                                c_label_index_len, add_count, replace=False)
                            if (i < 3):
                                logger.info("add index vector for c " +
                                            str(c_label))
                                logger.info(add_index_vector)
                            add_index_vector_len = add_index_vector_len + len(
                                add_index_vector)
                            batch_x_matrix = np.concatenate(
                                (batch_x_matrix, train_x_matrix[
                                    c_label_index[add_index_vector], :, :, :]),
                                axis=0)
                            batch_y_matrix = np.concatenate(
                                (batch_y_matrix, train_y_matrix[
                                    c_label_index[add_index_vector], :]),
                                axis=0)
                        else:
                            batch_x_matrix = np.concatenate(
                                (batch_x_matrix,
                                 train_x_matrix[c_label_index, :, :, :]),
                                axis=0)
                            batch_y_matrix = np.concatenate(
                                (batch_y_matrix,
                                 train_y_matrix[c_label_index, :]),
                                axis=0)
                            add_index_vector_len = add_index_vector_len + c_label_index_len
                    elif add_index_vector_len > 2 * batch_each_class:
                        # Over-represented class: trim it down to
                        # 2 * batch_each_class samples.
                        # NOTE(review): presumably batch_class_index holds row
                        # positions within the current batch - verify against
                        # class_label_vector_checking.
                        remove_count = (add_index_vector_len -
                                        2 * batch_each_class)
                        remove_index_vector = np.random.choice(
                            batch_class_index, remove_count, replace=False)
                        add_index_vector_len = add_index_vector_len - len(
                            remove_index_vector)
                        batch_x_matrix = np.delete(batch_x_matrix,
                                                   remove_index_vector,
                                                   axis=0)
                        batch_y_matrix = np.delete(batch_y_matrix,
                                                   remove_index_vector,
                                                   axis=0)
                        # Rows were removed, so the per-class index dict is
                        # rebuilt for the classes still to be processed.
                        batch_y_vector = np.argmax(batch_y_matrix, axis=1)
                        batch_class_index_dict, batch_min_length, batch_max_length = class_label_vector_checking(
                            batch_y_vector)
                coefficients_vector.append(float(add_index_vector_len))

            # Weight of a class = largest class count / count of that class.
            coefficients_vector = np.array(coefficients_vector)
            batch_max_len = float(max(coefficients_vector))
            coefficients_vector = batch_max_len / coefficients_vector
            if i < 3:
                batch_y_vector = np.argmax(batch_y_matrix, axis=1)
                batch_class_index_dict, batch_min_length, batch_max_length = class_label_vector_checking(
                    batch_y_vector)
                logger.info("class index after: ")
                logger.info(batch_class_index_dict)
                logger.info("coefficient vector: ")
                logger.info(coefficients_vector)

            # Only the optimizer step itself is counted as training time.
            start_time = time.time()
            train_step.run(
                feed_dict={
                    input_x_placeholder: batch_x_matrix,
                    output_y_placeholder: batch_y_matrix,
                    coefficient_placeholder: coefficients_vector,
                    keep_prob: 1
                })
            train_run_time = train_run_time + time.time() - start_time
        else:
            start_time = time.time()
            train_step.run(
                feed_dict={
                    input_x_placeholder: batch_x_matrix,
                    output_y_placeholder: batch_y_matrix,
                    keep_prob: 1
                })
            train_run_time = train_run_time + time.time() - start_time
        if i % 100 == 0:
            # Periodic check: log a slice of the tensor named "conv_w_0:0"
            # and evaluate on the test data.
            fir_weight_variable = tf.get_default_graph().get_tensor_by_name(
                "conv_w_0:0")
            logger.info("fir weight")
            logger.info(fir_weight_variable.get_shape())
            fir_weight_var_val = cnn_session.run(fir_weight_variable)
            logger.info(fir_weight_var_val[0, 0:5, 0, 0])
            test_eval_value = eval_method_value.eval(
                feed_dict={
                    input_x_placeholder: test_x_matrix,
                    output_y_placeholder: test_y_matrix,
                    keep_prob: 1
                })
            # A NaN score (e.g. F1 with a zero denominator) counts as 0.
            if str(test_eval_value) == 'nan':
                test_eval_value = 0
            print_str = "step " + str(
                i) + ", testing " + eval_method_keyword + ": " + str(
                    test_eval_value)
            logger.info(print_str)
            if best_eval_value < test_eval_value:
                # Save the variables to disk.
                best_eval_value = test_eval_value
                save_path = saver.save(cnn_session, saver_file)
                print_str = "Model saved in file: " + save_path + ' at iteration: ' + str(
                    i)
                logger.info(print_str)

        i = i + 1
        start = end
        end = end + batch_size
        # NOTE(review): max_iter is compared against the epoch counter, not
        # the step counter i - confirm that it is meant as an epoch limit.
        if epoch > max_iter:
            logger.info("best eval value at epoch: " + str(epoch))
            logger.info("best eval value to break")
            logger.info(best_eval_value)
            break

    start_time = time.time()
    test_eval_value = eval_method_value.eval(
        feed_dict={
            input_x_placeholder: test_x_matrix,
            output_y_placeholder: test_y_matrix,
            keep_prob: 1
        })
    test_run_time = time.time() - start_time
    if test_eval_value < best_eval_value:
        # The final weights are worse than the checkpointed ones: reload the
        # best checkpoint into a fresh session.
        cnn_session.close()
        cnn_session = tf.InteractiveSession()
        saver.restore(cnn_session, saver_file)
    else:
        best_eval_value = test_eval_value

    logger.info("Running iteration: %d" % (i))
    logger.info("final best " + eval_method_keyword + ": " +
                str(best_eval_value))
    logger.info(f1_unbalance_count)

    cnn_predict_proba = cnn_session.run(predict_y_proba,
                                        feed_dict={
                                            input_x_placeholder: test_x_matrix,
                                            keep_prob: 1.0
                                        })
    logger.info("CNN model saved: " + str(saver_file))

    if cnn_setting.feature_method == 'none':
        # No feature export requested: return with an empty feature file name.
        cnn_session.close()
        return best_eval_value, train_run_time, test_run_time, cnn_predict_proba, saver_file, ''

    logger.info("feature value generation")
    # All kept feature tensors are evaluated on the test data in one run.
    test_keeped_feature_value_list = cnn_session.run(keeped_feature_list,
                                                     feed_dict={
                                                         input_x_placeholder:
                                                         test_x_matrix,
                                                         keep_prob: 1.0
                                                     })
    logger.info('test feature list ready')
    start = 0
    end = 0
    train_row = len(train_x_matrix)
    train_obj_list = []
    # The training data is processed in chunks of 1000 rows, and only the
    # first kept feature tensor is evaluated on it.
    while (start < train_row):
        logger.info(start)
        end = start + 1000
        if end > train_row:
            end = train_row
        keep_obj = cnn_session.run(keeped_feature_list[0],
                                   feed_dict={
                                       input_x_placeholder:
                                       train_x_matrix[start:end, :, :, :],
                                       keep_prob:
                                       1.0
                                   })
        train_obj_list.append(keep_obj)
        start = end
    logger.info('train feature list ready')
    logger.info(
        "The order of feature value list: fir_out_conv_no_act, fir_out_conv, fir_weight, fir_bias, last_conv, weight_full, bias_full"
    )
    logger.info("All features saved to ")
    logger.info("CNN feature list saved to: " + feature_obj_file)
    # Saved object: [per-chunk train features, test feature value list].
    save_obj([train_obj_list, test_keeped_feature_value_list],
             feature_obj_file)
    cnn_session.close()
    return best_eval_value, train_run_time, test_run_time, cnn_predict_proba, saver_file, feature_obj_file
Ejemplo n.º 9
0
def run_pure_pv_evaluation(
        file_keyword,
        parameter_file='../../parameters/pv_baseline_evaluation.txt',
        function_keyword="pure_pv_evaluation"):
    """Rank attributes per class for every matching data file and save them.

    Settings come from ``read_pure_feature_generation(parameter_file,
    function_keyword)``. Each file in the configured data folder whose name
    contains ``file_keyword`` is read, reshaped to
    ``(rows, attr_num, attr_len)`` and analysed one class at a time with a
    one-vs-rest target vector. The per-class outputs of
    ``map_attr_imp_analysis`` are stacked and written with ``save_obj``; a
    dedicated log file is set up for every processed input file.

    Args:
        file_keyword: Substring a file name must contain to be processed.
        parameter_file: First argument handed to
            ``read_pure_feature_generation``.
        function_keyword: Second argument handed to
            ``read_pure_feature_generation``.

    Raises:
        ValueError: If the configured method is not ``'rf_lda'``, ``'rf'``
            or ``'lda'``.
    """
    settings = read_pure_feature_generation(parameter_file, function_keyword)
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, method, log_folder, out_obj_folder) = settings

    # A single-argument print call produces the same output on Python 2 and
    # Python 3 (space separated values, as the old print statement did).
    print(" ".join(str(value) for value in settings))

    for train_file in list_files(data_folder):
        if file_keyword not in train_file:
            continue
        train_key = train_file.replace('.txt', '')

        # NOTE: the attribute count reported by file_reading replaces the one
        # read from the parameter file for this and all following files.
        data_matrix, attr_num = file_reading(data_folder + train_file)
        train_x_matrix, train_y_vector = x_y_spliting(data_matrix,
                                                      class_column)
        train_row, _ = train_x_matrix.shape
        train_x_matrix = train_x_matrix.reshape(train_row, attr_num, attr_len)
        if class_id < 0:
            # A negative class id selects every label found in the data.
            min_class = min(train_y_vector)
            max_class = max(train_y_vector) + 1
        else:
            min_class = class_id
            max_class = min_class + 1

        file_stem = (train_key + "_" + method + "_min" + str(min_class) +
                     "_max" + str(max_class) + "_pure_projected")
        log_file = file_stem + ".log"
        out_obj_file = file_stem + ".obj"

        logger = setup_logger(log_folder + log_file)
        print("log file: " + log_folder + log_file)
        logger.info(train_file)
        logger.info("min class: " + str(min_class))
        logger.info("max class: " + str(max_class))

        out_obj_matrix = []
        for label in range(min_class, max_class):
            # One-vs-rest target: 1 for the current class, 0 otherwise.
            class_train_y = np.where(train_y_vector == label, 1, 0)
            logger.info("label: " + str(label))
            if method == 'rf_lda':
                class_attr_imp_matrix, _ = project_cnn_feature_combined_rf_lda_analysis(
                    train_x_matrix, class_train_y, logger)
            elif method == "rf":
                class_attr_imp_matrix, _ = project_cnn_feature_combined_rf_analysis(
                    train_x_matrix, class_train_y, logger)
            elif method == "lda":
                class_attr_imp_matrix, _ = project_cnn_feature_combined_lda_analysis(
                    train_x_matrix, class_train_y, logger)
            else:
                # Previously an unknown method fell through and either hit a
                # NameError or silently reused the previous class's matrix.
                raise ValueError("Unsupported method: " + str(method))
            logger.info("class attr imp matrix shape: " +
                        str(class_attr_imp_matrix.shape))
            class_attr_list = map_attr_imp_analysis(class_attr_imp_matrix,
                                                    logger)
            logger.info(class_attr_list)
            logger.info(class_attr_list.shape)
            out_obj_matrix.append(class_attr_list)

        out_obj_matrix = np.array(out_obj_matrix)
        logger.info("out obj to: " + out_obj_folder + out_obj_file)
        logger.info(out_obj_matrix.shape)
        save_obj([out_obj_matrix], out_obj_folder + out_obj_file)
Ejemplo n.º 10
0
def run_lda_proj_feature_main(data_folder,
                              class_column,
                              attr_num,
                              num_classes,
                              lda_proj_obj_file,
                              transpose=False,
                              logger=None):
    """Build LDA-based attribute rankings from every training file in a folder.

    For each file in ``data_folder`` whose name contains ``"train"`` the
    feature matrix returned by ``gene_projected_lda_feature`` is folded into
    one weight per attribute (columns are grouped in consecutive runs of
    ``attr_len = col_num // attr_num``) and every row is turned into an
    attribute ranking, heaviest first. The per-file rankings are merged with
    ``fold_feature_combination_F_C_A``; the per-file weights are summed. Both
    results are written to ``lda_proj_obj_file`` with ``save_obj``.

    Args:
        data_folder: Folder prefix; file names are appended to it directly.
        class_column: Passed to ``readFile`` together with the file path.
        attr_num: Number of attributes the columns are split into.
        num_classes: Unused; kept for interface compatibility.
        lda_proj_obj_file: Destination passed to ``save_obj``.
        transpose: Must be falsy. The per-attribute code path was never
            finished (it always died with a ``NameError``), so it is now
            rejected explicitly.
        logger: Logger to use; ``init_logging('')`` is called when omitted.

    Returns:
        The combined ranking produced by ``fold_feature_combination_F_C_A``.

    Raises:
        NotImplementedError: If ``transpose`` is truthy and a training file
            is found.
    """
    if logger is None:
        logger = init_logging('')

    file_list = listFiles(data_folder)
    overall_time = 0

    ret_lda_feature_array = []
    ret_lda_feature_weight = []

    lda_time = 0
    norm_time = 0
    for train_file in file_list:
        if "train" not in train_file:
            continue
        logger.info(train_file)

        x_matrix, y_vector = readFile(data_folder + train_file, class_column)

        row_num, col_num = x_matrix.shape
        logger.info(x_matrix.shape)
        # Floor division keeps attr_len an integer on Python 2 and Python 3.
        attr_len = col_num // attr_num
        if transpose:
            # The old branch contained debugging residue (a bare undefined
            # name) and could not complete; fail with a clear message instead.
            raise NotImplementedError(
                "transpose=True is not supported by run_lda_proj_feature_main")

        start_time = time.time()
        fold_feature_matrix, fold_norm_time, fold_lda_time = gene_projected_lda_feature(
            x_matrix, y_vector)
        overall_time = overall_time + time.time() - start_time

        logger.info("fold norm: " + str(fold_norm_time))
        logger.info("fold lda: " + str(fold_lda_time))
        norm_time = fold_norm_time + norm_time
        lda_time = fold_lda_time + lda_time

        f_row_num, f_col_num = fold_feature_matrix.shape
        fold_feature_array = np.zeros((f_row_num, attr_num))
        fold_feature_weight_array = np.zeros((f_row_num, attr_num))
        logger.info(fold_feature_array.shape)
        for i in range(f_row_num):
            # Accumulate the columns belonging to each attribute, in column
            # order, so the sums match the original element-wise loop exactly.
            attr_weight = np.zeros(attr_num)
            for j in range(f_col_num):
                attr_index = j // attr_len
                attr_weight[attr_index] = attr_weight[
                    attr_index] + fold_feature_matrix[i, j]

            fold_feature_weight_array[i, :] = attr_weight
            # Attribute indices ordered from largest to smallest weight.
            fold_feature_array[i, :] = np.argsort(attr_weight)[::-1]

        ret_lda_feature_array.append(fold_feature_array)
        ret_lda_feature_weight.append(fold_feature_weight_array)

    logger.info("overall norm: " + str(norm_time))
    logger.info("overall lda: " + str(lda_time))
    ret_lda_feature_weight = np.array(ret_lda_feature_weight)
    ret_lda_feature_array = np.array(ret_lda_feature_array)
    logger.info(ret_lda_feature_array.shape)

    # Sum the per-file attribute weights.
    ret_lda_feature_weight = np.sum(ret_lda_feature_weight, axis=0)

    ret_lda_feature_array = ret_lda_feature_array.astype(int)
    start_time = time.time()
    lda_feature_array = fold_feature_combination_F_C_A(ret_lda_feature_array)
    combine_time = time.time() - start_time
    overall_time = overall_time + combine_time

    # Report the combination step on its own; the total follows below.
    logger.info("combine lda: " + str(combine_time))
    logger.info(lda_feature_array.shape)
    logger.info(lda_feature_array[0:7, 0:7])
    logger.info(ret_lda_feature_weight[0:7, 0:7])
    logger.info("pure lda projected feature generation overall time (sec)")
    logger.info(overall_time)
    save_obj([lda_feature_array, ret_lda_feature_weight], lda_proj_obj_file)
    logger.info("Object saved to " + lda_proj_obj_file)

    return lda_feature_array
Ejemplo n.º 11
0
def run_pca_proj_feature_main(data_folder,
                              class_column,
                              attr_num,
                              num_classes,
                              pca_proj_obj_file,
                              transpose=False,
                              logger=None):
    """Build PCA-based per-class attribute rankings from all training files.

    For each file in ``data_folder`` whose name contains ``"train"`` the data
    is reshaped to ``(rows, attr_num, attr_len)`` and the rows of every class
    are passed to ``run_pca_proj_feature_3D``. The per-file results are merged
    with ``fold_feature_combination_F_C_A`` and the merged array is written to
    ``pca_proj_obj_file`` with ``save_obj``.

    Args:
        data_folder: Folder prefix; file names are appended to it directly.
        class_column: Passed to ``readFile`` together with the file path.
        attr_num: Number of attributes the columns are split into.
        num_classes: Number of consecutive class labels processed per file,
            counted from the smallest label found in that file.
        pca_proj_obj_file: Destination passed to ``save_obj``.
        transpose: Unused; kept for interface compatibility.
        logger: Logger to use; ``init_logging('')`` is called when omitted.

    Returns:
        The merged array that is also saved. The previous code returned the
        un-merged array of the last processed file (and raised ``NameError``
        when no file matched), unlike its LDA counterpart.
    """
    if logger is None:
        logger = init_logging('')
    ret_pca_feature_array = []
    overall_time = 0

    file_list = listFiles(data_folder)

    for train_file in file_list:
        if "train" not in train_file:
            continue
        logger.info(train_file)

        pca_feature_array = []

        x_matrix, y_vector = readFile(data_folder + train_file, class_column)
        row_num, col_num = x_matrix.shape
        # Floor division keeps attr_len an integer on Python 2 and Python 3.
        attr_len = col_num // attr_num
        y_vector = y_vector.astype(int)
        start_class = min(y_vector)
        d3_data_matrix = x_matrix.reshape(row_num, attr_num, attr_len)

        for i in range(num_classes):
            class_label = i + start_class
            logger.info("calss label: " + str(i))
            class_index = np.where(y_vector == class_label)[0]
            class_data_matrix = d3_data_matrix[class_index, :, :]
            # Only the PCA call itself is timed.
            start_time = time.time()
            class_im_index, _ = run_pca_proj_feature_3D(class_data_matrix)
            overall_time = overall_time + time.time() - start_time
            logger.info(class_im_index.shape)
            pca_feature_array.append(class_im_index)
        pca_feature_array = np.array(pca_feature_array)
        logger.info(pca_feature_array.shape)
        logger.info("end of " + train_file)
        ret_pca_feature_array.append(pca_feature_array)

    logger.info("Final:")
    ret_pca_feature_array = np.array(ret_pca_feature_array)
    logger.info(ret_pca_feature_array.shape)
    start_time = time.time()
    feature_array = fold_feature_combination_F_C_A(ret_pca_feature_array)
    overall_time = overall_time + time.time() - start_time
    logger.info(feature_array.shape)
    logger.info(feature_array[0:3, 0:5])
    logger.info("Object saved to " + pca_proj_obj_file)
    logger.info("Overall time (sec): ")
    logger.info(str(overall_time))
    save_obj([feature_array], pca_proj_obj_file)

    return feature_array
Ejemplo n.º 12
0
def pv_cnn_generation_main(parameter_file,
                           file_keyword,
                           function_keyword="pv_cnn_generation"):
    """Train and evaluate a CNN/FCN on every matching train/test file pair.

    Settings are loaded with ``read_pv_cnn_generation(parameter_file,
    function_keyword)``. For each file in the data folder whose name contains
    ``file_keyword`` the matching test file (``'train'`` replaced by
    ``'test'`` in the name) is read, both matrices go through
    ``train_test_transpose`` and the model selected by the configured method
    is run with no feature projection (``feature_dict=None``, ``top_k=-1``).
    The fold results are saved with ``save_obj`` into
    ``<obj_folder><method>_result_folder``; each input file gets its own log.

    Args:
        parameter_file: First argument handed to ``read_pv_cnn_generation``.
        file_keyword: Substring a file name must contain to be processed.
        function_keyword: Second argument handed to
            ``read_pv_cnn_generation``; also part of the log file name.
    """
    settings = read_pv_cnn_generation(parameter_file, function_keyword)
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, obj_folder, method, log_folder, out_obj_folder,
     out_model_folder, cnn_setting_file) = settings

    # A single-argument print call produces the same output on Python 2 and
    # Python 3 (space separated values, as the old print statement did).
    print(" ".join(str(value) for value in settings))

    log_folder = init_folder(log_folder)
    out_obj_folder = init_folder(out_obj_folder)
    out_model_folder = init_folder(out_model_folder)

    # data_stru is built with the configured class column ...
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len,
                                 class_column)

    # ... while the files themselves are always read with column 0 as label.
    class_column = 0
    header = True
    delimiter = ' '

    cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
    cnn_setting.out_obj_folder = out_obj_folder
    cnn_setting.out_model_folder = out_model_folder
    cnn_setting.feature_method = 'save'
    cnn_setting.eval_method = 'f1'

    result_obj_folder = init_folder(obj_folder + method + "_result_folder")

    # NOTE: this counter was never incremented in the original code; it is
    # kept at 0 so that the generated model file names stay unchanged.
    file_count = 0
    loop_count = -1
    for train_file in list_files(data_folder):
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')
        log_file = log_folder + data_keyword + '_' + file_key + '_' + function_keyword + '_class' + str(
            class_id) + '_' + method + '.log'

        print("log file: " + log_file)

        # A distinct logger name per file keeps the log files separate.
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('method: ' + method)
        logger.info('============')

        test_file = train_file.replace('train', 'test')

        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, attr_num = train_test_file_reading_with_attrnum(
            data_folder + train_file, data_folder + test_file, class_column,
            delimiter, header)
        logger.info('train matrix shape: ' + str(train_x_matrix.shape))
        logger.info('train label shape: ' + str(train_y_vector.shape))
        logger.info('test matrix shape: ' + str(test_x_matrix.shape))
        logger.info('test label shape: ' + str(test_y_vector.shape))

        train_x_matrix = train_test_transpose(train_x_matrix, attr_num,
                                              attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num, attr_len,
                                             False)
        # No projected features here: every attribute is used.
        feature_dict = None
        top_k = -1
        model_save_file = file_key + '_count' + str(file_count) + '_' + method

        # Both runners take the same arguments and return the same 6-tuple.
        if method == 'fcn':
            run_fold = run_feature_projected_ijcnn_fcn
        else:
            run_fold = run_feature_projected_cnn
        fold_accuracy, fold_f1_value, fold_predict_y, fold_train_time, fold_test_time, fold_predict_matrix = run_fold(
            train_x_matrix, train_y_vector, test_x_matrix, test_y_vector,
            data_stru, cnn_setting, feature_dict, top_k, model_save_file,
            class_id, logger)

        result_file = (result_obj_folder + file_key + "_all_feature_" +
                       method + "_result.ckpt")
        logger.info("Fold F1: " + str(fold_f1_value))
        logger.info(method + ' fold training time (sec):' +
                    str(fold_train_time))
        logger.info(method + ' fold testing time (sec):' + str(fold_test_time))
        logger.info(method + ' fold accuracy: ' + str(fold_accuracy))
        logger.info("save obj to " + result_file)
        save_obj([
            fold_accuracy, fold_f1_value, fold_predict_y, fold_train_time,
            fold_test_time, fold_predict_matrix
        ], result_file)
Ejemplo n.º 13
0
def _find_class_model_file(model_list, fold_keyword, class_keyword):
    """Return the first ``.index`` model name matching both keywords.

    The ``.index`` marker is stripped from the returned name; an empty string
    is returned when nothing matches.
    """
    for model_file in model_list:
        if ".index" not in model_file:
            continue
        if fold_keyword not in model_file:
            continue
        if class_keyword not in model_file:
            continue
        return model_file.replace(".index", "")
    return ""


def run_load_predict_cnn(fold_keyword, model_saved_folder, feature_array, top_k, test_x_matrix, test_y_vector, data_stru, cnn_setting, group_all=True, save_obj_folder="./", logger=None):
    """Predict with one saved per-class model each and combine the results.

    For every class index ``c`` below ``data_stru.num_classes`` the saved
    model whose file name contains ``fold_keyword`` and ``"class<c>_"`` is
    loaded, the test matrix is restricted to the first ``top_k`` entries of
    ``feature_array[c]`` along its third axis, and column 1 of the predicted
    probabilities is kept. The predicted class of a sample is the class whose
    model produced the largest such value. The stacked probabilities are
    written with ``save_obj`` to
    ``<save_obj_folder><fold_keyword>_<top_k>.out``.

    Args:
        fold_keyword: Substring identifying the fold in model file names.
        model_saved_folder: Folder prefix containing the saved models.
        feature_array: Indexable by class; each entry lists feature indices.
        top_k: Number of leading feature indices to keep per class.
        test_x_matrix: 4-D test data; features are selected on axis 2.
        test_y_vector: True labels of the test samples.
        data_stru: Data description; ``num_classes`` is temporarily set to 2
            while models are loaded and is always restored afterwards.
        cnn_setting: Passed through to ``load_model``.
        group_all: Passed through to ``load_model``.
        save_obj_folder: Folder prefix for the saved probability matrix.
        logger: Logger to use; ``init_logging('')`` is called when omitted.

    Returns:
        Tuple ``(accuracy, f1_list, load_time, test_time)``.

    Raises:
        Exception: If a class has no matching model file, or if the two
            accuracy computations disagree.
    """
    if logger is None:
        logger = init_logging('')

    real_num_classes = data_stru.num_classes
    model_list = list_files(model_saved_folder)

    load_time = 0
    test_time = 0
    multi_predict = []
    # Models are loaded with the class count forced to 2. The caller's object
    # is restored in a finally block so a failure cannot leave it modified.
    data_stru.num_classes = 2
    try:
        for c in range(real_num_classes):
            logger.info("Class: " + str(c))
            class_keyword = "class" + str(c) + "_"
            found_model_file = _find_class_model_file(model_list, fold_keyword,
                                                      class_keyword)
            if found_model_file == "":
                raise Exception("Model for " + class_keyword + " and " + fold_keyword + " Not Found!!!")
            print (found_model_file)
            found_model_file = model_saved_folder + found_model_file

            class_feature = feature_array[c]
            class_feature = class_feature[0:top_k]
            # found_model_file already carries the folder prefix.
            logger.info("model file: " + str(found_model_file))
            logger.info("feature list: " + str(class_feature))

            temp_test_x_matrix = test_x_matrix[:, :, class_feature, :]
            logger.info("In run_load_predict_cnn: " + str(temp_test_x_matrix.shape))
            start_time = time.time()
            cnn_session, predict_y_proba, train_x_placeholder, keep_prob_placeholder = load_model(found_model_file, data_stru, cnn_setting, group_all, logger)
            load_time = load_time + time.time() - start_time
            try:
                start_time = time.time()
                cnn_predict_proba = load_model_predict(cnn_session, temp_test_x_matrix, predict_y_proba, train_x_placeholder, keep_prob_placeholder)
                test_time = test_time + time.time() - start_time
                multi_predict.append(cnn_predict_proba[:, 1])
            finally:
                # Release the session even when prediction fails.
                cnn_session.close()
    finally:
        data_stru.num_classes = real_num_classes

    multi_predict = np.array(multi_predict)
    # Rows are classes, columns are samples: pick the best class per sample.
    multi_predict_vector = np.argmax(multi_predict, axis=0)
    save_obj_file = save_obj_folder + fold_keyword + "_" + str(top_k) + ".out"
    save_obj([multi_predict], save_obj_file)
    logger.info("output obj saved to: " + save_obj_file)
    logger.info("multi predict matrix shape: " + str(multi_predict.shape))
    logger.info("multi predict vector shape: " + str(multi_predict_vector.shape))
    logger.info("test y vector: " + str(test_y_vector.shape))
    acc = accuracy_score(test_y_vector, multi_predict_vector)
    acc1, f1_list = multiple_f1_value_precision_recall_accuracy(multi_predict_vector, test_y_vector, logger)
    # Sanity check between the two accuracy computations; a tiny tolerance
    # avoids false alarms caused by floating point rounding.
    if abs(acc - acc1) > 1e-9:
        raise Exception("check accuracy")
    return acc, f1_list, load_time, test_time
Ejemplo n.º 14
0
def mask_evaluation_main(log_folder,
                         obj_folder,
                         out_obj_folder,
                         obj_keyword,
                         shap_k=-1,
                         shap_min=-1,
                         shap_max=-1,
                         func_key="arxiv_mask_gene"):
    """Turn saved per-class masks into per-class attribute rankings.

    Every file in ``obj_folder`` whose name contains ``obj_keyword``,
    ``"_class"`` and the secondary key (``"shapNum<k>_shapMin<a>_shapMax<b>"``
    when ``shap_k != -1``, otherwise ``".obj"``) is loaded with ``load_obj``.
    The absolute mask values are summed over the first axis and the attribute
    indices are ordered from largest to smallest sum. The rankings are stacked
    in ascending class order and written with ``save_obj`` to
    ``<out_obj_folder><obj_keyword>_min<lo>_max<hi>out.obj``.

    Args:
        log_folder: Folder prefix for logs; ``func_key`` is appended to it.
        obj_folder: Folder prefix containing the mask object files.
        out_obj_folder: Folder prefix for the output object.
        obj_keyword: Substring an object file name must contain.
        shap_k: With ``shap_min``/``shap_max`` forms the secondary file name
            filter; ``-1`` disables that filter.
        shap_min: See ``shap_k``.
        shap_max: See ``shap_k``.
        func_key: Suffix for the log folder and part of the log file name.

    Raises:
        ValueError: If a mask does not reduce to a 1-D vector after summing
            over its first axis.
    """
    log_folder = init_folder(log_folder + func_key)
    log_file = obj_keyword + "_allclass_" + func_key + ".log"
    logger = setup_logger(log_folder + log_file)
    logger.info("log folder: " + log_folder)
    logger.info("obj folder: " + obj_folder)
    obj_file_list = list_files(obj_folder)

    if shap_k != -1:
        obj_sec_key = "shapNum" + str(shap_k) + "_shapMin" + str(
            shap_min) + "_shapMax" + str(shap_max)
    else:
        obj_sec_key = ".obj"

    class_keys = []
    class_rankings = []  # (class id, ranking) pairs, in discovery order

    for obj_file in obj_file_list:
        if obj_keyword not in obj_file:
            continue
        if "_class" not in obj_file:
            continue
        if obj_sec_key not in obj_file:
            continue
        # The class id is taken from the trailing "class<id>.obj" name part.
        class_key = obj_file.split('_')[-1]
        class_key = class_key.replace('class', '').replace('.obj', '')
        logger.info("obj file:" + obj_file)
        logger.info("class key: " + class_key)
        class_key = int(class_key)
        # Recorded before the emptiness check, as before, so that classes
        # with an empty mask still count towards the min/max in the name.
        class_keys.append(class_key)
        shap_mask = load_obj(obj_folder + obj_file)[0]
        if len(shap_mask) == 0:
            continue
        shap_mask = numpy.squeeze(numpy.array(shap_mask))
        logger.info("shap_mask shape: " + str(shap_mask.shape))

        shap_mask = numpy.sum(numpy.absolute(shap_mask), axis=0)
        logger.info(shap_mask)
        if shap_mask.ndim != 1:
            raise ValueError("mask in " + obj_file +
                             " does not reduce to a 1-D importance vector")
        # Indices ordered by decreasing importance. Equivalent to the former
        # rank-then-argsort round trip, which returned the same permutation.
        shap_mask_index = numpy.argsort(shap_mask)[::-1]
        logger.info(shap_mask_index)
        logger.info("====")
        class_rankings.append((class_key, shap_mask_index))
        logger.info("shap_mask final shape: " + str(shap_mask.shape))

    # Stack in class order so rows do not depend on directory listing order.
    class_rankings.sort(key=lambda item: item[0])
    output_array = numpy.array([ranking for _, ranking in class_rankings])

    # The fixed start value 100 used to cap the reported minimum; take the
    # real extremes and keep the old placeholders only when nothing matched.
    min_class = min(class_keys) if class_keys else 100
    max_class = max(class_keys) if class_keys else -1
    obj_file = obj_keyword + "_min" + str(min_class) + "_max" + str(
        max_class) + "out.obj"
    logger.info("final output obj shape: " + str(output_array.shape))
    logger.info(output_array)
    save_obj([output_array], out_obj_folder + obj_file)
Ejemplo n.º 15
0
def run_channel_mask_main(data_folder,
                          log_folder,
                          obj_folder,
                          shap_k=10,
                          shap_min=2,
                          shap_max=3,
                          file_key="train_",
                          fun_key="_mask_gene"):
    """Run ``run_channel_mask`` once per class for every training file.

    For each file in ``data_folder`` whose name contains ``file_key`` the
    matching test file (``'train_'`` replaced by ``'test_'``) is read, both
    matrices are reshaped to ``(rows, attr_num, attr_len)`` and every class
    label, from the largest down to the smallest found in the training
    labels, is evaluated as a one-vs-rest problem. The mask returned for a
    class is saved with ``save_obj`` to
    ``<obj_folder><stem>_class<label>.obj``.

    Args:
        data_folder: Folder prefix containing the train/test files.
        log_folder: Folder prefix for the per-file log.
        obj_folder: Folder prefix for the saved mask objects.
        shap_k: Passed to ``run_channel_mask``; part of output names.
        shap_min: Passed to ``run_channel_mask``; part of output names.
        shap_max: Passed to ``run_channel_mask``; part of output names.
        file_key: Substring a file name must contain to be processed.
        fun_key: Inserted into log and object file names.
    """
    for train_file in list_files(data_folder):
        if file_key not in train_file:
            continue
        this_keyword = train_file.replace('.txt', '')
        out_obj_file = this_keyword + fun_key + "_shapNum" + str(
            shap_k) + "_shapMin" + str(shap_min) + "_shapMax" + str(shap_max)
        log_file = out_obj_file + "_all_class.log"
        logger = setup_logger(log_folder + log_file)
        # Single-argument print calls work on Python 2 and Python 3 alike.
        print("log file: " + log_folder + log_file)
        print("obj file: " + obj_folder + out_obj_file)
        logger.info(log_folder + log_file)

        test_file = train_file.replace('train_', 'test_')
        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, attr_num = train_test_file_reading_with_attrnum(
            data_folder + train_file, data_folder + test_file)

        train_row, train_col = train_x_matrix.shape
        test_row, _ = test_x_matrix.shape
        # Floor division keeps attr_len an integer on Python 2 and Python 3.
        attr_len = train_col // attr_num
        train_x_matrix = train_x_matrix.reshape(train_row, attr_num, attr_len)
        test_x_matrix = test_x_matrix.reshape(test_row, attr_num, attr_len)
        logger.info("train x matrix: " + str(train_x_matrix.shape))
        logger.info("test x matrix: " + str(test_x_matrix.shape))

        train_keep_len = matrix_keep_len_gene(train_x_matrix)
        test_keep_len = matrix_keep_len_gene(test_x_matrix)

        min_class = min(train_y_vector)
        max_class = max(train_y_vector) + 1
        logger.info("x matrix tran after shape: " + str(train_x_matrix.shape))
        # Walk the labels from highest to lowest. The former
        # "max_class - label - 1" mapping produced the wrong labels whenever
        # the smallest label was not 0.
        for label in range(max_class - 1, min_class - 1, -1):
            # One-vs-rest targets: 1 for the current class, 0 otherwise.
            label_train_y_vector = np.where(train_y_vector == label, 1, 0)
            label_test_y_vector = np.where(test_y_vector == label, 1, 0)
            label_train_y_matrix = y_vector_to_matrix(label_train_y_vector, 2)
            label_test_y_matrix = y_vector_to_matrix(label_test_y_vector, 2)
            logger.info("class: " + str(label))
            test_eval_value, mask_value = run_channel_mask(
                train_x_matrix, label_train_y_matrix, train_keep_len,
                test_x_matrix, label_test_y_matrix, test_keep_len, shap_k,
                shap_min, shap_max, logger)
            class_obj_file = (obj_folder + out_obj_file + "_class" +
                              str(label) + ".obj")
            logger.info("final for class " + str(label))
            logger.info("final acc: " + str(test_eval_value))
            logger.info("final mask: " + str(mask_value.shape))
            logger.info("out obj saved to " + class_obj_file)
            save_obj([mask_value], class_obj_file)
Ejemplo n.º 16
0
def global_classification_main(parameter_file, file_keyword):
    """Set up per-file logging for the global classification experiment.

    Settings are read with ``read_feature_classification(parameter_file,
    "global_classification")``. For every file in the data folder whose name
    contains ``file_keyword`` a dedicated logger is created and the CNN
    settings and method are logged.

    NOTE(review): an unconditional ``continue`` right after that logging
    skips the rest of the loop body (object lookup, data loading, CNN run and
    result saving), so none of it currently executes. That skipped code also
    refers to names not assigned anywhere in the visible function
    (``fold_f1_value_list``, ``save_obj_folder``). Confirm the intended
    behaviour before re-enabling it.

    Args:
        parameter_file: First argument handed to
            ``read_feature_classification``.
        file_keyword: Substring a file name must contain to be processed.
    """
    function_keyword = "global_classification"
    data_keyword, data_folder, attr_num, attr_len, num_classes, start_class, class_column, class_id, obj_folder, top_k, method, log_folder, cnn_obj_folder, cnn_temp_folder, cnn_setting_file = read_feature_classification(
        parameter_file, function_keyword)

    # data_stru is built with the configured class column ...
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len,
                                 class_column)

    file_list = list_files(data_folder)
    obj_list = list_files(obj_folder)
    # NOTE(review): file_count is never incremented in the visible code.
    file_count = 0

    # ... and class_column is then overwritten with 0 for file reading.
    class_column = 0
    header = True

    cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
    cnn_setting.save_obj_folder = cnn_obj_folder
    cnn_setting.temp_obj_folder = cnn_temp_folder
    cnn_setting.eval_method = 'f1'
    init_folder(cnn_obj_folder)
    init_folder(cnn_temp_folder)

    # NOTE(review): the accumulators below are not read anywhere in the
    # visible part of this function.
    all_result_matrix = np.zeros((10, num_classes))

    train_file_vector = []
    prediction_matrix = []
    f1_value_matrix = []
    accuracy_vector = []
    delimiter = ' '
    all_accuracy = 0
    all_train_time = 0
    all_test_time = 0
    loop_count = -1
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')
        log_file = log_folder + data_keyword + '_' + file_key + '_' + function_keyword + '_class' + str(
            class_id) + '_top' + str(top_k) + '_' + method + '.log'

        print "log file: " + log_file

        # A distinct logger name is used for every processed file.
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('cnn setting:\n ' + cnn_setting.to_string())
        logger.info('method: ' + method)
        logger.info('============')
        # NOTE(review): this unconditional continue makes everything below
        # it in the loop body unreachable - looks like a debugging leftover;
        # confirm before removing.
        continue
        # Pick the first object file whose name contains the file key.
        found_obj_file = ''
        for obj_file in obj_list:
            if file_key in obj_file:
                found_obj_file = obj_file
                break
        if found_obj_file == '':
            raise Exception('No obj file found')

        print found_obj_file
        print cnn_setting.save_obj_folder + file_key + "_" + method + "_projected_result.ckpt"
        #
        found_obj_file = obj_folder + found_obj_file

        feature_dict = load_obj(found_obj_file)[0]
        feature_dict = np.array(feature_dict)
        logger.info("feature array shape: " + str(feature_dict.shape))

        test_file = train_file.replace('train', 'test')

        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, attr_num = train_test_file_reading_with_attrnum(
            data_folder + train_file, data_folder + test_file, class_column,
            delimiter, header)

        if file_count == 0:
            logger.info('train matrix shape: ' + str(train_x_matrix.shape))
            logger.info('train label shape: ' + str(train_y_vector.shape))
            logger.info('test matrix shape: ' + str(test_x_matrix.shape))
            logger.info('test label shape: ' + str(test_y_vector.shape))

        train_x_matrix = train_test_transpose(train_x_matrix, attr_num,
                                              attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num, attr_len,
                                             False)
        # The model is configured for top_k attributes instead of attr_num.
        data_stru.attr_num = top_k
        fold_accuracy, fold_avg_eval, fold_predict_y, fold_train_time, fold_test_time, fold_predict_matrix = run_feature_projected_cnn(
            train_x_matrix, train_y_vector, test_x_matrix, test_y_vector,
            data_stru, cnn_setting, feature_dict, top_k,
            file_key + '_count' + str(file_count), class_id, logger)

        prediction_matrix.append(fold_predict_y)
        # NOTE(review): fold_f1_value_list is not assigned in the visible
        # code; this line would raise NameError if it were reached.
        logger.info("Fold F1: " + str(fold_f1_value_list))
        accuracy_vector.append(fold_accuracy)
        all_accuracy = all_accuracy + fold_accuracy
        all_train_time = all_train_time + fold_train_time
        all_test_time = all_test_time + fold_test_time
        logger.info(method + ' fold accuracy: ' + str(fold_accuracy))
        logger.info(method + ' fold training time (sec):' +
                    str(fold_train_time))
        logger.info(method + ' fold testing time (sec):' + str(fold_test_time))
        # NOTE(review): save_obj_folder is not assigned in the visible code
        # (only cnn_setting.save_obj_folder is); verify the intended folder.
        save_obj([
            fold_accuracy, fold_avg_eval, fold_predict_y, fold_train_time,
            fold_test_time, fold_predict_matrix
        ], save_obj_folder + file_key + "_" + method +
                 "_global_cnn_result.ckpt")