from skmultilearn.dataset import load_from_arff


def load_custom_dataset(dataset_name, path=None):
    if dataset_name == "20ng":
        arff_path = path if path else "./datasets/20NG-F.arff"
        n_labels = 20
        label_location = "start"
        arff_file_is_sparse = False
        x_mulan, y_mulan, feature_names, label_names = load_from_arff(
            arff_path,
            n_labels,
            label_location=label_location,
            load_sparse=arff_file_is_sparse,
            return_attribute_definitions=True)
        return x_mulan, y_mulan, feature_names, label_names
    elif dataset_name == "test":
        arff_path = path if path else "./datasets/test.arff"
        n_labels = 5
        label_location = "end"
        arff_file_is_sparse = False
        x, y, feature_names, label_names = load_from_arff(
            arff_path,
            n_labels,
            label_location=label_location,
            load_sparse=arff_file_is_sparse,
            return_attribute_definitions=True)
        return x, y, feature_names, label_names
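A brief usage sketch for `load_custom_dataset`; the inspection calls below are illustrative and assume the ARFF files exist where the default paths point:

# Hypothetical usage: load the 20 Newsgroups multi-label ARFF and inspect it.
X, y, feature_defs, label_defs = load_custom_dataset("20ng")
print(X.shape, y.shape)                      # scipy sparse matrices: (n_samples, n_features), (n_samples, 20)
print([name for name, _ in label_defs[:5]])  # first few label attribute names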
    def load_dataset(self, which_set):
        if self.verbose:
            sys.stdout.write("Reading data...")

        if which_set not in {'train', 'test', 'full'}:
            raise ValueError(
                'Unrecognized `which_set` value "%s". '
                'Valid values are ["train", "test", "full"].' % (which_set,))

        # datapath = os.path.join(self.datadir, which_set + '.pkl.gz')
        # dataset = pkl.load(gzip.open(datapath))

        datapath = os.path.join(self.datadir, which_set + '.arff')
        dataset = load_from_arff(
            datapath,
            # number of labels
            labelcount=self.n_labels,
            # MULAN format, labels at the end of rows in arff data
            endian='little',
            # bag of words
            input_feature_type='int',
            encode_nominal=False,
            # sometimes the sparse ARFF loader is borked, like in delicious,
            # scikit-multilearn converts the loaded data to sparse representations,
            # so disabling the liac-arff sparse loader
            load_sparse=False,
            # this decides whether to return attribute names or not, usually
            # you don't need this
            return_attribute_definitions=False)

        if self.verbose:
            sys.stdout.write("Done.\n")

        return dataset
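The `load_dataset` method above belongs to a dataset-wrapper class that is not shown; a minimal hypothetical host class, assuming only the attributes the method actually uses (`datadir`, `n_labels`, `verbose`), could look like the sketch below (the `labelcount`/`endian` keywords match an older scikit-multilearn signature):

import os
import sys

from skmultilearn.dataset import load_from_arff


class MultiLabelArffData:
    """Hypothetical host class for the load_dataset method shown above."""

    def __init__(self, datadir, n_labels, verbose=False):
        self.datadir = datadir      # directory holding train.arff / test.arff / full.arff
        self.n_labels = n_labels    # number of label columns per row
        self.verbose = verbose

    # load_dataset(self, which_set) as defined above would be attached here.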
Example #3
import pandas as pd

from skmultilearn.dataset import load_from_arff


def prepare_medical_dataset(filename):
    # Load the 45-label medical dataset along with its attribute definitions.
    data = load_from_arff(filename, label_count=45, load_sparse=False, return_attribute_definitions=True)
    cols_X = [i[0] for i in data[2]]
    cols_Y = [i[0] for i in data[3]]
    X_med_df = pd.DataFrame(data[0].todense(), columns=cols_X)
    y_med_df = pd.DataFrame(data[1].todense(), columns=cols_Y)
    df = pd.concat([X_med_df, y_med_df], axis=1)

    return df, cols_Y
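A usage sketch for `prepare_medical_dataset`; the file path is a placeholder for a local copy of the 45-label medical ARFF:

# Hypothetical call: build one dense DataFrame and split it back into X / y.
df, label_cols = prepare_medical_dataset("./datasets/medical.arff")
X_df = df.drop(columns=label_cols)
y_df = df[label_cols]
print(X_df.shape, y_df.shape)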
Example #4
import logging
from os import path

from skmultilearn import dataset


def load_dataset(dataset_name):
    logger = logging.getLogger()
    path_train = '../data/mulan/{}/{}-train.arff'.format(
        dataset_name, dataset_name)
    path_test = '../data/mulan/{}/{}-test.arff'.format(dataset_name,
                                                       dataset_name)
    if (not path.exists(path_train)) or (not path.exists(path_test)):
        logger.debug('dataset "{}" not found.'.format(dataset_name))
        return None, None, None, None
    # Note: the label count is hard-coded to 14 here (e.g. the MULAN yeast dataset).
    X_train, y_train = dataset.load_from_arff(path_train,
                                              labelcount=14,
                                              endian="little")
    logger.debug(X_train)
    X_test, y_test = dataset.load_from_arff(path_test,
                                            labelcount=14,
                                            endian="little")
    return (X_train.toarray(), y_train.toarray(),
            X_test.toarray(), y_test.toarray())
Example #5
def inner_load(path):
    return load_from_arff(
        path,
        # number of labels
        label_count=n_labels,
        # MULAN format, labels at the end of rows in arff data
        label_location=label_location,
        # bag of words
        input_feature_type=input_feature_type,
        encode_nominal=False,
        # sometimes the sparse ARFF loader is borked, like in delicious,
        # scikit-multilearn converts the loaded data to sparse representations,
        # so disabling the liac-arff sparse loader
        load_sparse=sparse,
        # this decides whether to return attribute names or not, usually
        # you don't need this
        return_attribute_definitions=False)
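`inner_load` closes over `n_labels`, `label_location`, `input_feature_type` and `sparse` from an enclosing scope that is not shown here; a hypothetical factory supplying those values might look like:

from skmultilearn.dataset import load_from_arff


def make_loader(n_labels, label_location="end", input_feature_type="int", sparse=False):
    # Hypothetical wrapper: binds the free variables used by inner_load above.
    def inner_load(path):
        return load_from_arff(
            path,
            label_count=n_labels,
            label_location=label_location,
            input_feature_type=input_feature_type,
            encode_nominal=False,
            load_sparse=sparse,
            return_attribute_definitions=False)

    return inner_load


# e.g. load_split = make_loader(n_labels=22); X, y = load_split("train.arff")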
import uuid

from skmultilearn.dataset import load_from_arff


def load_moa_stream(filepath, labels):
    print("Reading original arff from path")
    # Strip a trailing comma (if any) from each data row before handing the
    # file to the ARFF parser.
    with open(filepath) as arff_file:
        arff_file_content = [line.rstrip(",\n") + "\n" for line in arff_file]
    filename = "stream_{}".format(str(uuid.uuid1()))
    tmp_file = "/tmp/{}".format(filename)
    with open(tmp_file, "w") as opened:
        opened.write("".join(arff_file_content))
    del arff_file_content
    print("Reading original arff from tmp")
    arff_path = tmp_file
    label_location = "start"
    arff_file_is_sparse = False
    return load_from_arff(arff_path,
                          labels,
                          label_location=label_location,
                          load_sparse=arff_file_is_sparse,
                          return_attribute_definitions=True)
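A usage sketch for `load_moa_stream`; the path and label count are placeholders for a MOA-exported ARFF whose label attributes come first in each row:

# Hypothetical call: clean the stream file and load it with 10 leading labels.
X, y, feature_defs, label_defs = load_moa_stream("./streams/example_stream.arff", 10)
print(X.shape, y.shape)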
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer

# `dataset_load`, `preprocess_data` and `iterative_sampling` are project-local
# helpers; `dataset_load` is assumed to alias skmultilearn.dataset.


def load_txt_data(dataset_name, walks, folds, rng):
    svd = TruncatedSVD(n_components=300, n_iter=10, random_state=0)
    std_scaler = Normalizer()
    tfidf = TfidfTransformer()
    if dataset_name == "tmc2007":
        labelcount = open(
            "../multi_label_dataset/tmc2007/tmc2007.xml").read().count("class")
        features, Y = dataset_load.load_from_arff(
            "../multi_label_dataset/tmc2007/tmc2007.arff",
            labelcount=labelcount,
            endian="little",
            load_sparse=True)
        features, Y = preprocess_data(features, Y)
        features = tfidf.fit_transform(features)
        features = svd.fit_transform(features)
        features = std_scaler.fit_transform(features)

    elif "yahoo" in dataset_name:
        labelcount = open("../multi_label_dataset/yahoo/{}.xml".format(
            dataset_name.split("-")[1])).read().count("Label")
        features, Y = dataset_load.load_from_arff(
            "../multi_label_dataset/yahoo/{}.arff".format(
                dataset_name.split("-")[1].title()),
            labelcount=labelcount,
            endian="little",
            load_sparse=True)
        features, Y = preprocess_data(features, Y)
        features = tfidf.fit_transform(features)
        features = svd.fit_transform(features)
        features = std_scaler.fit_transform(features)
    else:
        raise ValueError("unknown dataset: {}".format(dataset_name))

    X = []
    y = []
    Y = Y.toarray()
    for I in range(len(walks)):
        X.append(walks[str(I)])
        y.append(Y[I])

    X = np.array(X)  # np.ones((len(docs), max_sentences, maxlen), dtype=np.int64) * -1
    y = np.array(y)

    # X = scaler.fit_transform(pca.fit_transform(X.toarray()))
    # X = select_norm_count(X, mindf=0.03, maxdf=0.8, normalise = True)
    # elif dataset_name == "delve":
    #	 X, Y = load_delve(rng, comb_type=comb_type, add_validation_set=add_validation_set, add_Text=add_Text,
    #					   is_cv=is_cv, d_size=d_size)

    index = np.arange(np.shape(X)[0])
    unlabeled = np.where(Y.sum(-1) < 0)[0]
    labeled_idx = np.setdiff1d(index, unlabeled)
    print("Features shape = {}".format(X.shape))
    print("Label shape = {}".format(Y.shape))
    # strip the "yahoo-" prefix when present (e.g. "yahoo-arts" -> "arts")
    dataset = dataset_name.split("-")[-1]
    folds, _, y, blacklist_samples = iterative_sampling(
        y, labeled_idx, folds, rng, dataset)
    sel_samples = np.setdiff1d(index, blacklist_samples)
    print(sel_samples.shape)
    return (X, y, folds, features)
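A hypothetical call to `load_txt_data`; `walks` is assumed to be a dict mapping node indices (as strings) to walk sequences produced earlier in the pipeline, and the fold count and seed are placeholders:

# Hypothetical invocation for one of the yahoo subsets.
rng = np.random.RandomState(42)
# `walks` comes from the walk-generation step (not shown here).
X, y, folds, features = load_txt_data("yahoo-arts", walks, folds=5, rng=rng)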
import numpy as np
from skmultilearn.dataset import load_from_arff
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# BinaryRelevanceNB, ClassifierChainsNB, nbayes, nbayes_prediction,
# construct_cv_folds, pre_process_continuous and hamming_score are
# project-local helpers assumed to be importable from the surrounding package.


def main(**kwargs):
    """
    main function for problem transformation algorithms
    :param kwargs: includes path_to_data,feature_type, num_labels, algorithm
    :return:
    """
    path = kwargs.pop('path_to_data')
    feature_type = kwargs.pop('feature_type')
    num_labels = kwargs.pop('num_labels')
    algorithm = kwargs.pop('algorithm')

    X, y = load_from_arff(path, label_count=num_labels, label_location="end", load_sparse=not feature_type)

    X = X.toarray()
    y = y.toarray()

    features_list, target_list = construct_cv_folds(5, X, y)
    accuracy = []
    ham_score = []
    precision = []
    recall = []
    f1_score = []

    if feature_type == 1:
        for i in range(len(X[0])):
            X[:, i] = pre_process_continuous(X[:, i], 5)

    for i in range(5):
        training_set_features = []
        training_set_target = []
        for j in range(5):
            if i != j:
                training_set_features = training_set_features + features_list[j]
                training_set_target = training_set_target + target_list[j]

        X_train = np.array(training_set_features)
        y_train = np.array(training_set_target)
        X_test = np.array(features_list[i])
        y_test = np.array(target_list[i])

        print(i)
        params = {'feature_type': feature_type, 'num_bins': 5}
        # Training and Testing
        # The classifier choice depends only on `algorithm`; the feature type
        # is passed to the naive Bayes wrappers via `params`.
        if algorithm == "BR":
            model = BinaryRelevanceNB(nbayes, nbayes_prediction, params)
        else:
            model = ClassifierChainsNB(nbayes, nbayes_prediction, params)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        # print(predictions)
        # print(y_test)

        acc = accuracy_score(y_test, predictions)
        ham = hamming_score(y_test, predictions)
        p, r, f1, _ = precision_recall_fscore_support(y_test, predictions, average='micro')
        accuracy.append(acc)
        ham_score.append(ham)
        precision.append(p)
        recall.append(r)
        f1_score.append(f1)
    print("Accuracy: " + str(sum(accuracy)/5))
    print("Hamming_Score: " + str(sum(ham_score)/5))
    print("Precision: " + str(sum(precision)/5))
    print("Recall: " + str(sum(recall)/5))
    print("F1_Score: " + str(sum(f1_score)/5))
Example #9
import os

import numpy as np

# `dataset_load`, `tfidf`, `preprocess_data`, `iterative_sampling` and
# `writelabels` are defined earlier in the original script.

datasets = [
    'yahoo-' + data.split('.')[0]
    for data in os.listdir('../multi_label_dataset/yahoo') if '.xml' in data
]
datasets += [
    data.split('.')[0] for data in os.listdir('../multi_label_dataset/tmc2007')
    if '.xml' in data
]

for dataset_name in datasets:
    if dataset_name == "tmc2007":
        labelcount = open(
            "../multi_label_dataset/tmc2007/tmc2007.xml").read().count("class")
        X, Y = dataset_load.load_from_arff(
            "../multi_label_dataset/tmc2007/tmc2007.arff",
            labelcount=labelcount,
            endian="little",
            load_sparse=True)

        X = tfidf.fit_transform(X)
        X, Y = preprocess_data(X, Y)
        labeled_idx = np.arange(Y.shape[0])
        selection, Y, blacklist_samples = iterative_sampling(
            Y.toarray(), labeled_idx, 5, dataset_name)
        writelabels(Y, selection, dataset_name)
        X = X.toarray()
        X[X <= 0.1] = 0

        S = np.zeros((X.shape[0], X.shape[0]))
        for k in range(X.shape[1]):
            connect_doc = np.where(X[:, k] > 0)[0]