Example #1
def train_val_test_split(data_df, number_judges, train_ratio=0.8, val_ratio=0.1, verbose=0, toshuffle=True):
    starttime = time.time()
    sorted_all_data = data_df.sort_values(by='judge_embed_index')
    train_indexes = []
    val_indexes = []
    test_indexes = []
    currentiloc = 0
    for judge_index in range(number_judges):
        if verbose and judge_index%500 == 0:
            print(judge_index,time.time()-starttime)
        
        cases_of_this_judge = sorted_all_data.loc[sorted_all_data['judge_embed_index'] == judge_index]
        number_cases = cases_of_this_judge.shape[0]
        n_of_train = int(number_cases*train_ratio)
        n_of_val = int(number_cases*val_ratio)
        
        nextiloc = currentiloc+number_cases
        
        indexes = list(range(currentiloc, nextiloc))
        if toshuffle:
            shuffle(indexes)
        
        train_indexes += indexes[:n_of_train]
        val_indexes += indexes[n_of_train:n_of_train+n_of_val]
        test_indexes += indexes[n_of_train+n_of_val:]
        
        currentiloc = nextiloc
    # the collected indices are positions within the sorted frame, so index it positionally
    return skshuffle(sorted_all_data.iloc[train_indexes]), skshuffle(sorted_all_data.iloc[val_indexes]), skshuffle(sorted_all_data.iloc[test_indexes])
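A minimal usage sketch for the split above (the toy frame and the case_id column are illustrative only; it assumes the imports the snippet relies on: time, pandas, random.shuffle, and sklearn.utils.shuffle as skshuffle):

import time
import pandas as pd
from random import shuffle
from sklearn.utils import shuffle as skshuffle

# two judges with four cases each; 50% train, 25% val, 25% test per judge
df = pd.DataFrame({
    "judge_embed_index": [0, 0, 0, 0, 1, 1, 1, 1],
    "case_id": range(8),
})
train_df, val_df, test_df = train_val_test_split(df, number_judges=2,
                                                 train_ratio=0.5, val_ratio=0.25)
print(len(train_df), len(val_df), len(test_df))  # 4 2 2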
Example #2
def shuffle(X, y=None):
    if isinstance(X, dict):
        to_shuffle = list(X.values())
        if y is not None:
            to_shuffle.append(y)
        shuffled = skshuffle(*to_shuffle)
        if y is not None:
            return dict(zip(X.keys(), shuffled[:-1])), shuffled[-1]
        else:
            return dict(zip(X.keys(), shuffled))
    else:
        return skshuffle(X, y)
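A quick sketch of calling this wrapper (toy arrays, assuming skshuffle is sklearn.utils.shuffle). Because sklearn.utils.shuffle applies one common permutation to every argument, each array in the dict stays aligned with y:

import numpy as np
from sklearn.utils import shuffle as skshuffle

X = {"tokens": np.arange(5), "lengths": np.arange(5) * 10}
y = np.array([0, 1, 0, 1, 1])

X_shuf, y_shuf = shuffle(X, y)  # the wrapper defined above
print(X_shuf["tokens"], X_shuf["lengths"], y_shuf)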
Example #3
def sorted_batch_iter(data, batch_size, shuffle=False, random_state=123):
    batch = []
    sorted_data = sorted(data, key=lambda x: len(x[0]), reverse=True)

    for line in sorted_data:
        batch.append(line)
        if len(batch) == batch_size:
            batch = skshuffle(batch, random_state=random_state)
            yield tuple(list(x) for x in zip(*batch))
            # yield batch
            batch = []
    if batch:
        batch = skshuffle(batch, random_state=random_state)
        yield tuple(list(x) for x in zip(*batch))
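A toy call of sorted_batch_iter (illustrative data only). Rows are (token_list, label) pairs, grouped into batches by descending token length; each batch is shuffled internally with the fixed random_state, and zip(*batch) turns a batch of rows into one list per column:

from sklearn.utils import shuffle as skshuffle

data = [([1, 2, 3], 0), ([4], 1), ([5, 6], 0), ([7, 8, 9, 10], 1)]
for tokens, labels in sorted_batch_iter(data, batch_size=2):
    print(tokens, labels)  # two batches, each a (token_lists, labels) pair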
Example #4
def get_minibatches(X, mb_size, shuffle=True):
    """
    Generate minibatches from given dataset for training.

    Params:
    -------
    X: np.array of M x 3
        Contains the triplets from dataset. The entities and relations are
        translated to its unique indices.

    mb_size: int
        Size of each minibatch.

    shuffle: bool, default True
        Whether to shuffle the dataset before dividing it into minibatches.

    Returns:
    --------
    mb_iter: generator
        Example usage:
        --------------
        mb_iter = get_minibatches(X_train, mb_size)
        for X_mb in mb_iter:
            # do something with X_mb, the minibatch
    """
    X_shuff = np.copy(X)

    if shuffle:
        X_shuff = skshuffle(X_shuff)

    for i in range(0, X_shuff.shape[0], mb_size):
        yield X_shuff[i:i + mb_size]
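A concrete toy call (assuming the numpy and skshuffle imports the snippet relies on):

import numpy as np
from sklearn.utils import shuffle as skshuffle

X_train = np.arange(30).reshape(10, 3)  # 10 fake (head, relation, tail) triplets
for X_mb in get_minibatches(X_train, mb_size=4):
    print(X_mb.shape)  # (4, 3), (4, 3), (2, 3)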
Example #5
def get_data(path='../data', subset=True):

    # Read and merge the data
    df_fake = pd.read_csv(os.path.join(path, 'fake.csv'))
    df_real = pd.read_csv(os.path.join(path, 'real.csv'))
    df_fake['label'] = 1
    df_real['label'] = 0
    df_tot = pd.concat([df_fake, df_real])
    df_tot = skshuffle(df_tot, random_state=42)

    # Add text/title length
    df_tot['text_length'] = df_tot.apply(lambda x: len(x['text'])
                                         if not pd.isnull(x['text']) else -1,
                                         axis=1)
    df_tot['title_length'] = df_tot.apply(lambda x: len(x['title'])
                                          if not pd.isnull(x['title']) else -1,
                                          axis=1)

    if subset:
        # Subset the data
        df_tot = df_tot.dropna(subset=['text'])
        df_tot = df_tot[
            df_tot['text_length'] >= 10].copy()  # drop very short texts; -1 marked rows whose text was missing
        df_tot = df_tot[df_tot['language'] == 'english']

    return df_tot
Example #6
    def _evaluate(self, embeddings, labels):
        shuffles = []
        for _ in range(self.num_shuffle):
            shuffles.append(skshuffle(embeddings, labels))
        all_results = defaultdict(list)
        training_percents = [0.1, 0.3, 0.5, 0.7, 0.9]

        for training_percent in training_percents:
            for shuf in shuffles:
                training_size = int(training_percent * self.num_graphs)
                X, y = shuf
                X_train = X[:training_size, :]
                y_train = y[:training_size]

                X_test = X[training_size:, :]
                y_test = y[training_size:]
                # clf = SVC()
                # clf.fit(X_train, y_train)

                params = {"C": [1e-3, 1e-2, 1e-1, 1, 10]}
                svc = SVC()
                clf = GridSearchCV(svc, params)
                clf.fit(X_train, y_train)

                preds = clf.predict(X_test)
                accuracy = f1_score(y_test, preds, average="micro")
                all_results[training_percent].append(accuracy)

        result = dict((
            f"Accuracy {train_percent}",
            sum(all_results[train_percent]) / len(all_results[train_percent]),
        ) for train_percent in sorted(all_results.keys()))
        print("Test Acc: ", list(result.values())[-1])
        return result
Example #7
    def clustering_preprocess(self, nu, N, limits=(0, 100000)):
        """ Preprocess the samples before clustering
        
        Preprocesses the list of frequencies at which significant peaks in the 
        power spectrum were found. The binning factors are shuffled to prevent
        clustering along that axis (axis=1). 
        
        The binning factors are scaled to range between 0 and 1. 
        
        Parameters
        ----------
        nu : ndarray
            Frequency of peaks that satisfy the H0 test.
        N : ndarray
            Bin factors at which the significant peaks were selected.
        limits : list
            Lower and upper limits in nu to use for clustering. Samples
            beyond these limits are rejected.
        
        Returns
        -------
        X : ndarray
            Array of samples to be used by HDBscan 
            
        """

        nuidx = (limits[0] < nu) & (nu < limits[1])
        Nscaler = MinMaxScaler().fit(N.reshape(-1, 1))
        Ns = skshuffle(Nscaler.transform(N[nuidx].reshape(-1, 1))).flatten()
        return np.vstack((nu[nuidx], Ns)).T
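The same preprocessing can be sketched outside the class on toy arrays (values are illustrative only; MinMaxScaler comes from sklearn.preprocessing):

import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle as skshuffle

nu = np.array([50.0, 300.0, 1200.0, 4000.0])  # peak frequencies
N = np.array([1, 2, 4, 8])                    # bin factors
limits = (0, 100000)

nuidx = (limits[0] < nu) & (nu < limits[1])
Nscaler = MinMaxScaler().fit(N.reshape(-1, 1))  # scale bin factors to [0, 1]
Ns = skshuffle(Nscaler.transform(N[nuidx].reshape(-1, 1))).flatten()
X = np.vstack((nu[nuidx], Ns)).T                # (4, 2) samples for HDBSCAN
print(X.shape)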
Example #8
def compute_regrets(X, y, shuffle=True, random_state=42):
    regrets = defaultdict(list)
    timings = defaultdict(list)
    if shuffle:
        X, y = skshuffle(X, y, random_state=random_state)

    n_samples, n_features = X.shape
    n_iterations = n_samples - 1
    iterations = np.arange(n_iterations)
    n_classes = int(y.max() + 1)
    classes = np.arange(n_classes)
    classifiers = get_classifiers_online(n_classes)

    for clf_name, clf in classifiers:
        assert hasattr(clf, "partial_fit")
        logging.info("  using %s" % clf_name)
        for i in tqdm(range(1, n_iterations)):
            x_train = X[i - 1].reshape(1, n_features)
            y_train = np.array([y[i - 1]])
            x_test = X[i].reshape(1, n_features)
            y_test = np.array([y[i]])
            t1 = time()
            clf.partial_fit(x_train, y_train, classes)
            t2 = time()
            y_pred = clf.predict_proba(x_test)
            test_loss = log_loss_single(y_test, y_pred)
            regrets[clf_name].append(test_loss)
            timings[clf_name].append(t2 - t1)
        if hasattr(clf, "clear"):
            clf.clear()
        del clf
    return iterations, regrets, timings
Example #9
	def next(self):
		diff = self.batch_index + self.batch_size - self.lenght_data
		if(diff == self.batch_size):
			diff=0
			self.batch_index=0
		diff = max(diff,0)

		if(diff > 0):
			#print(self.data[self.batch_index: self.batch_index + self.batch_size])
			x = np.concatenate((self.data[self.batch_index: self.batch_index + self.batch_size], \
													self.data[0: diff]))
			if (self.labels is not None):
				y = np.concatenate((self.labels[self.batch_index: self.batch_index + self.batch_size], \
														self.labels[0:diff]))

			self.batch_index=diff
			if (self.shuffle):
				if (self.labels is None):
					np.random.shuffle(self.data)
				else:
					self.data, self.labels = skshuffle(self.data, self.labels)
		else:
			x = self.data[self.batch_index: self.batch_index + self.batch_size]
			if (self.labels is not None):
				y = self.labels[self.batch_index: self.batch_index + self.batch_size]

			self.batch_index+=self.batch_size
		if(self.labels is not None):
			return x,y
		return x
Example #10
    def __call__(self, names_and_labels, shuffle=False):
        batches = []
        ids_and_names = []
        batch_size = self.batch_size

        rows = Parallel(n_jobs=self.n_jobs)(
            delayed(_process_item)(self, name)
            for name, label in names_and_labels)

        names_and_labels = [
            v for (v, row) in zip(names_and_labels, rows) if row is not None
        ]
        for id, (name, label) in enumerate(names_and_labels):
            ids_and_names.append((id, name))
        data = np.vstack([r for r in rows if r is not None])

        if shuffle:
            from sklearn.utils import shuffle as skshuffle
            names_and_labels, ids_and_names, data = skshuffle(
                names_and_labels, ids_and_names, data)

        labels_sorted = sorted(set(p[1] for p in names_and_labels))
        labels = [
            labels_sorted.index(label) for name, label in names_and_labels
        ]
        ids = [id for (id, fname) in ids_and_names]
        data = self.preprocess_data(data)
        data_mean = data.mean(axis=0)

        for batch_start in range(0, len(names_and_labels), batch_size):
            batch = {'data': None, 'labels': []}
            batch_end = batch_start + batch_size

            batch['data'] = data[batch_start:batch_end, :].T
            batch['labels'] = labels[batch_start:batch_end]
            batch['ids'] = ids[batch_start:batch_end]
            batches.append(batch)
            self.dot()

        for i, batch in enumerate(batches):
            path = os.path.join(self.output_path, 'data_batch_%s' % (i + 1))
            with open(path, 'wb') as f:
                cPickle.dump(batch, f, -1)
                self.dot()

        batches_meta = {}
        batches_meta['label_names'] = labels_sorted
        batches_meta['pack_columns'] = ['name']
        batches_meta['packs'] = [(os.path.basename(name), )
                                 for id, name in ids_and_names]
        batches_meta['data_mean'] = data_mean.reshape(data_mean.shape[0], 1)
        batches_meta['num_vis'] = data.shape[0]
        batches_meta.update(self.more_meta)

        with open(os.path.join(self.output_path, 'batches.meta'), 'wb') as f:
            cPickle.dump(batches_meta, f, -1)
            self.dot()

        print
        print "Wrote to %s" % self.output_path
Example #11
def evaluate(user_df, features_matrix):
    features_matrix = features_matrix[user_df["user_id"].to_list(),]
    print(features_matrix.shape)
    nodesize = features_matrix.shape[0]
    label_matrix = user_df["age"]
    label_matrix = label_matrix.to_numpy()
    label_matrix = np.stack([label_matrix]).T - 1
    train_percent = 0.7

    random.seed(1)
    np.random.seed(1)
    res = []
    for i in range(4):
        t_1 = time.time()
        X, y = skshuffle(features_matrix, label_matrix)
        training_size = int(train_percent * nodesize)
        X_train = X[:training_size, :]
        y_train = y[:training_size, :]
        X_test = X[training_size:, :]
        y_test = y[training_size:, :]

        clf = LogisticRegression(random_state=0, solver="saga", multi_class="multinomial")
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        acc = (preds == y_test[:, 0]).sum() / len(y_test)
        res.append(acc)
        print(time.time() - t_1, "s")
    print("avg age acc:", sum(res) / len(res))
    print("min age acc:", min(res))
    print("max age acc:", max(res))
Example #12
    def _evaluate(self, embeddings, labels):
        shuffles = []
        for _ in range(self.num_shuffle):
            shuffles.append(skshuffle(embeddings, labels))
        all_results = defaultdict(list)
        training_percents = [0.1, 0.3, 0.5, 0.7, 0.9]
        for training_percent in training_percents:
            for shuf in shuffles:
                training_size = int(training_percent * self.num_graphs)
                X, y = shuf
                X_train = X[:training_size, :]
                y_train = y[:training_size]

                X_test = X[training_size:, :]
                y_test = y[training_size:]

                clf = SVC()
                clf.fit(X_train, y_train)

                preds = clf.predict(X_test)
                accuracy = accuracy_score(y_test, preds)
                all_results[training_percent].append(accuracy)

        return dict((
            f"Accuracy {train_percent}",
            sum(all_results[train_percent]) / len(all_results[train_percent]),
        ) for train_percent in sorted(all_results.keys()))
Example #13
    def train(self, method):
        trainRatio = np.arange(0.1, 1, 0.1)
        resultMicroMat, resultMacroMat = [], []
        for ratio in trainRatio:
            print('\r',
                  "Training with {}% labeled nodes...".format(int(100 *
                                                                  ratio)),
                  end='',
                  flush=True)
            resultMicro, resultMacro = [], []
            for _ in np.arange(10):  # Train 10 times.
                X, y = skshuffle(self.featureMat, self.labelsMat)
                trainSize = int(ratio * X.shape[0])
                X_train = X[:trainSize]
                y_train = y[:trainSize]
                X_test = X[trainSize:]
                y_test = y[trainSize:]
                clf = TopKRanker(LogisticRegression(solver='liblinear'))
                clf.fit(X_train, y_train)
                topKList = np.diff(y_test.tocsr().indptr)
                preds = clf.predict(X_test, topKList)
                resultMicro.append(f1_score(y_test, preds, average='micro'))
                resultMacro.append(f1_score(y_test, preds, average='macro'))
            resultMicroMat.append(resultMicro)
            resultMacroMat.append(resultMacro)

        np.save(
            '../data/{}/{}/results/resultsMicro_{}.npy'.format(
                self.graphName, method, self.name), np.array(resultMicroMat))
        np.save(
            '../data/{}/{}/results/resultsMacro_{}.npy'.format(
                self.graphName, method, self.name), np.array(resultMacroMat))
        return np.array(resultMicroMat), np.array(resultMacroMat)
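TopKRanker is not defined in these snippets. In DeepWalk/CogDL-style node-classification scripts it is usually a thin OneVsRestClassifier subclass that keeps, for each test sample, the k highest-probability labels; the sketch below shows that common pattern and is an assumption about the class used here, not a verified copy of it:

import numpy as np
from scipy import sparse as sp
from sklearn.multiclass import OneVsRestClassifier

class TopKRanker(OneVsRestClassifier):
    """Multi-label prediction that keeps the top-k labels per sample."""
    def predict(self, X, top_k_list):
        probs = np.asarray(super().predict_proba(X))
        all_labels = sp.lil_matrix(probs.shape)      # binary indicator output
        for i, k in enumerate(top_k_list):
            top_k = probs[i, :].argsort()[-k:]       # indices of the k largest probabilities
            for label in self.classes_[top_k].tolist():
                all_labels[i, label] = 1
        return all_labels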
Example #14
def main():
  group_edges_path = \
    '../data/blogcatelog/BlogCatalog-dataset/data/group-edges.csv'

  parser = ArgumentParser("Calculate BlogCatalog F1 score.")
  parser.add_argument("--emb_dir", default='../tf/deepwalk',
                      help="Path of directory containing embeddings.")
  parser.add_argument("--shuffles", default=2, type=int,
                      help='Number of shuffles.')
  args = parser.parse_args()

  # load and process embeddings
  embeds = np.load(args.emb_dir + '/id_emb.npy')
  id_map = {}
  for i in range(embeds.shape[0]):
    id_map[embeds[i][0]] = i
  features_matrix = embeds[[id_map[k] for k in range(NODE_NUM)]][:, 1:]

  # load and process labels
  labels = np.zeros((NODE_NUM, CLASS_NUM))
  with open(group_edges_path) as csvfile:
    lines = csv.reader(csvfile)
    for line in lines:
      node_id = int(line[0]) - 1
      group_id = int(line[1]) - 1
      labels[node_id][group_id] = 1

  # train and test args.shuffles times.
  shuffles = []
  for _ in range(args.shuffles):
    shuffles.append(skshuffle(features_matrix, labels))
  train_and_test(shuffles)
Example #15
    def _evaluate(self, features_matrix, label_matrix, num_shuffle):
        shuffles = []
        for _ in range(num_shuffle):
            shuffles.append(skshuffle(features_matrix, label_matrix))
        all_results = defaultdict(list)
        training_percents = [0.3, 0.5, 0.7, 0.9]

        for train_percent in training_percents:
            for shuf in shuffles:
                X, y = shuf

                training_size = int(train_percent * len(X))

                X_train = X[:training_size, :]
                y_train = y[:training_size, :]

                X_test = X[training_size:, :]
                y_test = y[training_size:, :]

                clf = TopKRanker(LogisticRegression())
                clf.fit(X_train, y_train)

                # find out how many labels should be predicted
                top_k_list = list(map(int, y_test.sum(axis=1).T.tolist()[0]))
                preds = clf.predict(X_test, top_k_list)
                result = f1_score(y_test, preds, average="micro")
                all_results[train_percent].append(result)

        return dict((
            f"Micro-F1 {train_percent}",
            sum(all_results[train_percent]) / len(all_results[train_percent]),
        ) for train_percent in sorted(all_results.keys()))
Example #16
    def __call__(self, names_and_labels, shuffle=False):
        batches = []
        ids_and_names = []
        batch_size = self.batch_size

        rows = Parallel(n_jobs=self.n_jobs)(
            delayed(_process_item)(self, name)
            for name, label in names_and_labels
            )

        names_and_labels = [v for (v, row) in zip(names_and_labels, rows)
                            if row is not None]
        for id, (name, label) in enumerate(names_and_labels):
            ids_and_names.append((id, name))
        data = np.vstack([r for r in rows if r is not None])

        if shuffle:
            from sklearn.utils import shuffle as skshuffle
            names_and_labels, ids_and_names, data = skshuffle(
                names_and_labels, ids_and_names, data)

        labels_sorted = sorted(set(p[1] for p in names_and_labels))
        labels = [labels_sorted.index(label)
                  for name, label in names_and_labels]
        ids = [id for (id, fname) in ids_and_names]
        data = self.preprocess_data(data)
        data_mean = data.mean(axis=0)

        for batch_start in range(0, len(names_and_labels), batch_size):
            batch = {'data': None, 'labels': []}
            batch_end = batch_start + batch_size

            batch['data'] = data[batch_start:batch_end, :].T
            batch['labels'] = labels[batch_start:batch_end]
            batch['ids'] = ids[batch_start:batch_end]
            batches.append(batch)
            self.dot()

        for i, batch in enumerate(batches):
            path = os.path.join(self.output_path, 'data_batch_%s' % (i + 1))
            with open(path, 'wb') as f:
                cPickle.dump(batch, f, -1)
                self.dot()

        batches_meta = {}
        batches_meta['label_names'] = labels_sorted
        batches_meta['pack_columns'] = ['name']
        batches_meta['packs'] = [
            (os.path.basename(name),) for id, name in ids_and_names]
        batches_meta['data_mean'] = data_mean.reshape(data_mean.shape[0], 1)
        batches_meta['num_vis'] = data.shape[0]
        batches_meta.update(self.more_meta)

        with open(os.path.join(self.output_path, 'batches.meta'), 'wb') as f:
            cPickle.dump(batches_meta, f, -1)
            self.dot()

        print
        print "Wrote to %s" % self.output_path
Example #17
    def shuffle(self):
        """ Shuffles the data and labels. """

        self.data, self.labels = skshuffle(self.data,
                                           self.labels,
                                           random_state=self.seed)

        self.Helpers.logger.info("Data shuffled")
Example #18
    def shuffle(self):
        """Summary

        Returns:
            TYPE: Description
        """
        idx = list(range(len(self)))
        reindex = skshuffle(idx)
        self.change_idx(reindex)
Example #19
    def prepare(self, P_, N_, shuffle=True, verbose=False, max_parallel_process=8):
        """Prepare the data for training.

        Parameters
        ----------
        P_ : array-like of shape = [n_positive_samples, height, width]
            The positive samples.
        N_ : array-like of shape = [n_negative_samples, height, width]
            The negative samples.
        shuffle : bool
            Whether to shuffle the data or not.
        """
        assert np.shape(P_)[1:3] == np.shape(N_)[1:3], "Window sizes mismatch."
        _, self.detectWndH, self.detectWndW = np.shape(P_)

        self.features_cnt, descriptions = \
            self.Haarlike.determineFeatures(self.detectWndW, self.detectWndH)
        self.features_descriptions = descriptions[::-1]

        if shuffle:
            # If P_ is a list, this is faster than
            # P_ = np.array(skshuffle(P_, random_state=1))
            P_ = skshuffle(np.array(P_), random_state=1)
            N_ = skshuffle(np.array(N_), random_state=1)

        if verbose: print('Preparing positive data.')
        P = self._translate(P_, verbose=verbose, max_parallel_process=max_parallel_process)
        if verbose: print('Preparing negative data.')
        N = self._translate(N_, verbose=verbose, max_parallel_process=max_parallel_process)

        divlineP = int(len(P)*self.validset_rate)
        divlineN = int(len(N)*self.validset_rate)

        validset_X = np.concatenate(( P[0:divlineP], N[0:divlineN] ))
        validset_y = np.concatenate(( np.ones(len(P[0:divlineP])), np.zeros(len(N[0:divlineN])) ))
        # validset_X, validset_y = skshuffle(validset_X, validset_y, random_state=1)

        P = P[divlineP:len(P)]
        N = N[divlineN:len(N)]

        self.P = P
        self.N = N
        self.validX = validset_X
        self.validy = validset_y
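As the comment inside prepare() notes, converting to an array first and then shuffling avoids rebuilding a shuffled Python list; with the fixed random_state the permutation is also reproducible, e.g. (toy windows, illustrative only):

import numpy as np
from sklearn.utils import shuffle as skshuffle

windows = [np.full((24, 24), i) for i in range(4)]  # toy detection windows
a = skshuffle(np.array(windows), random_state=1)
b = skshuffle(np.array(windows), random_state=1)
assert (a == b).all()  # same permutation every time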
Example #20
    def shuffle(self):
        """Summary

        Returns:
            TYPE: Description
        """
        idx = list(range(len(self)))
        reindex = skshuffle(idx)
        self.props = {key: val[reindex] for key, val in self.props.items()}

        return
Example #21
def get_minibatch(X, y, minibatch_size, shuffle=True):
    minibatches = []
    if shuffle:
        X, y = skshuffle(X, y)

    for i in range(0, X.shape[0], minibatch_size):
        X_mini = X[i:i + minibatch_size]
        y_mini = y[i:i + minibatch_size]

        minibatches.append((X_mini, y_mini))

    return minibatches
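A toy call (assuming the numpy and skshuffle imports the snippet relies on):

import numpy as np
from sklearn.utils import shuffle as skshuffle

X = np.arange(20).reshape(10, 2)
y = np.arange(10)
for X_mini, y_mini in get_minibatch(X, y, minibatch_size=4):
    print(X_mini.shape, y_mini.shape)  # (4, 2) (4,), (4, 2) (4,), (2, 2) (2,)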
Example #22
def evaluate():
    args = parse_args()
    features_matrix = load_embeddings(args.emb)
    print(features_matrix.shape)
    nodesize = features_matrix.shape[0]
    label_matrix = load_labels(args.label, nodesize)
    number_shuffles = args.shuffle

    shuffles = []
    for x in range(number_shuffles):
        shuffles.append(skshuffle(features_matrix, label_matrix))

    all_results = defaultdict(list)

    training_percents = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    for train_percent in training_percents:
        for shuf in shuffles:
            X, y = shuf
            training_size = int(train_percent * nodesize)

            X_train = X[:training_size, :]
            y_train = y[:training_size, :]

            X_test = X[training_size:, :]
            y_test = y[training_size:, :]

            clf = TopKRanker(LogisticRegression())
            clf.fit(X_train, y_train)

            # find out how many labels should be predicted
            top_k_list = list(map(int, y_test.sum(axis=1).T.tolist()[0]))
            preds = clf.predict(X_test, top_k_list)

            results = {}
            averages = ["micro", "macro", "samples", "weighted"]
            for average in averages:
                results[average] = f1_score(y_test, preds, average=average)

            all_results[train_percent].append(results)
    print('Results, using embeddings of dimensionality', X.shape[1])
    print('-------------------')
    print('Train percent:', 'average f1-score')
    for train_percent in sorted(all_results.keys()):
        av = 0
        stder = np.ones(number_shuffles)
        i = 0
        for x in all_results[train_percent]:
            stder[i] = x["micro"]
            i += 1
            av += x["micro"]
        av /= number_shuffles
        print(train_percent, ":", av)
Example #23
    def _evaluate(self, features_matrix, label_matrix, num_shuffle):
        # features_matrix, node2id = utils.load_embeddings(args.emb)
        # label_matrix = utils.load_labels(args.label, node2id, divi_str=" ")

        # shuffle, to create train/test groups
        shuffles = []
        for _ in range(num_shuffle):
            shuffles.append(skshuffle(features_matrix, label_matrix))

        # score each train/test group
        all_results_micro = defaultdict(list)
        all_results_macro = defaultdict(list)
        # training_percents = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        training_percents = [0.1, 0.3, 0.5, 0.7, 0.9]

        for train_percent in training_percents:
            for shuf in shuffles:
                X, y = shuf

                training_size = int(train_percent * self.num_nodes)

                X_train = X[:training_size, :]
                y_train = y[:training_size, :]

                X_test = X[training_size:, :]
                y_test = y[training_size:, :]

                clf = TopKRanker(LogisticRegression())
                clf.fit(X_train, y_train)

                # find out how many labels should be predicted
                top_k_list = list(map(int, y_test.sum(axis=1).T.tolist()[0]))
                preds = clf.predict(X_test, top_k_list)
                result = f1_score(y_test, preds, average="micro")
                all_results_micro[train_percent].append(result)

                result = f1_score(y_test, preds, average="macro")
                all_results_macro[train_percent].append(result)
            # print("micro", result)

        micro_ = dict((
            f"Micro-F1 {train_percent}",
            sum(all_results_micro[train_percent]) /
            len(all_results_micro[train_percent]),
        ) for train_percent in sorted(all_results_micro.keys()))
        macro_ = dict((
            f"Macro-F1 {train_percent}",
            sum(all_results_macro[train_percent]) /
            len(all_results_macro[train_percent]),
        ) for train_percent in sorted(all_results_macro.keys()))
        micro_.update(macro_)
        return micro_
Example #24
 def getTrainList(self, batch_size=1000, shuffle=True):
     n_batch = self.n_train // batch_size
     batch_list = []
     X_shuffle, Y_shuffle = self.X_train.copy(), self.Y_train.copy()
     if shuffle:
         X_shuffle, Y_shuffle = skshuffle(self.X_train, self.Y_train)
     for i in range(n_batch):
         X = X_shuffle[batch_size * i:batch_size * (i + 1)]
         Y = Y_shuffle[batch_size * i:batch_size * (i + 1)]
         X, Y = self.to_tensor(X, Y, self.device)
         batch_list.append((X, Y))
     return batch_list
Example #25
def get_minibatch(X, y, minibatch_size, shuffle=True):
    minibatches = []

    if shuffle:
        X, y = skshuffle(X, y)

    for i in range(0, X.shape[0], minibatch_size):
        X_mini = X[i:i + minibatch_size]
        y_mini = y[i:i + minibatch_size]

        minibatches.append((X_mini, y_mini))

    return minibatches
Example #26
def make_xgb(params, X_tr, y_tr, test, nr, rs=1):
    par = params.copy()
    par['seed'] = rs
    plst = list(par.items())
    np.random.seed(seed=rs + 123)
    num_round = nr
    X_tr, y_tr = skshuffle(X_tr, y_tr, random_state=rs + 123)
    noise = np.random.normal(0, 0.5, len(y_tr))
    dtrain = xgb.DMatrix(X_tr, label=y_tr)  #+noise)#,missing=-1.0)
    dtest = xgb.DMatrix(test)  #,missing=-1.0)
    model = xgb.train(plst, dtrain, num_round)  #,obj=KappaRelaxedObjective())
    pred = model.predict(dtest)
    return pred
Example #27
def batch_iter(data, batch_size, shuffle=True):
    batch = []
    shuffled_data = np.copy(data)
    if shuffle:
        shuffled_data = skshuffle(shuffled_data)

    for line in shuffled_data:
        batch.append(line)
        if len(batch) == batch_size:
            yield tuple(list(x) for x in zip(*batch))
            # yield batch
            batch = []
    if batch:
        yield tuple(list(x) for x in zip(*batch))
Example #28
    def shuffle(self):
        """Summary

        Returns:
            TYPE: Description
        """
        idx = list(range(len(self)))
        reindex = skshuffle(idx)
        for key, val in self.props.items():
            if isinstance(val, list):
                self.props[key] = [val[i] for i in reindex]
            else:
                self.props[key] = val[reindex]

        return
Example #29
def batch_iter(data, batch_size, shuffle=False, random_state=123):
    batch = []
    shuffled_data = np.copy(data)

    if shuffle:
        shuffled_data = skshuffle(shuffled_data, random_state=random_state)

    for line in shuffled_data:
        batch.append(line)
        if len(batch) == batch_size:
            yield tuple(list(x) for x in zip(*batch))
            # yield batch
            batch = []
    if batch:
        yield tuple(list(x) for x in zip(*batch))
Example #30
def generator(samples,newsize,batch_size=32,max_count=200):
    zero_thresh = 0.3
    augment_prob = .3
    zero_reject_range = 0.2
    crop_on = 1
    mult_camera = False
    increment_zero_thresh = 0
    loop_count = 0
    while 1: # Loop forever so the generator never terminates
        print('  GETTING NEW SAMPLES ' )
        print('loop_count = ', loop_count)
        np.random.shuffle(samples)
        # create binned version of larger sample base with flatter distribution
        bin_samples = distribute_samples(samples,max_count=max_count)
        num_samples = len(bin_samples)
        loop_count += 1
        if loop_count > 2 and increment_zero_thresh:
            zero_thresh += 0.1  # experimented with this, but not used in the end
            print('zero_thresh = ',zero_thresh)
        for offset in range(0, num_samples, batch_size):
            batch_samples = bin_samples[offset:offset+batch_size]
            images = []
            angles = []
            for batch in batch_samples:
                if zero_thresh < 1.0: # if rejecting small angles
                    keep_search = 1
                    while keep_search: # search for sample meeting criteria
                        rand_indx = random.randint(0,len(bin_samples)-1)
                        line_sample = bin_samples[rand_indx]
                        if abs(float(line_sample[3])) < zero_reject_range:
                            if random.random() < zero_thresh:
                                keep_search = 0
                        else:
                            keep_search = 0
                else:
                    line_sample = batch

                image,angle,im_indx = process_image_pipeline(line_sample,
                                                        augment_prob,crop_on,
                                                        newsize,
                                                        mult_camera=mult_camera)
                images.append(image)
                angles.append(angle)


            X_train = np.array(images)
            y_train = np.array(angles)
            yield skshuffle(X_train, y_train)
Example #31
 def getTrainList(self, batch_size=1000, shuffle=True, aug=False):
     n_batch = self.n_train // batch_size
     batch_list = []
     X_shuffle, Y_shuffle = self.X_train.copy(), self.Y_train.copy()
     if shuffle:
         X_shuffle, Y_shuffle = skshuffle(self.X_train, self.Y_train)
     for i in range(n_batch):
         X = X_shuffle[batch_size * i:batch_size * (i + 1)]
         Y = Y_shuffle[batch_size * i:batch_size * (i + 1)]
         if aug:
             X = self.random_crop(X)
             X = self.horizontal_flip(X)
         X, Y = self.to_tensor(X, Y, self.device)
         batch_list.append((X, Y))
     return batch_list
Example #32
	def __init__(self, data, labels=None, shuffle=True, batch_size=32):
		if(labels is None):
			np.random.shuffle(data)
			self.data = data
			self.labels=None

		else:
			self.labels = labels
			self.data = data
			self.data, self.labels = skshuffle(self.data, self.labels)

		self.data_dim = len(data[0])
		self.lenght_data = len(data)
		self.step_count=0
		self.batch_size = batch_size
		self.batch_index=0
		self.shuffle = shuffle
Example #33
    def __call__(self, names_and_labels, shuffle=False):
      # names_and_labels: nparray of (full-file-paths, label) tuples
      batches = []
      ids_and_names = []
      batch_size = self.batch_size

      # rows is a list of serialised jpgs in names_and_labels order?
      rows = Parallel(n_jobs=self.n_jobs)(
        delayed(_process_item)(self, name)
        for name, label in names_and_labels
        )

      # why do we need this step? to get rid of cases where 
      # _process_item did not process a jpg? (could not find one, or
      # chose not to?)
      names_and_labels=[v for (v, row) in zip(names_and_labels, rows)
                        if row is not None]

      for id, (name, label) in enumerate(names_and_labels):
        ids_and_names.append((id, name))
      data = np.vstack([r for r in rows if r is not None])

      if shuffle:
        from sklearn.utils import shuffle as skshuffle
        names_and_labels, ids_and_names, data = skshuffle(
          names_and_labels, ids_and_names, data)

      labels_sorted = sorted(set(p[1] for p in names_and_labels))
      labels = [labels_sorted.index(label)
                for name, label in names_and_labels]
      ids = [id for (id, fname) in ids_and_names]

      data = self.preprocess_data(data) # does nothing

      for batch_start in range(0, len(names_and_labels), batch_size):
        batch = {'data': None, 'labels': [], 'metadata': []}
        batch_end = batch_start + batch_size

        batch['data'] = data[batch_start:batch_end, :].T
        batch['labels'] = labels[batch_start:batch_end]
        batch['ids'] = ids[batch_start:batch_end]
        batches.append(batch)
        self.dot()

      for i, batch in enumerate(batches):
        path = os.path.join(self.output_path, 'data_batch_%s' % (i + 1))
        with open(path, 'wb') as f:
          cPickle.dump(batch, f, -1)
          self.dot()

      batches_meta = {}
      batches_meta['label_names'] = labels_sorted
      batches_meta['metadata'] = dict(
        (id, {'name': name}) for (id, name) in ids_and_names)
      batches_meta['data_mean'] = data.mean(axis=0)
      batches_meta.update(self.more_meta)

      with open(os.path.join(self.output_path, 'batches.meta'), 'wb') as f:
        cPickle.dump(batches_meta, f, -1)
        self.dot()

      print
      print "Wrote to %s" % self.output_path
Example #34
def eval_blogcat(embeddings_file, labels_matrix=None, G=None,
                 verbose=1, normalize=1, training_percents=[0.1, 0.6, 0.9]):

    # 0. Files
    #embeddings_file = "/mnt/raid1/deepwalk/blogcatalog.vec"
    if labels_matrix is None and G is None:
        G, labels_matrix = load_blogcat()
    
    # 1. Load Embeddings
    model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

    labels = np.argwhere(labels_matrix)
    label_cnts = pd.Series(labels[:,1]).value_counts()

    if verbose > 1:
        print('\nLabel counts:')
        print(label_cnts)

    # delete the least frequent labels, which causes balancing problems
    labels_matrix = labels_matrix[:, :-2]

    # Map nodes to their features (note: assumes nodes are labeled as integers 1:N) 
    features_matrix = np.asarray([model[str(node)] for node in range(len(G))])

    if normalize:
        norms = np.linalg.norm(features_matrix, axis=1)
        if verbose:
            print(norms)
            print(norms.shape)

        assert norms.shape[0] == features_matrix.shape[0]
        for i in range(features_matrix.shape[0]):
            features_matrix[i,:] /= norms[i]

        norms = np.linalg.norm(features_matrix, axis=1)
        if verbose:
            print(norms)

    if verbose:
        print('-'*100)
        print(embeddings_file)
        print('features_matrix.shape = %s' % str(features_matrix.shape))
        print('labels_matrix.shape   = %s' % str(labels_matrix.shape))

    # 2. Shuffle, to create train/test groups
    shuffles = []
    number_shuffles = 1
    for x in range(number_shuffles):
        # if we just have one group, make the split the same every time
        if number_shuffles == 1:
            shuffles.append(skshuffle(features_matrix, labels_matrix, random_state=123))
        else:
            shuffles.append(skshuffle(features_matrix, labels_matrix))

    # 3. to score each train/test group
    all_results = defaultdict(list)

    # uncomment for all training percents
    #training_percents = np.asarray(range(1,10))*.1
    for train_percent in training_percents:
        # print('-'*100)
        # print('pct_train: %.2f' % train_percent)

        for shuf in shuffles:
            X, y = shuf
            training_size = int(train_percent * X.shape[0])

            X_train = X[:training_size, :]
            y_train = y[:training_size]
            X_test = X[training_size:, :]
            y_test = y[training_size:]

            clf = TopKRanker(LogisticRegression())
            clf.fit(X_train, y_train)

            # find out how many labels should be predicted
            #top_k_list = [len(l) for l in y_test]
            top_k_list = np.array(np.sum(y_test, axis=1).flatten()[0])[0].astype(np.int32)
            preds = clf.predict(X_test, top_k_list)

            if y_test.shape[1] != preds.shape[1]:
                raise Exception("imbalance of class dims")
                #continue
            
            results = OrderedDict()
            averages = ["micro", "macro", "samples", "weighted"]
            for average in averages:
                results[average] = f1_score(y_test, preds, average=average)

            all_results[train_percent].append(results)
            #break

    if verbose:
        print('-------------------')
        for train_percent in sorted(all_results.keys()):
            print('Train percent:', train_percent)
            for x in all_results[train_percent]:
                print(x)
            print('-------------------')
    return all_results
Example #35
                                      norm_only=False)

# 2. Load labels
mat = loadmat(matfile)
A = mat['network']
graph = sparse2graph(A)
labels_matrix = mat['group']

# Map nodes to their features (note:  assumes nodes are labeled as integers 1:N)
features_matrix = numpy.asarray([model[str(node)] for node in range(len(graph))])

# 2. Shuffle, to create train/test groups
shuffles = []
number_shuffles = 2
for x in range(number_shuffles):
  shuffles.append(skshuffle(features_matrix, labels_matrix))

# 3. to score each train/test group
all_results = defaultdict(list)

training_percents = [0.1, 0.5, 0.9]
# uncomment for all training percents
#training_percents = numpy.asarray(range(1,10))*.1
for train_percent in training_percents:
  for shuf in shuffles:

    X, y = shuf

    training_size = int(train_percent * X.shape[0])

    X_train = X[:training_size, :]