def eval_input_fn(params):
    """Input pipeline for evaluation: the full WN18 validation set.

    `params` is accepted for estimator compatibility but is not used here.
    """
    wn18 = Data(dataset='WN18', reverse=True)
    return wn18.get_inputs_and_targets()
class kddNShot:
    """Thin wrapper around Data exposing batch sampling and the
    normalized adjacency matrix of the loaded graph."""

    def __init__(self, path, batch_size):
        generator = Data(path, batch_size)
        self.data_generator = generator
        # Cache the adjacency matrix once at construction time.
        self.norm_adj = generator.get_adj_mat()

    def next(self):
        # Draw the next training batch from the underlying generator.
        return self.data_generator.sample()

    def get_test(self):
        # Draw a batch from the test split.
        return self.data_generator.sample('test')
def predict_input_fn(params):
    """Input pipeline for prediction.

    Batches the first 10 test examples using the per-shard batch size
    supplied in `params`.
    """
    batch_size = params["batch_size"]
    test_data = Data(dataset='WN18', reverse=True).get_inputs_and_targets()
    # Take out top 10 samples from test data to make the predictions.
    return test_data.take(10).batch(batch_size)
def train_input_fn(params):
    """train_input_fn defines the input pipeline used for training."""
    # Retrieves the batch size for the current shard. The # of shards is
    # computed according to the input pipeline deployment. See
    # `tf.contrib.tpu.RunConfig` for details.
    wn18 = Data(dataset='WN18', reverse=True)
    samples = wn18.get_inputs_and_targets(training=True)
    # Bounded shuffle buffer, repeated indefinitely for training.
    return samples.shuffle(buffer_size=1000).repeat()
def make_data(dataset_name):
    """Assemble train/test splits for the given dataset.

    Returns:
        (x_train, y_train, x_train_raw, x_test, y_test, x_test_raw)

    Raises:
        ValueError: if `dataset_name` is not supported.  (The original
        code silently fell through and crashed with a NameError on the
        return statement for any name other than 'sst'.)
    """
    if dataset_name != 'sst':
        raise ValueError('Unsupported dataset: {!r}'.format(dataset_name))
    dataset = Data(dataset_name)
    texts, labels = load_sst_train()
    # Lowercase the raw texts before integer-encoding them.
    x_train = dataset.lt_to_int([t.lower() for t in texts])
    y_train = labels
    x_train_raw = texts
    # The Data object's validation split serves as the test set here.
    x_test = dataset.x_val
    y_test = dataset.y_val
    x_test_raw = dataset.x_val_raw
    return x_train, y_train, x_train_raw, x_test, y_test, x_test_raw
def init(self):
    """Read GUI widget values, load and split the dataset, and build the MLP.

    On any invalid input the error is shown in the text browser and the
    method returns without touching the model state.
    """
    # Dataset / preprocessing selection from the combo boxes.
    dataset = self.comboBox.currentText()
    data_process = self.comboBox_2.currentText()
    test_size = float(self.doubleSpinBox.value())
    data, label = Data().load(dataset, data_process)
    if data is None:
        # Data.load signals failure by returning (None, error_info).
        self.textBrowser.setText("[Error]: input error" + str(label))
        return
    self.train_data, self.test_data, self.train_label, self.test_label = \
        train_test_split(data, label, test_size=test_size)
    n_inputs = len(data[0])
    # Unique class labels determine the output layer size.
    # (simplified from `list(set(i for i in label))` — identical result)
    self.label = list(set(label))
    self.n_outputs = len(self.label)
    n_hidden_layer = int(self.spinBox.value())
    hidden_layer = self.lineEdit.text().split()
    if len(hidden_layer) == 0:
        self.textBrowser.setText("[Error]: input error")
        return
    try:
        hidden_layer = list(map(int, hidden_layer[:n_hidden_layer]))
    except ValueError as e:
        # Narrowed from `except Exception`: int() on a non-numeric token
        # raises exactly ValueError.
        self.textBrowser.setText("[Error]: input error, " + str(e))
        return
    self.MLP = MLP_Model(n_inputs, hidden_layer, self.label)
    # Fixed user-facing typo: "sucesss" -> "success".
    self.textBrowser.setText("[Success]: init success")
    self.mode = 'init'
def evaluate(self, data, xs, selected):
    """Accuracy of the original model on the transformed inputs.

    Runs this object's `predict` on `(xs, selected)`, feeds the result
    through the original TextModel, and compares argmax classes against
    the dataset's stored validation predictions.
    """
    transformed = self.predict(xs, selected)
    reference = Data(data, False)
    original_model = TextModel(data)
    class_preds = np.argmax(original_model.predict(transformed), axis=-1)
    class_refs = np.argmax(reference.pred_val, axis=-1)
    return np.mean(class_preds == class_refs)
def L2X(args):
    """Run the L2X / Gumbel feature-selection pipeline.

    In training mode only fits the selector and returns (None, None);
    otherwise returns (scores, [elapsed_seconds]) for the validation set.
    When `args.train_score` is set, also scores and saves both splits.
    """
    from build_gumbel_selector import Gumbel_Selection, Gumbel_Selection_Char
    # Character-level selector for agccnn, word-level otherwise.
    if args.data == 'agccnn':
        gumbel_selector = Gumbel_Selection_Char(args.num_feats, args.data,
                                                args.train, args.original,
                                                args.mask)
    else:
        gumbel_selector = Gumbel_Selection(
            args.num_feats,
            args.data,
            args.train,
            args.original,
            args.mask,
        )
    if args.train:
        return None, None
    else:
        if args.train_score:
            # Score both splits and persist them for later use.
            dataset = Data(args.data, True)
            scores_val = gumbel_selector.predict(dataset.x_val)
            np.save(
                '{}/results/scores-val-{}-{}-original{}-mask{}.npy'.format(
                    args.data, args.method, args.num_feats, args.original,
                    args.mask), scores_val)
            scores_train = gumbel_selector.predict(dataset.x_train)
            np.save(
                '{}/results/scores-train-{}-{}-original{}-mask{}.npy'.format(
                    args.data, args.method, args.num_feats, args.original,
                    args.mask), scores_train)
        # Fresh Data object for timing the validation-set scoring alone.
        dataset = Data(args.data, False)
        st = time.time()
        scores = gumbel_selector.predict(dataset.x_val)
        print('Time spent is {}'.format(time.time() - st))
        return scores, [time.time() - st]
def prepare_data(self, data_fields, wv_size=600):
    """Load, preprocess and vectorize the test tweets.

    NOTE: uses a Python 2 `print` statement — this module is Python 2 only.

    data_fields: column names to read from the CSV.
    wv_size: dimensionality of the word2vec embedding.
    """
    test_data = Data(self.file_name, self.file_path)
    test_df = test_data.csv_df(data_fields)
    # make a copy of the original tweets for later use
    original_df = test_df.copy()
    # pre-process data (same as how we trained)
    test_data.pre_process(test_df)
    # then convert using word2vec
    model = test_data.build_wordvec(size=wv_size, verbose=False)
    # take a look at the max_len of testing, although we still have to use
    # max_len from train for the actual conversion below.
    max_len_test = test_data.max_len(test_df)
    data = test_data.convert2vec(test_df, self.max_len_train, model,
                                 name='test_' + self.file_name)
    test_data.save_vec(data, name='test_' + self.file_name)
    # Cache everything on the instance for later stages.
    self.data = data
    self.test_data = test_data
    self.test_df = test_df
    self.original_df = original_df
    print ">>>Done preparing data.<<<\n"
def gumbel(args):
    """Apply the trained Gumbel transformer for k = 1..num_feats features.

    NOTE: uses `xrange` — this module is Python 2 only.

    Returns (changed_xs, [elapsed_seconds]) where changed_xs is indexed as
    (sample, k, ...); in training mode returns (None, None).
    """
    from build_gumbel_transformer import Gumbel_Transform, Gumbel_Transform_Char
    # Character-level transformer for agccnn, word-level otherwise.
    if args.data == 'agccnn':
        gumbel_transform = Gumbel_Transform_Char(args.data, args.num_feats,
                                                 args.method, args.train,
                                                 args.original, args.mask)
    else:
        gumbel_transform = Gumbel_Transform(args.data, args.num_feats,
                                            args.max_words, args.method,
                                            args.train, args.original,
                                            args.mask)
    if not args.train:
        dataset = Data(args.data)
        # Importance scores produced by the chosen explanation method.
        if args.method == 'L2X':
            scores = np.load(
                '{}/results/scores-{}-{}-original{}-mask{}.npy'.format(
                    args.data, args.method, args.num_feats, args.original,
                    args.mask))
        elif args.method == 'leave_one_out':
            scores = np.load('{}/results/scores-{}.npy'.format(
                args.data, args.method))
        changed_xs = []
        st = time.time()
        for k in xrange(1, args.num_feats + 1):
            selected_index = np.argsort(
                scores, axis=-1)[:, -k:]  # indices of largest k score.
            # Build a one-hot mask of the selected positions per sample.
            selected = np.zeros(scores.shape)
            selected[np.expand_dims(np.arange(len(scores)), axis=-1),
                     selected_index] = 1.0
            changed_x = gumbel_transform.predict(dataset.x_val, selected)
            changed_xs.append(changed_x)
        changed_xs = np.array(changed_xs)
        # (k, sample, ...) -> (sample, k, ...)
        changed_xs = np.swapaxes(changed_xs, 0, 1)
        return changed_xs, [time.time() - st]
    return None, None
def train_bow(dataset_name):
    """Train a bag-of-words + logistic-regression baseline.

    Fits a CountVectorizer (20k features) and a multinomial logistic
    regression on the raw training texts, prints train/test accuracy,
    and pickles both artifacts into '<dataset_name>bow/'.
    """
    data_model = dataset_name + 'bow'
    dataset = Data(dataset_name)
    # Labels are stored one-hot; collapse to class indices.
    x_train, y_train = dataset.x_train_raw, np.argmax(dataset.y_train, axis=1)
    x_test, y_test = dataset.x_val_raw, np.argmax(dataset.y_val, axis=1)
    print('Fitting transform...')
    vectorizer = CountVectorizer(max_features=20000)
    x_train_bow = vectorizer.fit_transform(x_train)
    x_test_bow = vectorizer.transform(x_test)
    print('Fitting logistic regression...')
    clf = LogisticRegression(random_state=0,
                             solver='lbfgs',
                             multi_class='multinomial')
    clf.fit(x_train_bow, y_train)
    print('Making prediction...')
    acc_train = clf.score(x_train_bow, y_train)
    acc_test = clf.score(x_test_bow, y_test)
    print('The training accuracy is {}; the test accuracy is {}.'.format(
        acc_train, acc_test))
    print('Save model to pickle...')
    # Race-free replacement for `if data_model not in os.listdir('.')`.
    os.makedirs(data_model, exist_ok=True)
    with open('{}/vectorizer.pkl'.format(data_model), 'wb') as f:
        pkl.dump(vectorizer, f)
    with open('{}/clf.pkl'.format(data_model), 'wb') as f:
        pkl.dump(clf, f)
def create_original_predictions(args):
    # save original validation prediction probabilities.
    """Compute and cache the original model's prediction probabilities.

    Saves validation and training predictions under '<data>/data/',
    prints both accuracies, and (for non-character-level models) also
    saves the embedding matrix.
    """
    dataset = Data(args.data, True)
    model = TextModel(args.data, False)
    pred_val = model.predict(dataset.x_val, verbose=True)
    pred_train = model.predict(dataset.x_train, verbose=True)
    # Create '<data>/data' on first use.
    if 'data' not in os.listdir(args.data):
        os.mkdir('{}/data'.format(args.data))
    np.save('{}/data/pred_val.npy'.format(args.data), pred_val)
    np.save('{}/data/pred_train.npy'.format(args.data), pred_train)
    # Accuracy = agreement of argmax classes with the one-hot labels.
    acc_val = np.mean(
        np.argmax(pred_val, axis=1) == np.argmax(dataset.y_val, axis=1))
    acc_train = np.mean(
        np.argmax(pred_train, axis=1) == np.argmax(dataset.y_train, axis=1))
    print('The validation accuracy is {}.'.format(acc_val))
    print('The training accuracy is {}.'.format(acc_train))
    # agccnn is character-level — no word-embedding matrix to save.
    if args.data != 'agccnn':
        np.save('{}/data/embedding_matrix.npy'.format(args.data),
                model.emb_weights)
# NOTE(review): this fragment starts mid-context — `model`, `X_test`,
# `minibatch_size`, `Y_test` come from code not visible here.
yhat = model.predict(X_test, batch_size=minibatch_size, verbose=1)
# Threshold sigmoid outputs into the {-1, 1} label space used by Y_test.
yhat = [1 if x > 0.5 else -1 for x in yhat]
print('Test accuracy: ' + str(accuracy_score(Y_test, yhat)))


# MAIN
# .
# .
# .
# .
# .
# .
# .
# .
# Build the train/test split via the Data module's class-level API.
Data.load_data(momentum_window=30, X_window_average=30, newsTimeToMarket=0)
(X_train, Y_train), (X_test, Y_test) = Data.get_train_test_set()

# Convert everything to float32 tensors up front.
test_x = tf.convert_to_tensor(X_test, dtype=tf.float32)
train_x = tf.convert_to_tensor(X_train, dtype=tf.float32)
train_y = tf.convert_to_tensor(Y_train, dtype=tf.float32)
test_y = tf.convert_to_tensor(Y_test, dtype=tf.float32)

print('.........................')
print("number of training examples = " + str(train_x.shape[0]))
print("number of test examples = " + str(test_x.shape[0]))
print("X_train shape: " + str(train_x.shape))
print("Y_train shape: " + str(train_y.shape))
print("X_test shape: " + str(test_x.shape))
from load_data import Data
from model import Model
from train import Train

if __name__ == "__main__":
    # Prepare the dataset: load, augment, split, and report a summary.
    data = Data()
    data.load()
    data.data_augment()
    data.data_splitting()
    data.print()

    dataset = data.get_dataset()
    testset = data.get_testset()

    # Build a ResNet sized to the input shape, with 50 output classes.
    builder = Model(dataset[0].shape[1:], 50)
    network = builder.ResNet()
    network.summary()

    # train = Train(m, dataset, testset, 50, 32, 'adam', 'sparse_categorical_crossentropy')
    trainer = Train(network, dataset, testset, 100, 200, 'adam',
                    'categorical_crossentropy')
    trainer.training()
    trainer.evaluate()
    # tensorboard --logdir logs/scalars --port=7000
# Propagate CLI options into the experiment config.
config.pdata = args.pdata
config.data_opt = args.data_opt
print(args.bk)
dataset = args.dataset
data_dir = "data/%s/" % dataset

torch.backends.cudnn.deterministic = True  # For reproducibility
seed = 20
np.random.seed(seed)
torch.manual_seed(seed)
# BUG FIX: `torch.cuda.is_available` is a function; the bare attribute is
# always truthy, so the original branch ran even on CPU-only hosts.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

d = Data(data_dir=data_dir,
         reverse=True,
         subset_percentage=args.pdata,
         data_opt=args.data_opt)
experiment = Experiment(num_iterations=args.num_iterations,
                        batch_size=args.batch_size,
                        learning_rate=args.lr,
                        decay_rate=args.dr,
                        ent_vec_dim=args.edim,
                        rel_vec_dim=args.rdim,
                        cuda=args.cuda,
                        input_dropout=args.input_dropout,
                        hidden_dropout1=args.hidden_dropout1,
                        hidden_dropout2=args.hidden_dropout2,
                        label_smoothing=args.label_smoothing,
                        bk=args.bk)
# Checkpoint path depends on whether background knowledge (bk) is used.
path = 'model_state.pts'
if args.bk:
    path = 'model_state_sym.pts'
def main(config: argparse.Namespace) -> None:
    """Run the configured experiments over the Yelp/Geneea data.

    Loads the experiment YAML, optionally dumps mutual-information stats
    per feature set, then trains/evaluates every configured task over
    successive dataset chunks, collecting results into a DataGraph.
    """
    with open(config.config_file, 'r') as cfg:
        experiments: dict = yaml.load(cfg)
    print('loading data')
    data = Data(config.yelp_file, config.geneea_file)
    data.print(f'Processing file {config.config_file}')
    print('generating samples')
    datasize: int = data.generate_sample(experiments['config']['chunks'],
                                         LikeTypeEnum.USEFUL)
    stats: DataGraph = DataGraph('', 'number of instances', 'percentage')
    # texts_tokenized = (self._tokenize(row.text) for index, row
    #                    in self.data.iterrows())
    # words_freqs = nltk.FreqDist(w.lower() for tokens in texts_tokenized
    #                             for w in tokens)
    #
    # # TODO statistics
    # # for x in all_words:
    # # print(all_words[x])
    #
    # # self.print('total number of words:', sum(all_words.values()))
    # # self.print('unique words:', len(all_words))
    # # self.print('words present only once:',
    # #            sum(c for c in all_words.values() if c == 1))
    # # all_words.plot(30)
    #
    # # only the right frequencies
    # self.gram_words = words_freqs.copy()
    # for w, count in words_freqs.items():
    #     if count > 200 or count == 20:  # TODO Measure
    #         del self.gram_words[w]
    #
    # self.gram_words = frozenset(self.gram_words.keys())

    # calculate mutual information of all features if wanted
    # and dump it into text files
    if experiments['config']['mi']:
        for x in FeatureSetEnum:
            # n-gram feature sets are skipped for MI computation.
            if x == FeatureSetEnum.BIGRAMS or \
               x == FeatureSetEnum.TRIGRAMS or \
               x == FeatureSetEnum.FOURGRAMS:
                continue
            if x == FeatureSetEnum.UNIGRAMS:  # TODO REMOVE
                continue
            # get data
            data.set_statfile(f'mi_{x}')
            data.print(f'Mutual Information of {x}.')
            train = data.get_feature_dict(SampleTypeEnum.TRAIN, {x})
            test = data.get_feature_dict(SampleTypeEnum.TEST, {x})
            instances = train + test
            # get matrix
            matrix_convertor = featurematrixconversion.Preprocessor({})
            vector_instances = matrix_convertor.process(
                instances, SampleTypeEnum.TRAIN)
            # calculate mutual info
            matrix_gen, labels_gen = zip(*vector_instances)
            matrix = sparse.vstack(matrix_gen)
            labels = list(labels_gen)
            mi = mutual_info_classif(matrix, labels)
            # dump data
            for f_name, f_mi in zip(matrix_convertor.all_fs, mi):
                data.print(f'{f_name} {f_mi}')
    data.set_statfile(f'statistics')
    first_run: bool = True
    # Iterate over dataset chunks until prepare_next_dataset() is exhausted.
    while True:
        train_size: int \
            = int(datasize - datasize / experiments['config']['chunks'])
        train_size_log: int = int(ceil(log2(train_size)) + 1)
        data.max_tfidf = experiments['config']['max_tfidf']
        data.max_ngrams = experiments['config']['max_ngrams']
        for ex in experiments['tasks']:
            # convert features to set:
            features: Set[FeatureSetEnum] \
                = {FeatureSetEnum[f] for f in ex['features']}
            train_set = data.get_feature_dict(SampleTypeEnum.TRAIN, features,
                                              ex['extra_data'])
            test_set = data.get_feature_dict(SampleTypeEnum.TEST, features,
                                             ex['extra_data'])
            if first_run:
                unique_features: set = set()
                for inst in train_set:
                    unique_features = unique_features.union(set(
                        inst[0].keys()))
                data.print(
                    f'Number of unique features for {ex["name"]}: {len(unique_features)}'
                )
                unique_features = set()
            # Learning curves: grow the training set in powers of two.
            l_curves = experiments['config']['l_curves']
            start_size: int = 1 if l_curves \
                else train_size_log-1
            for t_size in map(lambda x: min(2**x, train_size),
                              range(start_size, train_size_log)):
                if l_curves:
                    train_set_copy = train_set[:t_size]
                    test_set_copy = test_set[:]
                else:
                    train_set_copy = train_set
                    test_set_copy = test_set
                # preprocess data
                for pp in ex['preprocessing']:
                    prep: PreprocessorBase \
                        = getattr(preprocessors, pp).Preprocessor(ex['config'])
                    train_set_copy = prep.process(train_set_copy,
                                                  SampleTypeEnum.TRAIN)
                    test_set_copy = prep.process(test_set_copy,
                                                 SampleTypeEnum.TEST)
                if first_run and hasattr(train_set[0][0], 'keys'):
                    unique_features: set = set()
                    for inst in train_set:
                        unique_features = unique_features.union(
                            set(inst[0].keys()))
                    data.print(
                        f'Number of unique features after preprocessing for {ex["name"]}: {len(unique_features)}'
                    )
                    unique_features = set()
                cls: ClassifierBase \
                    = getattr(classifiers,
                              ex['classificator']).Classifier(ex['config'])
                cls.train(train_set_copy)
                evaluation: dict \
                    = compute_evaluation_scores(cls, test_set_copy,
                                                LikeTypeEnum.USEFUL)
                stats.add_points(len(train_set_copy), ex['name'], evaluation)
                if l_curves:
                    # Also record train-set performance for the curve.
                    evaluation: dict \
                        = compute_evaluation_scores(cls, train_set_copy,
                                                    LikeTypeEnum.USEFUL)
                    stats.add_points(len(train_set_copy),
                                     ex['name'] + '-train', evaluation)
        first_run = False
        if not data.prepare_next_dataset():
            break
    # aggregate results here
    for g in experiments['graphs']:
        stats.name = g['name']
        stats.set_view(g['data'])
        data.plot(stats)
# NOTE(review): this fragment starts mid-function; `rank`, `ranks`, `hits`
# and `logger` are defined in context not visible here.
# Record the 1-based rank and Hits@k indicators for one test triple.
ranks.append(rank + 1)
for hits_level in range(10):
    if rank <= hits_level:
        hits[hits_level].append(1.0)
    else:
        hits[hits_level].append(0.0)

logger.info('Hits @10: {0}'.format(np.mean(hits[9])))
logger.info('Hits @3: {0}'.format(np.mean(hits[2])))
logger.info('Hits @1: {0}'.format(np.mean(hits[0])))
logger.info('Mean rank: {0}'.format(np.mean(ranks)))
logger.info('Mean reciprocal rank: {0}'.format(
    np.mean(1. / np.array(ranks))))


if __name__ == '__main__':
    # Load data
    data = Data(dataset='WN18', reverse=True)
    # Initialise model
    hypER = HyperER(len(data.entities), len(data.relations))
    # Initialise trainer and run
    trainer = Train(hypER, data, num_epoch=100)
    trainer.train_and_eval()
    # trainer.evaluate()
    # trainer.test()
def __init__(self, path, batch_size):
    """Load the dataset at `path` and cache its adjacency matrix.

    path: dataset location consumed by Data.
    batch_size: batch size forwarded to the Data sampler.
    """
    self.data_generator = Data(path, batch_size)
    # Cache the (normalized) adjacency matrix once at construction.
    self.norm_adj = self.data_generator.get_adj_mat()
def __init__(self, data, train = False):
    """Build (and optionally train) the text-classification model.

    data: dataset identifier; only 'imdbcnn' is configured here.
    train: when True, compile the model for training and fit it on the
        dataset; otherwise build a frozen inference graph plus gradient /
        gradient*input tensors for explanation methods.
    """
    self.data = data
    if data in ['imdbcnn']:
        filters = 250
        hidden_dims = 250
        self.embedding_dims = 50
        self.maxlen = 400
        self.num_classes = 2
        # 20000 vocabulary words + 2 special tokens.
        self.num_words = 20002
        self.type = 'word'
        if not train:
            # Freeze the graph (inference mode for dropout/BN layers).
            K.set_learning_phase(0)
        X_ph = Input(shape=(self.maxlen,), dtype='int32')
        emb_layer = Embedding(self.num_words,
                              self.embedding_dims,
                              input_length=self.maxlen,
                              name = 'embedding_1')
        emb_out = emb_layer(X_ph)
        if train:
            preds = construct_original_network(emb_out, data)
        else:
            # For inference the network is split at the embedding output so
            # that gradients w.r.t. the embeddings can be computed.
            emb_ph = Input(shape=(self.maxlen,self.embedding_dims),
                           dtype='float32')
            preds = construct_original_network(emb_ph, data)
        if not train:
            model1 = Model(X_ph, emb_out)
            model2 = Model(emb_ph, preds)
            pred_out = model2(model1(X_ph))
            pred_model = Model(X_ph, pred_out)
            pred_model.compile(loss='categorical_crossentropy',
                               optimizer='adam',
                               metrics=['accuracy'])
            self.pred_model = pred_model
            # Per-class gradients of predictions w.r.t. the embeddings.
            grads = []
            for c in range(self.num_classes):
                grads.append(tf.gradients(preds[:,c], emb_ph))
            grads = tf.concat(grads, axis = 0)  # [num_classes, batchsize, maxlen, embedding_dims]
            # gradient * input contribution scores.
            approxs = grads * tf.expand_dims(emb_ph, 0)  # [num_classes, batchsize, maxlen, embedding_dims]
            self.sess = K.get_session()
            self.grads = grads
            self.approxs = approxs
            self.input_ph = X_ph
            self.emb_out = emb_out
            self.emb_ph = emb_ph
            weights_name = 'original.h5'  #[i for i in os.listdir('imdblstm/models/') if i.startswith('original')][0]
            model1.load_weights('{}/models/{}'.format(data, weights_name),
                                by_name=True)
            model2.load_weights('{}/models/{}'.format(data, weights_name),
                                by_name=True)
            print('Model constructed.')

            # For validating the data.
            # Zero the padding-token (index 0) embedding row.
            emb_weights = emb_layer.get_weights()
            emb_weights[0][0] = np.zeros(50)
            emb_layer.set_weights(emb_weights)
        else:
            pred_model = Model(X_ph, preds)
            pred_model.compile(loss='categorical_crossentropy',
                               optimizer='adam',
                               metrics=['accuracy'])
            self.pred_model = pred_model
            from load_data import Data
            dataset = Data(self.data)
            self.train(dataset)
            print('Training is done.')
def main(args):
    """Dispatch to the selected ML algorithm (perceptron / kNN / tree).

    NOTE(review): the `iris_error` / `a4a_error` counters below increment
    on *correct* predictions before being divided by the set size, yet are
    printed as "misclassification error" — confirm intended semantics.
    """
    a4a = Data("a4a", A4A_FEATURES)
    a4a_testing = Data("a4a.t", A4A_FEATURES)
    iris = Data("iris.scale", IRIS_FEATURES)
    iris_testing = Data("iris.t", IRIS_FEATURES)
    if not args.algorithm:
        raise AssertionError("Please specify which ML Algorithm you would like to use with -a. Exiting...")
    if args.algorithm == 'perceptron' or args.algorithm == 'Perceptron':
        # Can specify max_lrate and max_epochs with -l and -e
        if not args.lrate:
            max_lrate = 0.1
        else:
            max_lrate = args.lrate
        if not args.epochs:
            epochs = 1000
        else:
            epochs = args.epochs
        if args.verbose:
            print("Beginning perceptron categorization... \n")
        for lrate in np.arange(0, max_lrate, 0.001):
            # ------------------------------- IRIS ------------------------------- #
            init_w_iris = [1 for _ in range(IRIS_FEATURES)]
            if args.verbose:
                print("\nIRIS:\n")
            """ Perceptron makes two passes for multi-classification. First pass sets datapoints with labels 2 or 3 as -1, and classifies a point as either 1, or 2/3. Second pass distinguishes between 2 and 3 by comparing them alone. This is kind of messy tbh. """
            iris_y_second_pass = []
            iris_x_second_pass = []
            for i in range(len(iris.y)):
                if iris.y[i] == 1:
                    continue
                elif iris.y[i] == 2:
                    iris.y[i] = -1
                    iris_x_second_pass.append(iris.x[i])
                    iris_y_second_pass.append(-1)
                elif iris.y[i] == 3:
                    iris.y[i] = -1
                    iris_x_second_pass.append(iris.x[i])
                    iris_y_second_pass.append(1)
            # p2: class 1 vs {2, 3}; p3: class 2 vs class 3.
            p2 = Perceptron(iris_testing.x[0], init_w_iris, bias=1)
            p2.train_weights(iris.x, iris.y, lrate=lrate, epochs=epochs,
                             verbose=args.verbose)
            p3 = Perceptron(iris_testing.x[0], init_w_iris, bias=1)
            p3.train_weights(iris_x_second_pass, iris_y_second_pass,
                             lrate=lrate, epochs=epochs, verbose=args.verbose)
            iris_error = 0
            iris_start_time = time.time()
            for j in range(len(iris_testing.x)):
                p2.set_x(iris_testing.x[j])
                prediction = p2.predict()
                if args.verbose:
                    print(f"Prediction for {iris_testing.x[j]}: {prediction}. Recorded classification is {iris_testing.y[j]}")
                if prediction == 1 and iris_testing.y[j] == 1:
                    iris_error += 1
                elif prediction == -1 and (iris_testing.y[j] == 2 or iris_testing.y[j] == 3):
                    iris_error += 1
            for k in range(len(iris_testing.x)):
                if iris_testing.y[k] != 1:
                    p3.set_x(iris_testing.x[k])
                    prediction = p3.predict()
                    if args.verbose:
                        print(f"Prediction for {iris_testing.x[k]}: {prediction}. Recorded classification is {iris_testing.y[k]}")
                    if iris_testing.y[k] == 2 and prediction == -1:
                        iris_error += 1
                    elif iris_testing.y[k] == 3 and prediction == 1:
                        iris_error += 1
            iris_error = iris_error / ( len(iris_testing.y) + 10 )
            iris_total_time = time.time() - iris_start_time
            # ------------------------------- A4A -------------------------------- #
            init_w_a4a = [1 for _ in range(A4A_FEATURES)]
            if args.verbose:
                print("\nA4A:\n")
            p = Perceptron(a4a_testing.x[0], init_w_a4a, bias=1)
            p.train_weights(a4a.x, a4a.y, lrate=lrate, epochs=epochs,
                            verbose=args.verbose)
            a4a_error = 0
            a4a_start_time = time.time()
            for i in range(len(a4a_testing.x)):
                p.set_x(a4a_testing.x[i])
                prediction = p.predict()
                #if args.verbose:
                    #print(f"Prediction for {a4a_testing.x[i]}: {prediction}. Recorded classification is {a4a_testing.y[i]}")
                if prediction == a4a_testing.y[i]:
                    a4a_error += 1
            a4a_error = a4a_error / len(a4a_testing.y)
            a4a_total_time = time.time() - a4a_start_time
            if args.verbose:
                print(f"Iris misclassification error: {iris_error}\na4a misclassification error: {a4a_error}\n")
                print(f"Iris classification time: {iris_total_time}\na4a classification time: {a4a_total_time}")
    elif args.algorithm == 'kNN' or args.algorithm == 'knn':
        if args.verbose:
            print("Beginning k-Nearest Neighbors categorization... \n")
        # can specify k and distance with -k and -d
        if not args.k:
            max_k=25
        else:
            max_k = args.k
        if not args.distance:
            distance_metric = 'euclidean'
        else:
            distance_metric = args.distance
        for k in range(1, max_k):
            # ------------------------------- IRIS ------------------------------- #
            iris_knn = kNN(iris.x, iris.y)
            iris_error = 0
            iris_start_time = time.time()
            for i in range(len(iris_testing.x)):
                y = iris_knn.classify(new_x=iris_testing.x[i], k=k,
                                      distance_metric=distance_metric,
                                      verbose=args.verbose)
                if args.verbose:
                    print(f"Prediction for {iris_testing.x[i]}: {y}. Recorded classification is {iris_testing.y[i]}")
                if y == iris_testing.y[i]:
                    iris_error += 1
            iris_error = iris_error / len(iris_testing.y)
            iris_total_time = time.time() - iris_start_time
            # ------------------------------- A4A -------------------------------- #
            a4a_knn = kNN(a4a.x, a4a.y)
            a4a_error = 0
            a4a_start_time = time.time()
            for j in range(len(a4a_testing.x)):
                y = a4a_knn.classify(new_x=a4a_testing.x[j], k=k,
                                     distance_metric=distance_metric,
                                     verbose=args.verbose)
                if args.verbose:
                    print(f"Prediction for {a4a_testing.x[j]}: {y}. Recorded classification is {a4a_testing.y[j]}")
                if y == a4a_testing.y[j]:
                    a4a_error += 1
            a4a_error = a4a_error / len(a4a_testing.y)
            a4a_total_time = time.time() - a4a_start_time
            if args.verbose:
                print(f"Iris misclassification error: {iris_error}\na4a misclassification error: {a4a_error}\n")
                print(f"Iris classification time: {iris_total_time}\na4a classification time: {a4a_total_time}")
    elif args.algorithm == 'decision' or args.algorithm == 'tree' or args.algorithm == 'decision_tree':
        # ------------------------------- IRIS ------------------------------- #
        iris_dt = DecisionTree(iris.x, iris.y)
        #left_x, left_y, right_x, right_y = iris_dt.split(0, 0)
        # ------------------------------- A4A -------------------------------- #
        a4a_dt = DecisionTree(a4a.x, a4a.y)
        print("This part made optional, therefore not implemented for the sake of time. ")
    return
def main():
    """Parse CLI arguments, fix the random seeds, and launch ComConV
    training/evaluation on the chosen knowledge-graph dataset."""
    parser = argparse.ArgumentParser()
    # NOTE(review): argparse `type=bool` is a known pitfall — any
    # non-empty string (including "False") parses as True. Confirm these
    # flags are only ever set via their defaults.
    parser.add_argument('--model_name', type=str, default="ComConV",
                        help='ComConV')
    parser.add_argument(
        '--dataset',
        type=str,
        default="countries/countries_S3",
        help=
        'FB15k, FB15k-237, WN18, WN18RR, YAGO3-10, countries/countries_S1, ...'
    )
    parser.add_argument('--cuda', type=bool, default=False,
                        help='use cuda or not')
    parser.add_argument('--get_best_results', type=bool, default=True,
                        help='get best results or not')
    parser.add_argument('--get_complex_results', type=bool, default=False,
                        help='get complex results or not')
    parser.add_argument('--num_to_eval', type=int, default=5,
                        help='number to evaluate')
    # learning parameters
    parser.add_argument('--learning_rate', type=float, default=1e-1,
                        help='learning rate')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='batch size')
    parser.add_argument('--num_iterations', type=int, default=1500,
                        help='iterations number')
    parser.add_argument('--optimizer_method', type=str, default="RAdam",
                        help='optimizer method')
    parser.add_argument('--decay_rate', type=float, default=1.0,
                        help='decay rate')
    parser.add_argument('--label_smoothing', type=float, default=0.1,
                        help='label smoothing')
    # convolution parameters
    parser.add_argument('--ent_vec_dim', type=int, default=200,
                        help='entity vector dimension')
    parser.add_argument('--rel_vec_dim', type=int, default=200,
                        help='relation vector dimension')
    parser.add_argument('--input_dropout', type=float, default=0.2,
                        help='input dropout')
    parser.add_argument('--feature_map_dropout', type=float, default=0.2,
                        help='feature map dropout')
    parser.add_argument('--hidden_dropout', type=float, default=0.3,
                        help='hidden dropout')
    parser.add_argument('--filt_h', type=int, default=2,
                        help='filter height')
    parser.add_argument('--filt_w', type=int, default=5,
                        help='filter width')
    parser.add_argument('--in_channels', type=int, default=1,
                        help='in channels')
    parser.add_argument('--out_channels', type=int, default=36,
                        help='out channels')
    args = parser.parse_args()
    dataset = args.dataset
    data_dir = "data/%s/" % dataset
    print(args)
    # Fix the random seeds so every training run is reproducible.
    # (translated from the original Chinese comment)
    seed = 777
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    data = Data(data_dir=data_dir, reverse=True)
    run = RunModel(data,
                   modelname=args.model_name,
                   optimizer_method=args.optimizer_method,
                   num_iterations=args.num_iterations,
                   batch_size=args.batch_size,
                   learning_rate=args.learning_rate,
                   decay_rate=args.decay_rate,
                   ent_vec_dim=args.ent_vec_dim,
                   rel_vec_dim=args.rel_vec_dim,
                   cuda=args.cuda,
                   input_dropout=args.input_dropout,
                   hidden_dropout=args.hidden_dropout,
                   feature_map_dropout=args.feature_map_dropout,
                   in_channels=args.in_channels,
                   out_channels=args.out_channels,
                   filt_h=args.filt_h,
                   filt_w=args.filt_w,
                   label_smoothing=args.label_smoothing,
                   num_to_eval=args.num_to_eval,
                   get_best_results=args.get_best_results,
                   get_complex_results=args.get_complex_results,
                   regular_method="",
                   regular_rate=1e-4)
    run.train_and_eval()
# pkl_file = './datasets/dblp10000/VertexClustering.pkl' # attr1_file = './datasets/dblp10000/gender.txt' # # attr2_file = './datasets/dblp10000/prolific.txt' # attr3_file = './datasets/dblp10000/topic.txt' # edges_file = './datasets/dblp84170/edgelist_py.txt' # pkl_file = './datasets/dblp84170/VertexClustering.pkl' # attr1_file = './datasets/dblp84170/prolific.txt' # attr2_file = './datasets/dblp84170/topic.txt' # edges_file = './datasets/AmazonLarge/edgelist_py.txt' # pkl_file = './datasets/AmazonLarge/VertexClustering.pkl' # attr1_file = './datasets/AmazonLarge/avg_rating.txt' # attr2_file = './datasets/AmazonLarge/sales_rank.txt' data = Data() data.read_graph(edges_file) data.load_clusters(pkl_file) # data.detect_clusters(stru_method='lpa') # data.detect_clusters(stru_method='infomap') data.read_attr(attr1_file) data.read_attr(attr2_file) data.read_attr(attr3_file) coho = Cohomo(data) del data coho.init_attr_weight() coho.update_attr_weight()
# Precomputed feature tensors and labels.
data_file = file_path + 'data/sports-600.npy'
label_file = file_path + 'data/labels.npy'
data = np.load(data_file)
label = np.load(label_file)

# load original tweets
# ---------------------------------------------------------------------------------
sports_dic = {
    'basketball': 1,
    'hockey': 2,
    'baseball': 3,
    'tennis': 4,
    'volleyball': 5
}
sp_data = Data(sports_dic, file_path)
sp_df = sp_data.csv_df(['text'])  # load data
# Strip the sport hashtags themselves so they cannot leak the label.
rm_hashtags = ['#' + s for s in sports_dic.keys()]
sp_data.pre_process(sp_df, rm_list=rm_hashtags)  # pre-process data
sp_df.drop(['tokenized'], axis=1, inplace=True)
# ---------------------------------------------------------------------------------

# set up lstm structure
n_classes = 5
hm_epochs = 20
batch_size = 50
# NOTE(review): assumes `data` is a 3-D array (samples, chunks, chunk_size)
# — confirm against the saved .npy layout.
chunk_size = data.shape[2]
n_chunks = data.shape[1]
rnn_size = 300
# height x width
# Algo config num_folds = 7 # File config VERSION = 4 MODEL_NAME = 'lgbm' OUTPUT_FOLDER = 'model_outputs/{}_{}/'.format(MODEL_NAME, VERSION) OUTPUT_FILENAME = OUTPUT_FOLDER + MODEL_NAME if not os.path.exists(OUTPUT_FOLDER): os.makedirs(OUTPUT_FOLDER) if __name__ == "__main__": print('Loading Data...') df = Data().read_data() df_x = df.drop(['target'], axis=1) df_y = df[['target']] features = list(df_x.columns.values) X_train, X_test, Y_train, Y_test = train_test_split(df_x, df_y, shuffle=False, train_size=0.8) del df gc.collect() folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=47) oof = np.zeros(X_train.shape[0]) getVal = np.zeros(X_train.shape[0])
    type=float,
    default=0.5,
    help='Dropout rate (1 - keep probability).')
# NOTE(review): this fragment starts inside a parser.add_argument(...) call
# whose opening is not visible here.
args = parser.parse_args()
for arg in vars(args):
    print('{0} = {1}'.format(arg, getattr(args, arg)))
torch.manual_seed(args.seed)
# training on the first GPU if not on CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Training on device = {}'.format(device))
"""
===========================================================================
Loading data
===========================================================================
"""
data = Data(path=args.data_path, dataset=args.dataset, split=args.split)
print('Loaded {0} dataset with {1} nodes and {2} edges'.format(
    args.dataset, data.n_node, data.n_edge))
# Move features/labels to the training device once, up front.
feature = data.feature.to(device)
label = data.label.to(device)
# Index datasets + loaders for each split.
train = Dataset(data.idx_train)
val = Dataset(data.idx_val)
test = Dataset(data.idx_test)
train_loader = DataLoader(dataset=train, batch_size=args.batch_size)
val_loader = DataLoader(dataset=val, batch_size=args.batch_size)
test_loader = DataLoader(dataset=test, batch_size=args.batch_size)
sampler = Sampler(data.adj, args.aggregator)
"""
===========================================================================
Training
===========================================================================
# NOTE(review): this fragment starts mid-context — `numLSTMUnits` comes
# from a hyper-parameter search loop not visible here.
bestNumLSTMUnits = numLSTMUnits
print('BEST: ( num_lstm_units: ' + str(bestNumLSTMUnits) + ')')


# MAIN
# .
# .
# .
# .
# .
# .
# .
# .
# Build the train/test split via the Data module's class-level API.
Data.load_data(news_per_hour=10, momentum_window=30, newsTimeToMarket=20)
(X_train, Y_train), (X_test, Y_test) = Data.get_train_test_set()

# Convert everything to float32 tensors up front.
test_x = tf.convert_to_tensor(np.asarray(X_test), dtype=tf.float32)
train_x = tf.convert_to_tensor(np.asarray(X_train), dtype=tf.float32)
train_y = tf.convert_to_tensor(np.asarray(Y_train), dtype=tf.float32)
test_y = tf.convert_to_tensor(np.asarray(Y_test), dtype=tf.float32)

print('.........................')
print("number of training examples = " + str(train_x.shape[0]))
print("number of test examples = " + str(test_x.shape[0]))
print("X_train shape: " + str(train_x.shape))
print("Y_train shape: " + str(train_y.shape))
print("X_test shape: " + str(test_x.shape))
    '--dataset',
    type=str,
    default="FB15k-237",
    nargs="?",
    help='Which dataset to use: FB15k, FB15k-237, WN18 or WN18RR')
# NOTE(review): fragment starts inside a parser.add_argument(...) call and
# ends inside the Experiment(...) call — surrounding code not visible.
args = parser.parse_args()
model_name = args.algorithm
dataset = args.dataset
data_dir = "data/%s/" % dataset
torch.backends.cudnn.deterministic = True  # reproducibility
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
# NOTE(review): `torch.cuda.is_available` is not *called* here (missing
# parentheses), so this condition is always true — likely a bug.
if torch.cuda.is_available:
    torch.cuda.manual_seed_all(seed)
d = Data(data_dir=data_dir, reverse=True)
experiment = Experiment(model_name,
                        num_iterations=800,
                        batch_size=128,
                        learning_rate=0.001,
                        decay_rate=0.99,
                        ent_vec_dim=200,
                        rel_vec_dim=200,
                        cuda=True,
                        input_dropout=0.2,
                        hidden_dropout=0.3,
                        feature_map_dropout=0.2,
                        in_channels=1,
                        out_channels=32,
                        filt_h=1,
                        filt_w=9,
# CLI arguments for the p2v embedding experiment.
parser.add_argument('--model', type=str, default="p2v-l", nargs="?",
                    help='Which model to use: p2v-l or p2v-p')
parser.add_argument('--num_iters', type=int, default=100, nargs="?",
                    help='Number of iterations')
parser.add_argument('--lr', type=float, default=0.1, nargs="?",
                    help='Initial learning rate')
parser.add_argument('--dr', type=float, default=0.98, nargs="?",
                    help='Decay rate')
parser.add_argument('--batch_size', type=int, default=10000, nargs="?",
                    help='Batch size')
parser.add_argument('--num_neg', type=int, default=5, nargs="?",
                    help='Number of negative samples per each positive sample')
parser.add_argument('--dim', type=int, default=200, nargs="?",
                    help='Embeddings dimensionality')
parser.add_argument('--w_reg', type=float, default=0.5, nargs="?",
                    help='Regularization coefficient for W')
parser.add_argument('--c_reg', type=float, default=0.5, nargs="?",
                    help='Regularization coefficient for C')
# NOTE(review): argparse `type=bool` parses any non-empty string as True.
parser.add_argument('--cuda', type=bool, default=True, nargs="?",
                    help='Whether to use cuda (GPU) or not (CPU)')
args = parser.parse_args()
# Build the corpus with the requested windowing/subsampling options.
d = Data(data_dir="data/", fname=args.dataset,
         min_occurrences=args.min_occurrences, window_size=args.window_size,
         subsample=args.subsample, t=args.threshold, cutoff=args.cutoff)
experiment = Experiment(args.model, num_iterations=args.num_iters,
                        learning_rate=args.lr, batch_size=args.batch_size,
                        corrupt_size=args.num_neg, decay_rate=args.dr,
                        embeddings_dim=args.dim, w_reg=args.w_reg,
                        c_reg=args.c_reg, cuda=args.cuda)
experiment.train_and_eval()
help="Entity embedding dimensionality.") parser.add_argument("--rdim", type=int, default=200, nargs="?", help="Relation embedding dimensionality.") parser.add_argument("--cuda", type=bool, default=True, nargs="?", help="Whether to use cuda (GPU) or not (CPU).") parser.add_argument("--input_dropout", type=float, default=0.3, nargs="?", help="Input layer dropout.") parser.add_argument("--hidden_dropout1", type=float, default=0.4, nargs="?", help="Dropout after the first hidden layer.") parser.add_argument("--hidden_dropout2", type=float, default=0.5, nargs="?", help="Dropout after the second hidden layer.") parser.add_argument("--label_smoothing", type=float, default=0.1, nargs="?", help="Amount of label smoothing.") args = parser.parse_args() dataset = args.dataset data_dir = "data/%s/" % dataset torch.backends.cudnn.deterministic = True seed = 20 np.random.seed(seed) torch.manual_seed(seed) if torch.cuda.is_available: torch.cuda.manual_seed_all(seed) d = Data(data_dir=data_dir, reverse=False) experiment = Experiment(num_iterations=args.num_iterations, batch_size=args.batch_size, learning_rate=args.lr, decay_rate=args.dr, ent_vec_dim=args.edim, rel_vec_dim=args.rdim, cuda=args.cuda, input_dropout=args.input_dropout, hidden_dropout1=args.hidden_dropout1, hidden_dropout2=args.hidden_dropout2, label_smoothing=args.label_smoothing) experiment.train_and_eval()
# CLI arguments for the Shapley-based explanation runner.
parser.add_argument('--data', type = str, choices = ['imdbcnn'],
                    default = 'imdbcnn')
parser.add_argument('--num_neighbors', type = int, default = 4)
parser.add_argument('--train', action='store_true')
parser.add_argument('--original', action='store_true')
parser.add_argument('--max_order', type = int, default = 16)
args = parser.parse_args()
dict_a = vars(args)
if args.method == 'train':
    # Training mode: only fit the text model.
    model = TextModel(args.data, train = True)
else:
    print('Loading dataset...')
    dataset = Data(args.data)
    print('Creating model...')
    model = TextModel(args.data)
    # Make dataset/model available to downstream functions through args.
    dict_a.update({'dataset': dataset, 'model': model})
    # Ensure '<data>/results' exists before writing scores.
    if args.data not in os.listdir('./'):
        os.mkdir(args.data)
    if 'results' not in os.listdir('./{}'.format(args.data)):
        os.mkdir('{}/results'.format(args.data))
    if args.method in ['localshapley','connectedshapley']:
        dict_a.update({'regression': False})
        scores = lcshapley(args)