def _commit(confirmed_, unconfirmed_):
    from utils import load_data
    from utils import save_data

    auditing = load_data(_params['filenames'][5])
    confirmed = load_data(_params['filenames'][6])
    if not confirmed:
        confirmed = []
    unconfirmed = load_data(_params['filenames'][7])
    if not unconfirmed:
        unconfirmed = []

    # move audited entries into the confirmed / unconfirmed buckets
    i = 0
    while i < len(auditing):
        if auditing[i]['matching'][0]['venue_id'] in confirmed_:
            auditing[i]['status'] = 'confirmed'
            a = auditing.pop(i)
            confirmed.append(a)
            i -= 1
        elif auditing[i]['matching'][0]['venue_id'] in unconfirmed_:
            auditing[i]['status'] = 'unconfirmed'
            a = auditing.pop(i)
            unconfirmed.append(a)
            i -= 1
        i += 1

    save_data(auditing, _params['filenames'][5])
    save_data(confirmed, _params['filenames'][6])
    save_data(unconfirmed, _params['filenames'][7])
def evaluate(self, goldp = None, silverp = None, gold_data = None, silver_data = None, print_score = True): """ * Compares two syllabified lists in string format (e.g. ser-uaes): gold = ground truth silver = as predicted by system * Both lists can be passed as lists (`gold_data`, `silver_data`) or can be loaded from files (`goldp`, `silverp`). * Will return the token-level accuracy and hyphenation accuracy of the silver predictions (will print these if `print_score` is True). """ if goldp: gold_data = utils.load_data(goldp) if silverp: silver_data = utils.load_data(silverp) _, gold_Y = self.vectorize(gold_data) _, silver_Y = self.vectorize(silver_data) token_acc, hyphen_acc = utils.metrics(utils.pred_to_classes(gold_Y), utils.pred_to_classes(silver_Y)) if print_score: print('\t- evaluation scores:') print('\t\t + token acc:', round(token_acc, 2)) print('\t\t + hyphen acc:', round(hyphen_acc, 2)) return token_acc, hyphen_acc
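# Hedged usage sketch (not part of the original module): illustrates the kind of
# token-level and hyphen-level accuracy that evaluate() above reports. The helpers
# below are hypothetical stand-ins for utils.metrics / utils.pred_to_classes,
# assuming hyphens mark syllable boundaries in tokens like 'ser-uaes'.
def _boundary_vector(token):
    # 1 if a hyphen follows this character position in the syllabified form
    vec = []
    parts = token.split('-')
    for part in parts[:-1]:
        vec += [0] * (len(part) - 1) + [1]
    vec += [0] * (len(parts[-1]) - 1)
    return vec

def _syllable_accuracy_sketch(gold_tokens, silver_tokens):
    # token-level: exact match of the full syllabification
    token_acc = sum(g == s for g, s in zip(gold_tokens, silver_tokens)) / float(len(gold_tokens))
    # hyphen-level: per-position boundary decisions across all tokens
    gold_b = [b for t in gold_tokens for b in _boundary_vector(t)]
    silver_b = [b for t in silver_tokens for b in _boundary_vector(t)]
    hyphen_acc = sum(g == s for g, s in zip(gold_b, silver_b)) / float(len(gold_b))
    return token_acc, hyphen_acc

# e.g. _syllable_accuracy_sketch(['ser-uaes', 'lief-de'], ['ser-uaes', 'liefde'])
# -> (0.5, 0.909...)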
def run():
    # load in members, orient by bioguide ID
    print("Loading current legislators...")
    current = load_data("legislators-current.yaml")
    current_bioguide = {}
    for m in current:
        if "bioguide" in m["id"]:
            current_bioguide[m["id"]["bioguide"]] = m

    # remove out-of-office people from current committee membership
    print("Sweeping committee membership...")
    membership_current = load_data("committee-membership-current.yaml")
    for committee_id in list(membership_current.keys()):
        # iterate over a copy so removing members does not skip entries
        for member in list(membership_current[committee_id]):
            if member["bioguide"] not in current_bioguide:
                print("\t[%s] Ding ding ding! (%s)" % (member["bioguide"], member["name"]))
                membership_current[committee_id].remove(member)
    save_data(membership_current, "committee-membership-current.yaml")

    # remove out-of-office people from social media info
    print("Sweeping social media accounts...")
    socialmedia_current = load_data("legislators-social-media.yaml")
    for member in list(socialmedia_current):
        if member["id"]["bioguide"] not in current_bioguide:
            print("\t[%s] Ding ding ding! (%s)" % (member["id"]["bioguide"], member["social"]))
            socialmedia_current.remove(member)
    save_data(socialmedia_current, "legislators-social-media.yaml")
def main(state_num):
    matches_filename = 'matches_%d' % state_num
    print 'Loading %s ...' % matches_filename
    matches = utils.load_data(matches_filename)

    matches_reduced_filename = 'matches_reduced'
    try:
        print "Loading matches_reduced ..."
        matches_reduced = utils.load_data(matches_reduced_filename)
    except:
        print "matches_reduced doesn't exist, creating new."
        matches_reduced = {}

    num_matches = len(matches.keys())
    for keyIdx, matchId in enumerate(matches.keys()):
        print "\rMatch %d out of %d [%0.1f%%]" % (keyIdx + 1, num_matches,
                                                  (keyIdx + 1) / float(num_matches) * 100),
        summoners = []
        num_summoners = len(matches[matchId]['participants'])
        for i in range(num_summoners):
            champLevel = matches[matchId]['participants'][i]['stats']['champLevel']
            summonerId = matches[matchId]['participantIdentities'][i]['player']['summonerId']
            winner = matches[matchId]['participants'][i]['stats']['winner']
            summoners += [{'champLevel': champLevel, 'summonerId': summonerId, 'winner': winner}]
        matches_reduced[matchId] = {'summoners': summoners}

    print "Saving %s ..." % matches_reduced_filename
    utils.save_data(matches_reduced, matches_reduced_filename)
    print "Done!"
def main(): parser = argparse.ArgumentParser(description='Generate input files for hunalign') parser.add_argument('ja', help='tokenized ja json') parser.add_argument('en', help='tokenized en json') parser.add_argument('prefix', help='output prefix') parser.add_argument('batchfile', help='output batchfile') parser.add_argument('--b', help='approximate batch size', type=int, default=5000) args = parser.parse_args() recipes_ja = utils.load_data(args.ja) recipes_en = utils.load_data(args.en) iteration = 1 langs = ('ja', 'en') num_lines = [0 for _ in langs] # keep track of the number of lines printed out output_filenames = [get_filename(args.prefix, iteration, lang) for lang in langs] output_files = [open(filename, 'w') for filename in output_filenames] batchfile_output = [(output_filenames[0], output_filenames[1], get_filename(args.prefix, iteration, 'align'))] for recipes in izip(recipes_ja, recipes_en): for index, lang in enumerate(langs): recipe = recipes[index] output_file = output_files[index] print_item_to_file(output_file, recipe['name']) print_items_to_file(output_file, recipe['description']) print_itemss_to_file(output_file, recipe['instructions']) print_items_to_file(output_file, recipe['advice']) print_items_to_file(output_file, recipe['history']) num_lines[index] += (1 + # name len(recipe['description']) + sum(map(lambda inst: len(inst), recipe['instructions'])) + len(recipe['advice']) + len(recipe['history'])) if any(map(lambda num_line: num_line > args.b, num_lines)): for output_file in output_files: output_file.close() # reset iteration += 1 num_lines = [0 for _ in langs] output_filenames = [get_filename(args.prefix, iteration, lang) for lang in langs] output_files = [open(filename, 'w') for filename in output_filenames] batchfile_output.append((output_filenames[0], output_filenames[1], get_filename(args.prefix, iteration, 'align'))) for output_file in output_files: output_file.close() with open(args.batchfile, 'w') as f: for output in batchfile_output: f.write('{0}\t{1}\t{2}\n'.format(*output))
def main(): train_url = "e:/data/comment_sentiment/train_set.csv" test_url = "e:/data/comment_sentiment/test_set.csv" x, y = utils.load_data(train_url) test_x, test_y = utils.load_data(test_url) lr = LogisticRegression() lr.train_model(x, y) pre_y = lr.predict(test_x) utils.show_result(test_y, pre_y, "logistic_comment")
def main(): train_url = "train_set.csv" test_url = "test_set.csv" x, y = utils.load_data(train_url) test_x, test_y = utils.load_data(test_url) gda = GDA() gda.train_model(x, y) pre_y = gda.predict(test_x) utils.show_result(test_y, pre_y, "gda_comment")
def main(): train_url = "e:/data/comment_sentiment/train_set.csv" test_url = "e:/data/comment_sentiment/test_set.csv" train_x, train_y = utils.load_data(train_url) test_x, test_y = utils.load_data(test_url) bayes = Bayes() bayes.train_model(train_x, train_y) pre_y = bayes.predict(test_x) utils.show_result(test_y, np.array([pre_y]).T, "bayes_comment")
def process_categories(): reviews = load_data("vegas_reviews.json") def pizza(line): return 'Pizza' in line['categories'] vegas_cats = set(food_lib.map_to_arg(load_data("food_businesses.json", pizza), 'business_id')) times = [] for r in reviews: if r['business_id'] in vegas_cats: times.append(r['date']) print(sorted(times))
def _load_dbs(): global user_dict global movie_dict try: movie_dict = load_data(movie_pkl_file_name) except IOError: print 'there is no pkl file named %s' % movie_pkl_file_name try: user_dict = load_data(user_pkl_file_name) except IOError: print 'there is no pkl file named %s' % user_pkl_file_name
def create_sorted_dict(self, offline=False):
    if offline:
        # Offline takes a lot of time; it seems unnecessary.
        # TODO: wrap in try-except, the files may not exist
        self.cf_simsorted_dict = load_data(PKL + 'cf_simsorted.pkl')
        self.cb_simsorted_dict = load_data(PKL + 'cb_simsorted.pkl')
    else:
        if self.cb_prox is None or self.cf_prox is None:
            self.create_proximity_matrices()
        self.cb_simsorted_dict = utils.sortSparseMatrix(self.cb_prox)
        print "cb dict has been calculated"
        self.cf_simsorted_dict = utils.sortSparseMatrix(self.cf_prox)
def main(): parser = argparse.ArgumentParser(description='Check whether ja-en recipes ' 'have same number of ingredients') parser.add_argument('ja', help='sorted ja recipes') parser.add_argument('en', help='sorted en recipes') args = parser.parse_args() recipes_ja = utils.load_data(args.ja) recipes_en = utils.load_data(args.en) for recipe_ja, recipe_en in izip(recipes_ja, recipes_en): assert(recipe_ja['id'] == recipe_en['id']) if len(recipe_ja['ingredients']) != len(recipe_en['ingredients']): print(recipe_ja['id'])
def main(args): '''Module main function''' global database global genetic_algorithm global joint_positions global goal_positions pygame.init() random.seed() database = utils.initialize_database(args, 'RobotTrainingData') database.set_objective_names(['Tiempo', r'Error en $\theta_1$', r'Error en $\theta_2$', r'Error en $\theta_3$', 'Energía']) problem = EV3Problem() generation = database.properties['highest_population'] population_size = database.properties['population_size'] genetic_algorithm = evolution.NSGA(problem, population_size) x_path = os.path.abspath(pkg_resources.resource_filename('resources.ev3', 'x_train.txt')) y_path = os.path.abspath(pkg_resources.resource_filename('resources.ev3', 'y_train.txt')) batch_start = (generation % 10) * N_GOALS joint_positions = np.loadtxt(x_path)[batch_start : batch_start + N_GOALS, :] goal_positions = np.loadtxt(y_path)[batch_start : batch_start + N_GOALS, :] if generation > 0: parents, children = utils.load_data(database) genetic_algorithm.set_population(parents) genetic_algorithm.set_children(children) for _ in range(args.iterations): generation += 1 print('Starting generation ' + str(generation)) genetic_algorithm.iterate() database.create_population() utils.save_data(genetic_algorithm, database) print('=' * (SCREEN_WIDTH - 1))
def compute_pval_rsa(seed):
    stim, voxels = load_data(n_samples, n_features, model=model, seed=seed,
                             heteroscedastic=heteroscedastic)
    # compute similarity
    stim_ = stim
    if stim.shape[1] == 1:
        stim_ = np.hstack((stim, -stim))
    stim_similarity = square_pdist(stim_)      # np.corrcoef(stim_)
    voxels_similarity = square_pdist(voxels)   # np.corrcoef(voxels)

    # indices to extract the strictly upper-triangular part of the (symmetric) similarity matrix
    lw_idx = np.triu_indices(n_samples, k=1)
    stim_vsim = stim_similarity[lw_idx]
    voxels_vsim = voxels_similarity[lw_idx]

    # compute the statistic
    # T = np.corrcoef(stim_vsim, voxels_vsim)[0, 1]
    T = spearmanr(voxels_vsim, stim_vsim)[0]
    T_perm = []
    for i in range(n_draws):
        # permute the labels
        perm = np.random.permutation(n_samples)
        # voxels_vsim_perm = np.corrcoef(voxels[perm])[lw_idx]
        voxels_vsim_perm = square_pdist(voxels[perm])[lw_idx]
        # compute the test statistic
        # T_perm.append(np.corrcoef(voxels_vsim_perm, stim_vsim)[0, 1])
        T_perm.append(spearmanr(voxels_vsim_perm, stim_vsim)[0])
    pval = 1 - percentileofscore(np.array(T_perm), T) / 100.
    return pval
def t1_3():
    data = utils.load_data('data2D.npy').astype("float32")
    for k in [3]:
        rvals = kmeans(data, 1e-3, k, epochs=1000)
        t_loss = rvals['training_loss']
        v_loss = rvals['validation_loss']
        mu = rvals['mu']
        plt.clf()
        fig = plt.figure(1, figsize=(16, 12))
        plt.plot(np.arange(len(t_loss)), t_loss)
        plt.savefig("t12_2_k%d.png" % k)

        t = classify(data, mu)
        colors = iter(cm.rainbow(np.linspace(0, 1, len(t))))
        plt.clf()
        #fig = plt.figure(1, figsize=(16,12))
        for i in range(len(t)):
            print 'plotting scatter...'
            print 'cluster x, y shape ', t[i][:, 0].shape, t[i][:, 1].shape
            s = plt.scatter(t[i][:, 0], t[i][:, 1], color=next(colors))
            #print "returned ", s
        plt.show()
        plt.savefig('t12_3_scatter_k%d.png' % k)
def main():
    if len(sys.argv) != 2:
        print 'Usage:\t./next_day_prediction.py TICKER_SYMBOL'
        print 'Ex:\t./next_day_prediction.py NFLX'
        sys.exit()
    k = 10
    D = utils.load_data('i30/stocks/' + sys.argv[1] + '.csv')  # load in past year's data
    train = D[-365:]
    X_train, y_train = utils.timestep_transform(train, k)
    model, scaler = utils.generate_model(X_train, y_train)
    pred_val = model.predict(scaler.transform([utils.day_transform(D, k)])).tolist()
    print 'Current day closing value:'
    print '\t', D[-1][-2]
    print 'Projected change in closing value:'
    print '\t', 100*(pred_val[0]-1)
    print 'Projected next day closing value:'
    print '\t', pred_val[0]*D[-1][-2]
def _load_statistics():
    global _statistics
    filename = get_path('datasets', 'statistics.json')
    _statistics = load_data(filename, verbose=False)
    if not _statistics or not isinstance(_statistics, dict):
        _statistics = {}
    print('')
def main(): parser = argparse.ArgumentParser(description='Tokenize all') parser.add_argument('recipes', help='recipes.json') parser.add_argument('--lang', choices=('en', 'ja')) args = parser.parse_args() recipes = utils.load_data(args.recipes) for recipe in recipes: name = word_tokenize(recipe['name'], args.lang) description = sent_word_tokenize(recipe['description'], args.lang) ingredients_name = map(lambda ing_name: word_tokenize(ing_name, args.lang), map(lambda ing: ing['name'], recipe['ingredients'])) ingredients_quantity = map(lambda ing_qt: word_tokenize(ing_qt, args.lang), map(lambda ing: ing['quantity'], recipe['ingredients'])) ingredients = map(lambda pair: {'name': pair[0], 'quantity': pair[1]}, zip(ingredients_name, ingredients_quantity)) instructions = map(lambda inst: sent_word_tokenize(inst, args.lang), recipe['instructions']) advice = sent_word_tokenize(recipe['advice'], args.lang) history = sent_word_tokenize(recipe['history'], args.lang) recipe = { 'id': recipe['id'], 'name': name, 'description': description, 'ingredients': ingredients, 'instructions': instructions, 'advice': advice, 'history': history, } print(json.dumps(recipe))
def next_day_prediction(ticker_symbol, training_days):
    k = 10
    D = utils.load_data('i30/stocks/' + ticker_symbol + '.csv')  # load in the past training_days days of data
    train = D[-1*training_days:]
    X_train, y_train = utils.timestep_transform(train, k)
    model, scaler = utils.generate_model(X_train, y_train)
    pred_val = model.predict(scaler.transform([utils.day_transform(D, k)])).tolist()
    curr_close = D[-1][-2]
    change = pred_val[0]-1
    next_close = pred_val[0]*curr_close
    '''
    print 'Current day closing value:'
    print '\t', curr_close
    print 'Projected change in closing value:'
    print '\t', 100*change
    print 'Projected next day closing value:'
    print '\t', next_close
    '''
    return (curr_close, change, next_close)
def train(in_file): xvals, yvals = utils.load_data(in_file) xvals, yvals = utils.randomize(xvals, yvals) network = build_network() model = tflearn.DNN(network) model.fit(xvals, yvals, n_epoch=200, validation_set=0.2) model.save('circle.tflearn')
def run(): options = utils.flags() debug = options.get('debug', False) filename = "legislators-current.yaml" args = utils.args() legislators = load_data(filename) if len(args) != 0: bioguides = args print("Fetching contact forms for %s..." % ', '.join(bioguides)) else: bioguides = [member['id']['bioguide'] for member in legislators] print("Fetching contact forms for all current members...") for legislator in legislators: bioguide = legislator['id']['bioguide'] if bioguide not in bioguides: continue if bioguide in SKIP_BIOGUIDES: continue if debug: print("Downloading form for %s" % bioguide, flush=True) try: steps = contact_steps_for(bioguide) except LegislatorNotFoundError as e: if debug: print("skipping, %s..." % e, flush=True) continue legislator['terms'][-1]['contact_form'] = steps['contact_form']['steps'][0]['visit'] print("Saving data to %s..." % filename) save_data(legislators, filename)
def main(args): path = utils.get_data_path(args.site[0]) urls = utils.load_urls(path) for count in range(2, len(urls) + 1): print '[learner] clustering with %d urls' % count # load data data = [utils.load_data(path, id) for id, url in enumerate(urls)] data = data[:count] # process data processor = processors.Processor(data) features = processor.extract() # clustering clusterer = clusterers.DBSCAN() labels = clusterer.cluster(features).labels_ # score clusters = processor.score(labels) with open(os.path.join(path, 'clusters.%03d.json' % count), 'w') as f: f.write(json.dumps(clusters, indent=2, ensure_ascii=False).encode('utf8'))
def stacker_data_v2(cutoff, num_least_correlated_cols): X, y_train = load_data() oof_predictions, lb_predictions, oof_ginis = load_predictions_with_cutoff(PREDICTION_PATH, cutoff) X_train, X_test = oof_predictions, lb_predictions new_cols = least_correlated_cols(X_train, num_least_correlated_cols) X_train, X_test = X_train[new_cols], X_test[new_cols] return X_train, y_train, X_test
def data_v1(): X, y_train = load_data() X = pd.get_dummies(X) is_train_obs = X.index.get_level_values('obs_type') == 'train' X_train, X_test = X[is_train_obs], X[~is_train_obs] return X_train, y_train, X_test
def test_cA(learning_rate=0.01, training_epochs=20, dataset='../datasets/mnist.pkl.gz', batch_size=10, output_folder='cA_plots', contraction_level=.1): datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size index = T.lscalar() x = T.matrix('x') if not os.path.isdir(output_folder): os.makedirs(output_folder) os.chdir(output_folder) rng = numpy.random.RandomState(123) ca = cA(numpy_rng=rng, input=x, n_visible=28 * 28, n_hidden=500, n_batchsize=batch_size) cost, updates = ca.get_cost_updates(contraction_level=contraction_level, learning_rate=learning_rate) train_ca = theano.function( [index], [T.mean(ca.L_rec), ca.L_jacob], updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size] } ) start_time = timeit.default_timer() for epoch in xrange(training_epochs): c = [] for batch_index in xrange(n_train_batches): c.append(train_ca(batch_index)) c_array = numpy.vstack(c) print 'Training epoch %d, reconstruction cost ' % epoch, numpy.mean( c_array[0]), ' jacobian norm ', numpy.mean(numpy.sqrt(c_array[1])) end_time = timeit.default_timer() training_time = (end_time - start_time) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((training_time) / 60.)) image = Image.fromarray(tile_raster_images( X=ca.W.get_value(borrow=True).T, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1))) image.save('cae_filters.png') os.chdir('../')
def load_data(random_state=1066, n=1000, max_phrase_length=100):
    data = utils.load_data(random_state=random_state, n=n,
                           max_phrase_length=max_phrase_length)
    X_train, y_train = data[0]
    X_valid, y_valid = data[1]
    X_test, y_test = data[2]
    X_train = X_train.reshape((-1, max_phrase_length, 67)).transpose(0, 2, 1)
    X_valid = X_valid.reshape((-1, max_phrase_length, 67)).transpose(0, 2, 1)
    X_test = X_test.reshape((-1, max_phrase_length, 67)).transpose(0, 2, 1)
    # Robert: what about reshaping this data for 1D convs?
    # vstack() instead of hstack() when creating X in utils?
    return dict(
        X_train=theano.shared(lasagne.utils.floatX(X_train)),
        y_train=T.cast(theano.shared(y_train), 'int32'),
        X_valid=theano.shared(lasagne.utils.floatX(X_valid)),
        y_valid=T.cast(theano.shared(y_valid), 'int32'),
        X_test=theano.shared(lasagne.utils.floatX(X_test)),
        y_test=T.cast(theano.shared(y_test), 'int32'),
        num_examples_train=X_train.shape[0],
        num_examples_valid=X_valid.shape[0],
        num_examples_test=X_test.shape[0],
        #input_height=X_train.shape[2],  # what's the equivalent in our vectors?
        #input_width=X_train.shape[3],
        output_dim=5,  # since there are five sentiment classes
    )
def _match_from_models(model_filename, search_func, get_entity_by_id_func,
                       filenames, threshold, prompt, verbose=True):
    from utils import load_data
    if verbose:
        print('Loading models...')
    models = load_data(model_filename)
    if verbose:
        print('Done.')
    models = _pre_process_models(models, filenames)
    _run_match(models, search_func, get_entity_by_id_func, filenames, threshold, prompt)
def data_v5(): X, y_train = load_data() X = pd.get_dummies(X) X.drop(['T2_V10', 'T2_V7', 'T1_V13', 'T1_V10'], axis=1, inplace=True) is_train_obs = X.index.get_level_values('obs_type') == 'train' X_train, X_test = X[is_train_obs], X[~is_train_obs] return X_train, y_train, X_test
def load_triangle(): """ Get the text into a 2D array of ints. """ triangle_string = utils.load_data(FILENAME) triangle_lines = [line for line in triangle_string.split('\n')] triangle = [[int(x) for x in line.split()] for line in triangle_lines] return triangle
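# Minimal sketch (hypothetical inline input, not read from FILENAME) showing the
# same parsing logic load_triangle() above applies to the loaded text:
_example = "75\n95 64\n17 47 82"
_parsed = [[int(x) for x in line.split()] for line in _example.split('\n')]
assert _parsed == [[75], [95, 64], [17, 47, 82]]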
def load_higgs_data(data_file, valid_size, normalize): # we get back a tuple of train data, test data, train weights, train labels, and test labels dataset = load_data(data_file, valid_size, encoding='integer', normalize=normalize) train_set_x, train_set_y = load_shared_dataset((dataset[0], dataset[3])) valid_set_x, valid_set_y = load_shared_dataset((dataset[1], dataset[4])) return [(train_set_x, train_set_y), (valid_set_x, valid_set_y)]
import project1 as p1 import utils import numpy as np #------------------------------------------------------------------------------- # Data loading. There is no need to edit code in this section. #------------------------------------------------------------------------------- train_data = utils.load_data('reviews_train.tsv') val_data = utils.load_data('reviews_val.tsv') test_data = utils.load_data('reviews_test.tsv') train_texts, train_labels = zip(*((sample['text'], sample['sentiment']) for sample in train_data)) val_texts, val_labels = zip(*((sample['text'], sample['sentiment']) for sample in val_data)) test_texts, test_labels = zip(*((sample['text'], sample['sentiment']) for sample in test_data)) dictionary = p1.bag_of_words(train_texts) train_bow_features = p1.extract_bow_feature_vectors(train_texts, dictionary) val_bow_features = p1.extract_bow_feature_vectors(val_texts, dictionary) test_bow_features = p1.extract_bow_feature_vectors(test_texts, dictionary) #------------------------------------------------------------------------------- # Problem 5 #------------------------------------------------------------------------------- toy_features, toy_labels = toy_data = utils.load_toy_data('toy_data.tsv') T = 10 L = 0.2
batch_size = 128 epochs = 10 img_size = 224 # input image dimensions channel_size = 1 label_size = 1 # label dimensions img_dims = (img_size, img_size, channel_size) label_dims = (label_size, label_size) filepath_labels = 'lol_labels.txt' filepath_data = 'lol_images.zip' data_size = 60000 # the data, shuffled and split between train and test sets (x_train, y_train), (x_test, y_test) = load_data() x_train = x_train.reshape(x_train.shape[0], img_size, img_size, channel_size) x_test = x_test.reshape(x_test.shape[0], img_size, img_size, channel_size) input_shape = (img_size, img_size, channel_size) x_train = x_train.astype('float32') x_test = x_test.astype('float32') x_train /= 255 x_test /= 255 print('x_train shape:', x_train.shape) print(x_train.shape[0], 'train samples') print(x_test.shape[0], 'test samples') ###### Model 0 ############################## model = Sequential() model.add(
def load_normalize_data(path): data, label = load_data(path) processed_data = data_preprocessing(data) return processed_data, label
y_batch = y_batch.to(device=args.device) out = net(P, roots, X_batch, X_batch_daily, X_batch_weekly, r2sDic, s2rDic, randomtrajs, mask1) loss = masked_mae_loss(out, y_batch) loss.backward() optimizer.step() epoch_training_losses.append(loss.detach().cpu().numpy()) return sum(epoch_training_losses) / len(epoch_training_losses) if __name__ == '__main__': torch.manual_seed(1) X, r2sDic, s2rDic, trajDic, keys = load_data(pathNum, pathLen) split_line1 = int(X.shape[2] * 0.7) split_line2 = int(X.shape[2] * 0.8) split_line3 = int(X.shape[2]) np.save("train_cd.npy", X[:, :, :split_line1]) np.save("val_cd.npy", X[:, :, split_line1:split_line2]) np.save("test_cd.npy", X[:, :, split_line2:]) means = np.mean(X[:, :, :split_line1], axis=(0, 2)) stds = np.std(X[:, :, :split_line1], axis=(0, 2)) X = X - means[0] X = X / stds[0] print(means) print(stds) print(X.shape)
else: return False parser = argparse.ArgumentParser() parser.add_argument('--lr', type=float, default=1e-2, help='Learning rate for the parameters') parser.add_argument('--wd', type=float, default=1e-2, help='Weight decay for the parameters') parser.add_argument('--n_hid', type=int, default=112, help='hidden layer for RNN') parser.add_argument('--n_iter', type=int, default=9, help='(time-steps + 1) for RNN') parser.add_argument('--dataset', type=str, default='cora', help='dataset, also use "citeseer" or "pubmed"') parser.add_argument('--ps', type=int, default=5, help='patience for early stopping') parser.add_argument('--d1', type=float, default=0.2, help='dropout rate for RNN') parser.add_argument('--d2', type=float, default=0.2, help='dropout rate for dense(attention)') parser.add_argument('--d3', type=float, default=0.4, help='dropout rate for dense(classification)') arg = parser.parse_args() features_, labels_, adj, deg, deg_inv = load_data(arg.dataset) P = torch.from_numpy(deg_inv.dot(adj.todense())) features = torch.from_numpy(features_.todense()) labels = torch.from_numpy(labels_).long() n_nodes, n_feats = features_.shape[0], features_.shape[1] n_class = np.int(np.max(labels_) + 1) ### Belows are the hyperparameters n_hids = arg.n_hid n_iters = arg.n_iter d1 = arg.d1 # Dropout rate for RNN d2 = arg.d2 # Dropout rate for attention d3 = arg.d3 # Dropout rate for dense(classification) n_epochs = arg.n_iter lr = arg.lr # Learning rate for the parameters wd = arg.wd # Weight decay for the parameters ps = arg.ps #Patience rate for Early Stopping
import utils n_players, max_marble = utils.load_data() print(utils.compute_max_score(n_players, max_marble))
def train(model, supervisor, num_label): trX, trY, num_tr_batch, valX, valY, num_val_batch = load_data( cfg.dataset, cfg.batch_size, is_training=True) Y = valY[:num_val_batch * cfg.batch_size].reshape((-1, 1)) fd_train_acc, fd_loss, fd_val_acc = save_to() config = tf.ConfigProto() config.gpu_options.allow_growth = True with supervisor.managed_session(config=config) as sess: print("\nNote: all of results will be saved to directory: " + cfg.results) for epoch in range(cfg.epoch): print("Training for epoch %d/%d:" % (epoch, cfg.epoch)) if supervisor.should_stop(): print('supervisor stoped!') break for step in tqdm(range(num_tr_batch), total=num_tr_batch, ncols=70, leave=False, unit='b'): start = step * cfg.batch_size end = start + cfg.batch_size global_step = epoch * num_tr_batch + step if global_step % cfg.train_sum_freq == 0: _, loss, train_acc, summary_str = sess.run([ model.train_op, model.total_loss, model.accuracy, model.train_summary ]) assert not np.isnan( loss), 'Something wrong! loss is nan...' supervisor.summary_writer.add_summary( summary_str, global_step) fd_loss.write(str(global_step) + ',' + str(loss) + "\n") fd_loss.flush() fd_train_acc.write( str(global_step) + ',' + str(train_acc / cfg.batch_size) + "\n") fd_train_acc.flush() else: sess.run(model.train_op) if cfg.val_sum_freq != 0 and ( global_step) % cfg.val_sum_freq == 0: val_acc = 0 for i in range(num_val_batch): start = i * cfg.batch_size end = start + cfg.batch_size acc = sess.run( model.accuracy, { model.X: valX[start:end], model.labels: valY[start:end] }) val_acc += acc val_acc = val_acc / (cfg.batch_size * num_val_batch) fd_val_acc.write( str(global_step) + ',' + str(val_acc) + '\n') fd_val_acc.flush() if (epoch + 1) % cfg.save_freq == 0: supervisor.saver.save( sess, cfg.logdir + '/model_epoch_%04d_step_%02d' % (epoch, global_step)) fd_val_acc.close() fd_train_acc.close() fd_loss.close()
model = DQN(state_dim, NUM_ACTIONS, NUM_OBJECTS) optimizer = optim.SGD(model.parameters(), lr=ALPHA) single_run_epoch_rewards_test = [] pbar = tqdm(range(NUM_EPOCHS), ncols=80) for _ in pbar: single_run_epoch_rewards_test.append(run_epoch()) pbar.set_description( "Avg reward: {:0.6f} | Ewma reward: {:0.6f}".format( np.mean(single_run_epoch_rewards_test), utils.ewma(single_run_epoch_rewards_test))) return single_run_epoch_rewards_test if __name__ == '__main__': state_texts = utils.load_data('game.tsv') dictionary = utils.bag_of_words(state_texts) state_dim = len(dictionary) # set up the game framework.load_game_data() epoch_rewards_test = [] # shape NUM_RUNS * NUM_EPOCHS for _ in range(NUM_RUNS): epoch_rewards_test.append(run()) epoch_rewards_test = np.array(epoch_rewards_test) x = np.arange(NUM_EPOCHS) fig, axis = plt.subplots()
def train(self, epochs, batch_size=32, sample_interval=500, start_point=0): # Load the dataset X_train, y_train = utils.load_data(self.writer) # Adversarial ground truths valid = np.ones((batch_size, 1)) fake = np.zeros((batch_size, 1)) for epoch in range(start_point, epochs): # --------------------- # Train Discriminator # --------------------- # Select a random batch of images idx = np.random.randint(0, X_train.shape[0], batch_size) imgs = X_train[idx] # Sample noise as generator input noise = np.random.normal(0, 1, (batch_size, self.latent_dim)) # The labels of the digits that the generator tries to create an # image representation of sampled_labels = np.random.uniform(0, 1, (batch_size, self.num_classes)) sampled_labels = np.around(sampled_labels) # Generate a half batch of new images gen_imgs = self.generator.predict([noise, sampled_labels]) # Image labels. 0-9 img_labels = y_train[idx] # Train the discriminator d_loss_real = self.discriminator.train_on_batch( imgs, [valid, img_labels]) d_loss_fake = self.discriminator.train_on_batch( gen_imgs, [fake, sampled_labels]) d_loss = 0.5 * np.add(d_loss_real, d_loss_fake) # --------------------- # Train Generator # --------------------- # Train the generator g_loss = self.combined.train_on_batch([noise, sampled_labels], [valid, sampled_labels]) # Plot the progress print( "%d [D loss: %f, acc.: %.2f%%, op_acc: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[3], 100 * d_loss[4], g_loss[0])) utils.write_log( self.writer, ['D loss', 'G loss', 'accuracy', 'class accuracy'], [d_loss[0], g_loss[0], 100 * d_loss[3], 100 * d_loss[4]], epoch) # If at save interval => save generated image samples if epoch % sample_interval == 0: utils.save_model( '%s/' % ('acgan' if self.flags.name is None else self.flags.name), self.generator, self.discriminator, epoch) self.sample_images(epoch)
def main(filename): """ Main function for generating submissions. """ y_pred_all = [] X_train, y_train_all, X_test = load_data() for n in range(3): print( "############## working on dataset {} ###################".format( str(n + 1))) # process y_train = 2 * np.array(y_train_all[2000 * n:2000 * (n + 1)]) - 1 k, n_mismatch = 13, 3 if n != 0: print("Compute gram matrix for first kernel") gram_train_13_3, gram_test_13_3 = get_gram_matrix( X_train[2000 * n:2000 * (n + 1)], X_test[1000 * n:1000 * (n + 1)], k=k, n_mismatch=n_mismatch, n_kernel=n + 1, ) k, n_mismatch = 12, 2 if n != 0: print("Compute gram matrix for second kernel ") gram_train_12_2, gram_test_12_2 = get_gram_matrix( X_train[2000 * n:2000 * (n + 1)], X_test[1000 * n:1000 * (n + 1)], k=k, n_mismatch=n_mismatch, n_kernel=n + 1, ) print("Compute gram matrix for third kernel ") k, n_mismatch = 13, 2 gram_train_13_2, gram_test_13_2 = get_gram_matrix( X_train[2000 * n:2000 * (n + 1)], X_test[1000 * n:1000 * (n + 1)], k=k, n_mismatch=n_mismatch, n_kernel=n + 1, ) print("Training and generating prediction") if n == 0: train_grams = [gram_train_13_2] test_grams = [gram_test_13_2] y_pred = predict_first_set(train_grams, test_grams, y_train) elif n == 1: train_grams = [gram_train_13_2, gram_train_12_2, gram_train_13_3] test_grams = [gram_test_13_2, gram_test_12_2, gram_test_13_3] y_pred = predict_second_set(train_grams, test_grams, y_train) else: train_grams = [gram_train_13_2, gram_train_12_2, gram_train_13_3] test_grams = [gram_test_13_2, gram_test_12_2, gram_test_13_3] y_pred = predict_third_set(train_grams, test_grams, y_train) y_pred = (y_pred + 1) / 2 y_pred_all += list(y_pred) print("Saving prediction in CSV file") with open(filename, "w") as csvfile: fieldnames = ["Id", "Bound"] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for i in tqdm(range(0, len(y_pred_all))): writer.writerow({"Id": i, "Bound": int(y_pred_all[i])}) print("You can find results on " + filename)
def run_fix_mask(args, seed): pruning.setup_seed(seed) adj, features, labels, idx_train, idx_val, idx_test = load_data( args['dataset']) node_num = features.size()[0] class_num = labels.numpy().max() + 1 adj = adj.cuda() features = features.cuda() labels = labels.cuda() loss_func = nn.CrossEntropyLoss() net_gcn = net.net_gcn(embedding_dim=args['embedding_dim'], adj=adj) pruning.add_mask(net_gcn) net_gcn = net_gcn.cuda() print("load : {}".format(args['weight_dir'])) encoder_weight = {} cl_ckpt = torch.load(args['weight_dir'], map_location='cuda') encoder_weight['weight_orig_weight'] = cl_ckpt['gcn.fc.weight'] ori_state_dict = net_gcn.net_layer[0].state_dict() ori_state_dict.update(encoder_weight) net_gcn.net_layer[0].load_state_dict(ori_state_dict) for name, param in net_gcn.named_parameters(): if 'mask' in name: param.requires_grad = False optimizer = torch.optim.Adam(net_gcn.parameters(), lr=args['lr'], weight_decay=args['weight_decay']) acc_test = 0.0 best_val_acc = {'val_acc': 0, 'epoch': 0, 'test_acc': 0} for epoch in range(args['total_epoch']): optimizer.zero_grad() output = net_gcn(features, adj) loss = loss_func(output[idx_train], labels[idx_train]) loss.backward() optimizer.step() with torch.no_grad(): output = net_gcn(features, adj, val_test=True) acc_val = f1_score(labels[idx_val].cpu().numpy(), output[idx_val].cpu().numpy().argmax(axis=1), average='micro') acc_test = f1_score(labels[idx_test].cpu().numpy(), output[idx_test].cpu().numpy().argmax(axis=1), average='micro') if acc_val > best_val_acc['val_acc']: best_val_acc['val_acc'] = acc_val best_val_acc['test_acc'] = acc_test best_val_acc['epoch'] = epoch print( "(Fix Mask) Epoch:[{}] Val:[{:.2f}] Test:[{:.2f}] | Final Val:[{:.2f}] Test:[{:.2f}] at Epoch:[{}]" .format(epoch, acc_val * 100, acc_test * 100, best_val_acc['val_acc'] * 100, best_val_acc['test_acc'] * 100, best_val_acc['epoch'])) return best_val_acc['val_acc'], best_val_acc['test_acc'], best_val_acc[ 'epoch']
        FN += 1 iff test_group[i][0] == 'M' and classify(test_group[i]) == 'B'"""
        test_group = test_group
        FP, FN = 0, 0
        size = len(test_group)
        for i in range(size):
            if self.root.find_class_by_example(test_group[i]) == 'M' and test_group[i][0] == 'B':
                FP += 1
            elif self.root.find_class_by_example(test_group[i]) == 'B' and test_group[i][0] == 'M':
                FN += 1
        loss = lost(FP, FN, size)
        return loss


if __name__ == '__main__':
    data = load_data("train.csv")
    classifier = ID3(data)
    classifier.train()
    tester = load_data("test.csv")
    classifier.test(tester, True)

    """loss calc"""
    # loss = classifier.test_by_loss(tester)
    # print(loss)

    """this is the experiment"""
    # experiment("train.csv")

    """this is the accuracy check with M = 1"""
    # classifier = ID3(data, 1)
    # classifier.train()
    # classifier.test(tester, True)
import project1 as p1 import utils #------------------------------------------------------------------------------- # Data loading. There is no need to edit code in this section. #------------------------------------------------------------------------------- train_data = utils.load_data('reviews_train.tsv') # val_data = utils.load_data('reviews_val.tsv') # test_data = utils.load_data('reviews_test.tsv') train_texts, train_labels = zip(*((sample['text'], sample['sentiment']) for sample in train_data)) # val_texts, val_labels = zip(*((sample['text'], sample['sentiment']) for sample in val_data)) # test_texts, test_labels = zip(*((sample['text'], sample['sentiment']) for sample in test_data)) # dictionary = p1.bag_of_words(train_texts) # dictionary_no_stopwords = p1.bag_of_words_removed_stopwords(train_texts) # print("Length of Normal Dictionary:", len(dictionary), "\nLength of Dictionary Without Stopwords and Punc:", len(dictionary_no_stopwords)) # train_bow_features = p1.extract_bow_feature_vectors(train_texts, dictionary) # val_bow_features = p1.extract_bow_feature_vectors(val_texts, dictionary) # test_bow_features = p1.extract_bow_feature_vectors(test_texts, dictionary) # # get the feature vectors with stopwords removed, punctuation removed, and words counted with frequency # train_bow_features_no_stopwords = p1.extract_bow_feature_vectors_with_frequency(train_texts, dictionary_no_stopwords) # val_bow_features_no_stopwords = p1.extract_bow_feature_vectors_with_frequency(val_texts, dictionary_no_stopwords) # test_bow_features_no_stopwords = p1.extract_bow_feature_vectors_with_frequency(test_texts, dictionary_no_stopwords) # # get the final features
model.summary() x_input = tf.placeholder(tf.float32, [None, 224, 224, 3]) y = model(x_input) t = tf.placeholder(tf.float32, [None, 10]) learning_rate = tf.placeholder(tf.float32, []) cost = earth_mover_loss(t, y) train = tf.train.MomentumOptimizer(learning_rate, 0.9).minimize(cost) ###################################################################################### from sklearn.utils import shuffle from sklearn.metrics import precision_score, accuracy_score, f1_score, confusion_matrix from sklearn.model_selection import train_test_split from skimage.transform import resize train_X_raw, train_y_raw = utils.load_data() train_X_raw = train_X_raw train_y_raw = train_y_raw train_X_raw, test_X_raw, train_y_raw, test_y_raw = train_test_split(train_X_raw, train_y_raw, test_size=0.1, random_state=42) test_X = np.zeros([0, image_size, image_size, 3]) for image in range(test_X_raw.shape[0]): pic = test_X_raw[image] img = resize(pic,(image_size,image_size,3)) temp = np.reshape(img, (1, image_size,image_size,3)) test_X = np.append(test_X, temp, axis=0) test_y = test_y_raw print('test size:') print(test_X.shape)
def main(argv=None): print("Loading training data..") train_data = load_data(FLAGS.train_prefix, load_walks=True) print("Done loading training data..") train(train_data)
if __name__ == "__main__": # argparse args = get_train_args() # check path_to_save existence if os.path.exists(args.path_to_save_folder): raise FileExistsError("save path folder already exists") # set seed and device set_global_seed(args.seed) device = torch.device(args.device) # load data data = load_data(path=args.path_to_data, verbose=args.verbose) # char2idx char2idx = get_char2idx(data, verbose=args.verbose) # dataset, collator, dataloader train_dataset = LMDataset( data, char2idx, max_length=args.max_length, verbose=args.verbose, ) train_collator = LMCollator(padding_value=char2idx[EOS], ) train_loader = DataLoader( train_dataset, batch_size=args.batch_size,
import os import numpy as np import random from config import config_setting from model import Model from utils import load_data from train import train from torch import nn if __name__ == '__main__': cfg = config_setting() train_loader, test_loader, features = load_data(cfg) model = Model(features, cfg) if cfg.use_cuda: os.environ['CUDA_VISIBLE_DEVICES'] = '0' model = model.cuda() print(model) train(model, train_loader, test_loader, features, cfg)
def run(): # Field mapping. And which fields should be turned into integers. # See https://en.wikipedia.org/wiki/Template:CongLinks for what's possibly available. fieldmap = { "congbio": "bioguide", #"fec": "fec", # handled specially... "govtrack": "govtrack", # for sanity checking since we definitely have this already (I caught some Wikipedia errors) "opensecrets": "opensecrets", "votesmart": "votesmart", "cspan": "cspan", } int_fields = ("govtrack", "votesmart", "cspan") # default to not caching cache = utils.flags().get('cache', False) # Load legislator files and map bioguide IDs. y1 = utils.load_data("legislators-current.yaml") y2 = utils.load_data("legislators-historical.yaml") bioguides = {} for y in y1 + y2: bioguides[y["id"]["bioguide"]] = y # Okay now the Wikipedia stuff... def get_matching_pages(): # Does a Wikipedia API search for pages containing either of the # two templates. Returns the pages. page_titles = set() for template in ("CongLinks", "CongBio"): eicontinue = "" while True: # construct query URL, using the "eicontinue" of the last query to get the next batch url = 'http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template if eicontinue: url += "&eicontinue=" + eicontinue # load the XML print("Getting %s pages (%d...)" % (template, len(page_titles))) dom = lxml.etree.fromstring(utils.download( url, None, True)) # can't cache eicontinue probably for pgname in dom.xpath("query/embeddedin/ei/@title"): page_titles.add(pgname) # get the next eicontinue value and loop eicontinue = dom.xpath( "string(query-continue/embeddedin/@eicontinue)") if not eicontinue: break return page_titles # Get the list of Wikipedia pages that use any of the templates we care about. page_list_cache_file = os.path.join(utils.cache_dir(), "legislators/wikipedia/page_titles") if cache and os.path.exists(page_list_cache_file): # Load from cache. matching_pages = open(page_list_cache_file).read().split("\n") else: # Query Wikipedia API and save to cache. matching_pages = get_matching_pages() utils.write(("\n".join(matching_pages)), page_list_cache_file) # Filter out things that aren't actually pages (User:, Talk:, etcetera, anything with a colon). matching_pages = [p for p in matching_pages if ":" not in p] # Load each page's content and parse the template. for p in sorted(matching_pages): if " campaign" in p: continue if " (surname)" in p: continue if "career of " in p: continue if "for Congress" in p: continue if p.startswith("List of "): continue if p in ("New York in the American Civil War", "Upper Marlboro, Maryland"): continue # Query the Wikipedia API to get the raw page content in XML, # and then use XPath to get the raw page text. url = "http://en.wikipedia.org/w/api.php?action=query&titles=" + urllib.parse.quote( p.encode("utf8")) + "&export&exportnowrap" cache_path = "legislators/wikipedia/pages/" + p dom = lxml.etree.fromstring(utils.download(url, cache_path, not cache)) page_content = dom.xpath( "string(mw:page/mw:revision/mw:text)", namespaces={"mw": "http://www.mediawiki.org/xml/export-0.8/"}) # Build a dict for the IDs that we want to insert into our files. new_ids = { "wikipedia": p # Wikipedia page name, with spaces for spaces (not underscores) } if "CongLinks" in page_content: # Parse the key/val pairs in the template. m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content) if not m: continue # no template? 
for arg in m.group(1).split("|"): if "=" not in arg: continue key, val = arg.split("=", 1) key = key.strip() val = val.strip() if val and key in fieldmap: try: if fieldmap[key] in int_fields: val = int(val) except ValueError: print("invalid value", key, val) continue if key == "opensecrets": val = val.replace("&newMem=Y", "").replace( "&newmem=Y", "").replace("&cycle=2004", "").upper() new_ids[fieldmap[key]] = val if "bioguide" not in new_ids: continue new_ids["bioguide"] = new_ids["bioguide"].upper() # hmm bioguide = new_ids["bioguide"] else: m = re.search(r"\{\{\s*CongBio\s*\|\s*(\w+)\s*\}\}", page_content) if not m: continue # no template? bioguide = m.group(1).upper() if not bioguide in bioguides: print( "Member not found: " + bioguide, p, "(Might have been a delegate to the Constitutional Convention.)" ) continue # handle FEC ids specially because they are stored in an array... fec_id = new_ids.get("fec") if fec_id: del new_ids["fec"] member = bioguides[bioguide] member["id"].update(new_ids) # ...finish the FEC id. if fec_id: if fec_id not in bioguides[bioguide]["id"].get("fec", []): bioguides[bioguide]["id"].setdefault("fec", []).append(fec_id) #print p.encode("utf8"), new_ids utils.save_data(y1, "legislators-current.yaml") utils.save_data(y2, "legislators-historical.yaml")
def main(): # gpu configuration toolkits.initialize_GPU(args) import model # ================================== # Get Model # ================================== # construct the data generator. params = {'dim': (23, None, 1), 'nfft': 512, 'spec_len': 250, 'win_length': 400, 'hop_length': 160, 'n_classes': 5994, 'sampling_rate': 16000, 'normalize': True, } network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'], num_class=params['n_classes'], mode='eval', args=args) utt2ark, utt2idx, all_list, utt2data = {}, {}, [], {} for idx, kaldi_data_dir in enumerate(args.kaldi_data_dirs): if not os.path.exists(args.emb_out_dirs[idx]): os.makedirs(args.emb_out_dirs[idx]) feats_path = os.path.join(kaldi_data_dir, 'feats.scp') vad_path = os.path.join(kaldi_data_dir, 'vad.scp') assert os.path.exists(feats_path), 'Path `{}` does not exists.'.format(feats_path) with open(feats_path) as f: for line in f: key, ark = line.split() ark, position = ark.split(':') input_tuple = (key, ark, int(position)) utt2data[key] = ut.load_data(input_tuple, mode='eval') utt2idx[key] = idx with open(vad_path) as f: for line in f: key, ark = line.split() ark, position = ark.split(':') vad_array = None for ark_key, vec in kaldi_io.read_vec_flt_ark(ark): if key == ark_key: vad_array = np.array(vec, dtype=bool) assert vad_array is not None assert vad_array.size == utt2data[key].shape[1], 'Shapes does not fit: vad {}, mfcc {}'.format( vad_array.size, utt2data[key].shape[1]) utt2data[key] = ut.apply_cmvn_sliding(utt2data[key]).T[vad_array] # ==> load pre-trained model ??? if os.path.isfile(args.resume): network_eval.load_weights(os.path.join(args.resume), by_name=True) print('==> successfully loaded model {}.'.format(args.resume)) else: raise IOError("==> no checkpoint found at '{}'".format(args.resume)) print('==> start testing.') # The feature extraction process has to be done sample-by-sample, # because each sample is of different lengths. for idx, utt in enumerate(utt2data): embedding = network_eval.predict(utt2data[utt].T[np.newaxis, :, :, np.newaxis]).squeeze() ut.write_txt_vectors( os.path.join(args.emb_out_dirs[utt2idx[utt]], 'xvector.{}.txt'.format(idx)), {utt: embedding})
from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger from keras.optimizers import SGD from models import simple_CNN from utils import load_data, preprocess_input import keras.backend as K import tensorflow as tf data_path = '../datasets/fer2013/fer2013.csv' model_save_path = '../trained_models/simpler_CNN.hdf5' faces, emotions = load_data(data_path) faces = preprocess_input(faces) num_classes = emotions.shape[1] image_size = faces.shape[1:] batch_size = 128 num_epochs = 1000 model = simple_CNN(image_size, num_classes) model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) csv_logger = CSVLogger('training.log') early_stop = EarlyStopping('val_acc',patience=200,verbose=1) model_checkpoint = ModelCheckpoint(model_save_path, 'val_acc', verbose=1, save_best_only=True) model_callbacks = [early_stop, model_checkpoint, csv_logger] #keras bug K.get_session().run(tf.global_variables_initializer()) model.fit(faces,emotions,batch_size,num_epochs,verbose=1, callbacks=model_callbacks,
from deepy.trainers import SGDTrainer, LearningRateAnnealer, AdamTrainer from deepy.layers import LSTM from layers import FullOutputLayer logging.basicConfig(level=logging.INFO) default_model = os.path.join(os.path.dirname(__file__), "models", "lstm_rnnlm.gz") if __name__ == '__main__': ap = ArgumentParser() ap.add_argument("--model", default="") ap.add_argument("--small", action="store_true") args = ap.parse_args() vocab, lmdata = load_data(small=args.small, history_len=5, batch_size=64) model = NeuralLM(vocab.size) model.stack( LSTM(hidden_size=100, output_type="sequence", persistent_state=True, batch_size=lmdata.size, reset_state_for_input=0), FullOutputLayer(vocab.size)) if os.path.exists(args.model): model.load_params(args.model) trainer = SGDTrainer( model, { "learning_rate": LearningRateAnnealer.learning_rate(1.2), "weight_l2": 1e-7
# parse training arguments parser = argparse.ArgumentParser() parser.add_argument('--epochs', type = int, default = 10000, help = 'Number of epochs to train.') parser.add_argument('--lr', type = float, default = 0.005, help = 'Initial learning rate.') parser.add_argument('--weight_decay', type = float, default = 5e-4, help = 'Weight decay (L2 loss on parameters).') parser.add_argument('--hidden', type = int, default = 8, help = 'Number of hidden units.') parser.add_argument('--n_heads', type = int, default = 8, help = 'Number of head attentions.') parser.add_argument('--dropout', type = float, default = 0.6, help = 'Dropout rate (1 - keep probability).') parser.add_argument('--alpha', type = float, default = 0.2, help = 'Alpha for the leaky_relu.') parser.add_argument('--patience', type = int, default = 100, help = 'Patience') args = parser.parse_args() args.use_cuda = torch.cuda.is_available() # load data adj, features, labels, idx_train, idx_val, idx_test = load_data() model = GAT(n_input = features.shape[1], n_hidden = args.hidden, n_classes = int(labels.max()) + 1, dropout = args.dropout, alpha = args.alpha, n_heads = args.n_heads) if args.use_cuda: model.cuda() features = features.cuda() adj = adj.cuda() labels = labels.cuda() idx_train = idx_train.cuda() idx_val = idx_val.cuda() idx_test = idx_test.cuda()
shuffle=data_type == "train", drop_last=False) data_loader = DataLoader(dataset, batch_sampler=sampler, collate_fn=EdgeSeqDataset.batchify, pin_memory=data_type == "train") data_loaders[data_type] = data_loader logger.info("data (data_type: {:<5s}, len: {}) generated".format( data_type, len(dataset.data))) logger.info( "data_loader (data_type: {:<5s}, len: {}, batch_size: {}) generated" .format(data_type, len(data_loader), finetune_config["batch_size"])) else: data = load_data(finetune_config["graph_dir"], finetune_config["pattern_dir"], finetune_config["metadata_dir"], num_workers=finetune_config["num_workers"]) logger.info("{}/{}/{} data loaded".format(len(data["train"]), len(data["dev"]), len(data["test"]))) for data_type, x in data.items(): if finetune_config["model"] in ["RGCN", "RGIN", "RSIN"]: if os.path.exists( os.path.join(finetune_config["save_data_dir"], "%s_dgl_dataset.pt" % (data_type))): dataset = GraphAdjDataset(list()) dataset.load( os.path.join(finetune_config["save_data_dir"], "%s_dgl_dataset.pt" % (data_type))) else: dataset = GraphAdjDataset(x)
prompt += ('====================================\n') print(prompt, end='') f = open('{0}/opt.txt'.format(run_dir), 'w') f.write(prompt) f.close() if torch.cuda.is_available(): # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # os.environ["CUDA_VISIBLE_DEVICES"] = "{0}".format(opt.gpu) torch.cuda.set_device(opt.gpu) # device = torch.device('cuda:{0}'.format(opt.gpu)) # Configure data loader import utils trainset, trainset2, testset = utils.load_data(opt=opt) train_loader = torch.utils.data.DataLoader(trainset, batch_size=opt.batch_size, drop_last=True, sampler=InfiniteSampler( len(trainset))) # model train_loader2 = torch.utils.data.DataLoader(trainset2, batch_size=opt.batch_size, drop_last=True, sampler=InfiniteSampler( len(trainset2))) # model test_loader = torch.utils.data.DataLoader(testset, batch_size=opt.batch_size, shuffle=True, drop_last=True) # model
trend_rdd = yeargrowth_rdd \ .map(lambda row: ((row[0][0], row[0][1]), str(row[0][2]) + ":" + utils.prettify_growth(row[1]))) \ .groupByKey() \ .map(lambda row: (row[1], (row[0]))) \ .cache() # .mapValues(iterate) \ # .mapValues(iterate) put it after groupByKey to see content of trend similartrendingcompanies_rdd = trend_rdd.join(trend_rdd) \ .filter(filter_couples) \ .collect() # sort values and remove duplicates (A,B)(B,A)) # .map(lambda row: (row[0], tuple(sorted(row[1])))).distinct() # it seems that the rdd already does combinations for kv in similartrendingcompanies_rdd: print(kv) if __name__ == "__main__": spark = utils.create_session("job2") sc = spark.sparkContext # sqlContext = SQLContext(sc) history_rdd = utils.load_data(spark, HISTORY_PATH, preview=False) legend_rdd = utils.load_data(spark, LEGEND_PATH, preview=False) run_job(history_rdd, legend_rdd)
def main(): # gpu configuration toolkits.initialize_GPU(args) import model # ================================== # Get Train/Val. # ================================== print('==> calculating test({}) data lists...'.format(args.test_type)) publicTest = pd.read_csv("/content/VoveDataset/public-test.csv") list1 = addPath(np.array(publicTest["audio_1"])) list2 = addPath(np.array(publicTest["audio_2"])) total_list = np.concatenate((list1, list2)) unique_list = np.unique(total_list) # ================================== # Get Model # ================================== # construct the data generator. params = { 'dim': (257, None, 1), 'nfft': 512, 'spec_len': 250, 'win_length': 400, 'hop_length': 160, 'n_classes': 5994, 'sampling_rate': 16000, 'normalize': True, } network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'], num_class=params['n_classes'], mode='eval', args=args) # ==> load pre-trained model ??? if args.resume: # ==> get real_model from arguments input, # load the model if the imag_model == real_model. if os.path.isfile(args.resume): network_eval.load_weights(os.path.join(args.resume), by_name=True) result_path = "/content/VGG-Speaker-Recognition/result" print('==> successfully loading model {}.'.format(args.resume)) else: raise IOError("==> no checkpoint found at '{}'".format( args.resume)) else: raise IOError('==> please type in the model to load') print('==> start testing.') # The feature extraction process has to be done sample-by-sample, # because each sample is of different lengths. total_length = len(unique_list) feats, scores, labels = [], [], [] for c, ID in enumerate(pbar(unique_list)): specs = ut.load_data(ID, win_length=params['win_length'], sr=params['sampling_rate'], hop_length=params['hop_length'], n_fft=params['nfft'], spec_len=params['spec_len'], mode='eval') specs = np.expand_dims(np.expand_dims(specs, 0), -1) v = network_eval.predict(specs) feats += [v] feats = np.array(feats) np.save("/content/feats.npy", feats)
    preds_train = aclf.predict(x_train)
    return train_accuracy, test_accuracy, f1(y_train, preds_train), f1(y_test, preds)


###################################################
# Modify for running your experiments accordingly #
###################################################
if __name__ == '__main__':
    args = load_args()
    accuracy_train = []
    f1_tscore = []
    accuracy = []
    f1_score = []
    x_axis = np.arange(1, 26)
    max_features = [1, 2, 5, 8, 10, 20, 25, 35, 50]
    # x_axis = np.arange(10,220,10)
    random = np.arange(1, 11)
    x_train, y_train, x_test, y_test = load_data(args.root_dir)
    L = np.arange(10, 210, 10)
    if args.county_dict == 1:
        county_info(args)
    if args.decision_tree == 1:
        for x in range(1, 26):  # match the 25 points in x_axis
            train_acc, test_acc, f1_train, f1_test = decision_tree_testing(
                x_train, y_train, x_test, y_test, x)
            accuracy_train.append(train_acc)
            accuracy.append(test_acc)
            f1_tscore.append(f1_train)
            f1_score.append(f1_test)
        plt.plot(x_axis, accuracy, label="Testing Accuracy")
        plt.plot(x_axis, f1_score, label="Testing F1 Score")
        plt.plot(x_axis, accuracy_train, label="Training Accuracy")
        plt.plot(x_axis, f1_tscore, label="Training F1 Score")
        plt.ylabel("Accuracy")
for proposal_folder in proposal_folders: fn_clusters = sorted(glob.glob(os.path.join(proposal_folder, fn_node_pattern))) proposals.extend([fn_node for fn_node in fn_clusters]) assert len(proposals) == len(probs) pos_lst = [] for idx, prob in enumerate(probs): if prob < args.th_pos: continue pos_lst.append([idx, prob]) pos_lst = sorted(pos_lst, key=lambda x:x[1], reverse=True) # get all clusters clusters = [] for idx, _ in tqdm(pos_lst): cluster = load_data(proposals[idx]) clusters.append(cluster) idx2lb, idx2lbs = nms(clusters, args.th_iou) # output stats multi_lb_num = 0 for _, lbs in idx2lbs.items(): if len(lbs) > 1: multi_lb_num += 1 inst_num = len(idx2lb) cls_num = len(set(idx2lb.values())) print('#inst: {}, #class: {}, #multi-label: {}'.format(inst_num, cls_num, multi_lb_num)) print('#inst-coverage: {:.2f}'.format(1. * inst_num / tot_inst_num))
from config import args from utils import load_data, build_vocab, gen_submission, gen_final_submission, eval_based_on_outputs from model import Model if __name__ == '__main__': if not args.pretrained: print('No pretrained model specified.') exit(0) build_vocab() if args.test_mode: dev_data = load_data('../data/test-data-processed.json') else: dev_data = load_data('../data/dev-data-processed.json') model_path_list = args.pretrained.split(',') for model_path in model_path_list: print('Load model from %s...' % model_path) args.pretrained = model_path model = Model(args) # evaluate on development dataset dev_acc = model.evaluate(dev_data) print('dev accuracy: %f' % dev_acc) # generate submission zip file for Codalab prediction = model.predict(dev_data) gen_submission(dev_data, prediction) gen_final_submission(dev_data) eval_based_on_outputs('./answer.txt')