def xgboost_train(sym_pred=False):
    train_explicit, train_implicit, train_tag = load('train')
    dev_explicit, dev_implicit, dev_tag = load('dev')
    if sym_pred:
        train_explicit = train_explicit + train_implicit * symptom_predict(
            train_explicit)
        dev_explicit = dev_explicit + dev_implicit * symptom_predict(
            dev_explicit)
    xg_train = xgb.DMatrix(train_explicit, label=train_tag)
    xg_test = xgb.DMatrix(dev_explicit, label=dev_tag)

    # 1. Train the model
    # setup parameters for xgboost
    param = {}
    # use softmax multi-class classification
    param['objective'] = 'multi:softmax'
    # scale weight of positive examples
    param['eta'] = 0.1
    param['max_depth'] = 10
    param['silent'] = 1
    param['nthread'] = 4
    param['num_class'] = 12

    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 100
    bst = xgb.train(param, xg_train, num_round, watchlist)

    pred = bst.predict(xg_test)
    train_pred = bst.predict(xg_train)
    print('dev classification accuracy=%f' % accuracy_score(dev_tag, pred))
    print('train classification accuracy=%f' %
          accuracy_score(train_tag, train_pred))
    pickle.dump(bst, open("disease.pickle.dat", "wb"))
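# A minimal reload-and-predict sketch, not part of the original code: it only
# assumes what xgboost_train() above establishes, namely that the booster is
# pickled to "disease.pickle.dat" and consumes the same explicit-feature
# matrix. The helper name predict_disease is hypothetical.
import pickle
import xgboost as xgb

def predict_disease(explicit_features):
    bst = pickle.load(open("disease.pickle.dat", "rb"))
    # with objective 'multi:softmax', predict() returns one class index per row
    return bst.predict(xgb.DMatrix(explicit_features))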
def main(self):
    """ Tests data processing methods
    """
    try:
        preprocess.setup()
    except:
        print 'SETUP failed'
    else:
        print 'SETUP succeeded'

    try:
        d = preprocess.load(prefix=PAR.OBSERVATIONS)
        s = preprocess.load(prefix=PAR.SYNTHETICS)
    except:
        print 'LOAD failed'
    else:
        print 'LOAD succeeded'

    try:
        d = preprocess.process_traces(d)
        s = preprocess.process_traces(s)
    except:
        print 'PROCESS_TRACES failed'
    else:
        print 'PROCESS_TRACES succeeded'
def predict(input_path, output_path, resources_path):
    """
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the
    checkpoint and write a new file (output_path) with your predictions in
    the BIES format.

    The resources folder should contain everything you need to make the
    predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise
    we will not be able to run the code.

    :param input_path: the path of the input file to predict.
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """
    model = load_model(resources_path + '/model.h5')
    model.summary()

    dictionary = load(resources_path + 'dictionary')
    word2id = dictionary['word2id']
    id2label = dictionary['id2label']

    X_chinese, y, characters, sizes = file2BIES(input_path)

    # Process X
    X_processed = processX(X_chinese, word2id, sentence_size=626)

    y_pred = model.predict(X_processed)

    prediction = []
    arg = np.argmax(y_pred, axis=2)
    for i in range(len(arg)):
        sentence = arg[i]
        labels = []
        num_char = np.count_nonzero(X_processed[i])
        for char in sentence[0:num_char]:
            labels.append(id2label[char])
        prediction.append(labels)

    score(prediction, y, verbose=True)

    # Write prediction file
    filename, extension = os.path.splitext(output_path)
    with open(filename + '_prediction' + extension, "w+") as f:
        for line in prediction:
            f.write(''.join(str(e) for e in line))
            f.write('\n')

    # Write gold file
    with open(output_path, "w+") as f:
        for line in y:
            f.write(''.join(str(e) for e in line))
            f.write('\n')
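# Illustrative, standalone sketch of the BIES scheme mentioned in the docstring
# above (toy example only; bies_to_words is not a helper from the submission):
# B/I/E mark the begin/inside/end of a multi-character word and S a single-
# character word, so the label sequence fully determines the segmentation.
def bies_to_words(chars, labels):
    words, current = [], ""
    for ch, tag in zip(chars, labels):
        current += ch
        if tag in ("E", "S"):  # a word ends here
            words.append(current)
            current = ""
    if current:
        words.append(current)
    return words

print(bies_to_words("北京大学生", ["B", "E", "B", "I", "E"]))  # ['北京', '大学生']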
def write_receivers(self):
    unix.cd(self.getpath)
    key = 'use_existing_STATIONS'
    val = '.true.'
    solvertools.setpar(key, val)
    _, h = preprocess.load('traces/obs')
    solvertools.write_receivers(h.nr, h.rx, h.rz)
def train_and_test():
    train_explicit, train_implicit, train_tag = load('train')
    dev_explicit, dev_implicit, dev_tag = load('dev')
    train_symptom = np.array(train_implicit != 0, dtype=int)
    dev_symptom = np.array(dev_implicit != 0, dtype=int)
    clf_multilabel = MultiOutputClassifier(
        XGBClassifier(tree_method='gpu_hist',
                      gpu_id=0,
                      eval_metric='logloss',
                      use_label_encoder=False))
    clf_multilabel.fit(train_explicit, train_symptom)
    val_pred = clf_multilabel.predict(dev_explicit)
    print("f1 score", f1_score(dev_symptom, val_pred, average='macro'))
    pickle.dump(clf_multilabel, open("symptom.pickle.dat", "wb"))
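# A minimal sketch (assumption, not code from the original project) of how the
# multilabel classifier pickled above could back a symptom_predict()-style
# helper like the one referenced in xgboost_train(): reload the estimator and
# emit the 0/1 implicit-symptom matrix for a batch of explicit features.
import pickle

def load_symptom_predictor(path="symptom.pickle.dat"):
    clf = pickle.load(open(path, "rb"))
    # the returned callable mirrors the (hypothetical) symptom_predict interface
    return lambda explicit_features: clf.predict(explicit_features)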
def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    x_text, y = load(FLAGS.positive_data_file, FLAGS.negative_data_file)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del x, y, x_shuffled, y_shuffled

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, vocab_processor, x_dev, y_dev
def write_receivers(self):
    unix.cd(self.getpath)
    key = 'use_existing_STATIONS'
    val = '.true.'
    setpar(key, val)
    _, h = preprocess.load('traces/obs')
    solvertools.write_receivers(h.nr, h.rx, h.rz)
def main(self):
    unix.rm(PATH.SCRATCH)
    unix.mkdir(PATH.SCRATCH)
    preprocess.setup()

    print 'SIMULATION 1 OF 3'
    system.run('solver', 'setup', hosts='all')

    print 'SIMULATION 2 OF 3'
    self.prepare_model()
    system.run('solver', 'eval_func', hosts='all', path=PATH.SCRATCH)

    print 'SIMULATION 3 OF 3'
    system.run('solver', 'eval_grad', hosts='all', path=PATH.SCRATCH)

    # collect traces
    obs = join(PATH.SOLVER, self.event, 'traces/obs')
    syn = join(PATH.SOLVER, self.event, 'traces/syn')
    adj = join(PATH.SOLVER, self.event, 'traces/adj')

    obs, _ = preprocess.load(obs)
    syn, _ = preprocess.load(syn)
    adj, _ = preprocess.load(adj, suffix='.su.adj')

    # collect model and kernels
    model = solver.load(PATH.MODEL_INIT)
    kernels = solver.load(PATH.SCRATCH + '/' + 'kernels' + '/' + self.event,
                          suffix='_kernel')

    # dot product in data space
    keys = obs.keys()
    LHS = DotProductLHS(keys, syn, adj)

    # dot product in model space
    keys = ['rho', 'vp', 'vs']  # model.keys()
    RHS = DotProductRHS(keys, model, kernels)

    print
    print 'LHS:', LHS
    print 'RHS:', RHS
    print 'RELATIVE DIFFERENCE:', (LHS - RHS) / RHS
    print
def grid_search_train(train, test, subm):
    '''
    Arguments:
        sentences=None, size=100, alpha=0.025, window=5, min_count=5,
        max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001,
        sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=<built-in function hash>,
        iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000,
        compute_loss=False, callbacks=()
    '''
    data = preprocess.load()

    for sg, size, window, min_count, hs, neg, iter, sample in product(
            [0], [300], [5], [1], [0], [5], [25], [0.001]):
        # for sg, size, window, min_count, hs, neg, iter, sample in product(
        #         [1, 0],
        #         [100, 300],
        #         [5, 10],
        #         [1],
        #         [0, 1],
        #         [5, 10],
        #         [5, 25],
        #         [0.1, 0.01, 0.001]):
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
              sg, size, window, min_count, hs, neg, iter, sample)

        model = gensim.models.word2vec.Word2Vec(data['train_tokens'],
                                                sg=sg,
                                                size=size,
                                                window=window,
                                                min_count=min_count,
                                                hs=hs,
                                                negative=neg,
                                                iter=iter,
                                                sample=sample)
        model_name = 'sg{0}-sz{1}-win{2}-minc{3}-hs{4}-neg{5}-iter{6}-samp{7}'.format(
            sg, size, window, min_count, hs, neg, iter, sample)
        model.save('data/w2v-' + model_name + '.model')

        embedding_vectorizer = TfidfEmbeddingVectorizer(
            model.wv, data['vocabulary'], data['idf'])
        train_embedded = embedding_vectorizer.fit(data['train_tokens'], None)
        train_embedded = embedding_vectorizer.transform(data['train_tokens'])
        test_embedded = embedding_vectorizer.transform(data['test_tokens'])

        label_cols = [
            'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
            'identity_hate'
        ]
        preds = np.zeros((len(test), len(label_cols)))
        for i, j in enumerate(label_cols):
            m = LogisticRegression()
            mf = m.fit(train_embedded, train[j])
            preds[:, i] = mf.predict_proba(test_embedded)[:, 1]

        submid = pd.DataFrame({'id': subm["id"]})
        submission = pd.concat(
            [submid, pd.DataFrame(preds, columns=label_cols)], axis=1)
        submission.to_csv(
            'submission/submission-toxicw2v-doctfidf-lr-{}.csv'.format(
                model_name),
            index=False)
def initialize_adjoint_traces(self, path='traces/obs'):
    """ Adjoint traces are initialized by writing zeros for all components.
        Components actually in use during an inversion or migration will be
        overwritten with nonzero values later on.
    """
    _, h = preprocess.load(path)
    zeros = np.zeros((h.nt, h.nr))
    for channel in ['x', 'y', 'z']:
        preprocess.writer(zeros, h, channel=channel, prefix='traces/adj/')
def main(self):
    unix.rm(PATH.SCRATCH)
    unix.mkdir(PATH.SCRATCH)
    preprocess.setup()

    print 'SIMULATION 1 OF 3'
    system.run('solver', 'setup', hosts='all')

    print 'SIMULATION 2 OF 3'
    self.prepare_model()
    system.run('solver', 'eval_func', hosts='all', path=PATH.SCRATCH)

    print 'SIMULATION 3 OF 3'
    system.run('solver', 'eval_grad', hosts='all', path=PATH.SCRATCH)

    # collect traces
    obs = join(PATH.SOLVER, self.event, 'traces/obs')
    syn = join(PATH.SOLVER, self.event, 'traces/syn')
    adj = join(PATH.SOLVER, self.event, 'traces/adj')

    obs, _ = preprocess.load(obs)
    syn, _ = preprocess.load(syn)
    adj, _ = preprocess.load(adj, suffix='.su.adj')

    # collect model and kernels
    model = solver.load(PATH.MODEL_INIT)
    kernels = solver.load(PATH.SCRATCH + '/' + 'kernels' + '/' + self.event,
                          suffix='_kernel')

    # dot product in data space
    keys = obs.keys()
    LHS = DotProductLHS(keys, syn, adj)

    # dot product in model space
    keys = ['rho', 'vp', 'vs']  # model.keys()
    RHS = DotProductRHS(keys, model, kernels)

    print
    print 'LHS:', LHS
    print 'RHS:', RHS
    print 'RELATIVE DIFFERENCE:', (LHS - RHS) / RHS
    print
def initialize_adjoint_traces(self):
    """ Adjoint traces must be initialized by writing zeros for all
        components. This is because when reading traces at the start of an
        adjoint simulation, SPECFEM3D_GLOBE expects that all components exist.
        Components actually in use during an inversion or migration will be
        overwritten with nonzero values later on.
    """
    _, h = preprocess.load('traces/obs')
    zeros = np.zeros((h.nt, h.nr))
    for channel in ['x', 'y', 'z']:
        preprocess.writer(zeros, h, channel=channel, prefix='traces/adj')
def get_data_loader(name, train=True):
    print("use dataset: {}".format(name))
    if name == "MNIST":
        return get_mnist(train)
    elif name == "USPS":
        return get_usps(train)
    elif name == "SVHN":
        return get_svhn(train)
    elif name == "A":
        return load_images('data/office/', 'amazon',
                           batch_size=config.batch_size, is_train=train)
    elif name == "W":
        return load_images('data/office/', 'webcam',
                           batch_size=config.batch_size, is_train=train)
    elif name == "D":
        return load_images('data/office/', 'dslr',
                           batch_size=config.batch_size, is_train=train)
    elif name == "B":
        return load('data/image-clef/b_list.txt',
                    batch_size=config.batch_size, is_train=train)
    elif name == "C":
        return load('data/image-clef/c_list.txt',
                    batch_size=config.batch_size, is_train=train)
    elif name == "I":
        return load('data/image-clef/i_list.txt',
                    batch_size=config.batch_size, is_train=train)
    elif name == "P":
        return load('data/image-clef/p_list.txt',
                    batch_size=config.batch_size, is_train=train)
def main_load_and_plot():
    """
    Loads the dataset and projects it onto the region
    """
    from mpl_toolkits.basemap import Basemap
    from matplotlib import pyplot as plt
    import matplotlib.cm as cm
    import time, datetime

    column_indexes, data = load()
    long_index, lat_index = column_indexes.index(
        'location-long'), column_indexes.index('location-lat')
    tmp_index = column_indexes.index('timestamp')
    lats, longs, i = [], [], 0
    tmpstamps = []
    for w in data:
        lon, lat = w[long_index], w[lat_index]
        tmp = w[tmp_index]
        if lon == '' or lat == '':
            i += 1
            continue
        tmp = time.mktime(
            datetime.datetime.strptime(tmp,
                                       "%Y-%m-%d %H:%M:%S.000").timetuple())
        lats.append(float(lat))
        longs.append(float(lon))
        tmpstamps.append(int(tmp))
    print("Skipped " + str(i) + " data points.")

    # projection='ortho', projection='mill'
    m = Basemap(projection='mill',
                llcrnrlon=-10,
                llcrnrlat=2,
                urcrnrlon=70,
                urcrnrlat=70,
                lon_0=30,
                lat_0=35,
                resolution='l')
    x1, y1 = m(longs, lats)
    m.scatter(x1, y1, s=30, c=tmpstamps, marker="o", cmap=cm.cool, alpha=0.7)
    m.drawmapboundary(fill_color='black')  # fill to edge
    m.drawcountries()
    m.fillcontinents(color='white', lake_color='black', zorder=0)
    plt.colorbar()
    plt.show()
def fix_near_field(self, path=''):
    """
    """
    import preprocess
    preprocess.setup()

    name = solver.check_source_names()[solver.getnode]
    fullpath = path + '/' + name
    g = solver.load(fullpath, suffix='_kernel')
    if not PAR.FIXRADIUS:
        return

    mesh = self.getmesh()
    x, z = self.getxz()

    lx = x.max() - x.min()
    lz = z.max() - z.min()
    nn = x.size
    nx = np.around(np.sqrt(nn * lx / lz))
    nz = np.around(np.sqrt(nn * lz / lx))
    dx = lx / nx
    dz = lz / nz

    sigma = 0.5 * PAR.FIXRADIUS * (dx + dz)
    _, h = preprocess.load(solver.getpath + '/' + 'traces/obs')

    # mask sources
    mask = np.exp(-0.5 * ((x - h.sx[0])**2. + (z - h.sy[0])**2.) / sigma**2.)
    for key in solver.parameters:
        weight = np.sum(mask * g[key][0]) / np.sum(mask)
        g[key][0] *= 1. - mask
        g[key][0] += mask * weight

    # mask receivers
    for ir in range(h.nr):
        mask = np.exp(-0.5 * ((x - h.rx[ir])**2. + (z - h.ry[ir])**2.) /
                      sigma**2.)
        for key in solver.parameters:
            weight = np.sum(mask * g[key][0]) / np.sum(mask)
            g[key][0] *= 1. - mask
            g[key][0] += mask * weight

    solver.save(fullpath, g, suffix='_kernel')
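# Standalone numerical sketch (made-up coordinates, not project code) of the
# Gaussian near-field masking used in fix_near_field above: grid points close
# to a source get mask ~1 and are replaced by a mask-weighted average of the
# kernel, while far-field values are left untouched.
import numpy as np

x = np.linspace(0.0, 1000.0, 101)    # hypothetical grid coordinates
z = np.zeros_like(x)
sx, sz, sigma = 500.0, 0.0, 100.0    # hypothetical source position and radius
mask = np.exp(-0.5 * ((x - sx)**2. + (z - sz)**2.) / sigma**2.)

kernel = np.random.randn(x.size)     # stand-in for g[key][0]
weight = np.sum(mask * kernel) / np.sum(mask)
kernel = kernel * (1. - mask) + mask * weight  # near-field smoothed toward the average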
def fix_near_field(self, path=''):
    """
    """
    import preprocess
    preprocess.setup()

    name = solver.check_source_names()[solver.getnode]
    fullpath = path +'/'+ name
    g = solver.load(fullpath, suffix='_kernel')
    if not PAR.FIXRADIUS:
        return

    mesh = self.getmesh()
    x,z = self.getxz()

    lx = x.max() - x.min()
    lz = z.max() - z.min()
    nn = x.size
    nx = np.around(np.sqrt(nn*lx/lz))
    nz = np.around(np.sqrt(nn*lz/lx))
    dx = lx/nx
    dz = lz/nz

    sigma = 0.5*PAR.FIXRADIUS*(dx+dz)
    _, h = preprocess.load(solver.getpath +'/'+ 'traces/obs')

    # mask sources
    mask = np.exp(-0.5*((x-h.sx[0])**2.+(z-h.sy[0])**2.)/sigma**2.)
    for key in solver.parameters:
        weight = np.sum(mask*g[key][0])/np.sum(mask)
        g[key][0] *= 1.-mask
        g[key][0] += mask*weight

    # mask receivers
    for ir in range(h.nr):
        mask = np.exp(-0.5*((x-h.rx[ir])**2.+(z-h.ry[ir])**2.)/sigma**2.)
        for key in solver.parameters:
            weight = np.sum(mask*g[key][0])/np.sum(mask)
            g[key][0] *= 1.-mask
            g[key][0] += mask*weight

    solver.save(fullpath, g, suffix='_kernel')
def get_paths():
    from preprocess import load
    column_indexes, data = load()
    import time, datetime
    import itertools
    from operator import itemgetter

    long_index, lat_index = column_indexes.index(
        'location-long'), column_indexes.index('location-lat')
    tmp_index = column_indexes.index('timestamp')
    local_tag_index = column_indexes.index('tag-local-identifier')

    groups = itertools.groupby(data, key=lambda x: x[local_tag_index])
    i = 0
    path_by_tag_id = {}
    for k, g in groups:
        path_by_tag_id[k] = []
        # lats, longs = [], []
        # tmpstamps = []
        for w in g:
            lon, lat = w[long_index], w[lat_index]
            tmp = w[tmp_index]
            tmp = time.mktime(
                datetime.datetime.strptime(
                    tmp, "%Y-%m-%d %H:%M:%S.000").timetuple())
            if lon == '' or lat == '':
                i += 1
                continue
            path_by_tag_id[k].append((float(lon), float(lat), int(tmp)))

    # Sorting each path time-wise
    for k in path_by_tag_id.keys():
        path_by_tag_id[k] = sorted(path_by_tag_id[k], key=itemgetter(2))

    return path_by_tag_id
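# Hypothetical usage sketch (not part of the original script): iterate over the
# per-animal tracks returned by get_paths() and report how many time-ordered
# (lon, lat, timestamp) fixes each tag produced.
paths = get_paths()
for tag_id, fixes in paths.items():
    print(tag_id, "->", len(fixes), "fixes")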
def run_training():
    train = preprocess.load(path=config.PATH,
                            filename=config.FILENAME,
                            col_list=[0, 2, 3, 4, 6, 7, 8, 9],
                            dtypes=config.DTYPES)
    targets = train[config.POSSIBLE_LABELS]
    class_weights = custom_loss.calculating_class_weights(targets.values, 6)
    with open(os.path.join(config.PATH, "class_weights.pkl"), "wb") as f:
        pickle.dump(class_weights, f)

    # extract and load pretrained word embeddings
    word2vec = word_embeddings.load_pretr_wv()

    # tokenize and pad sequences
    tokenizer_object = Tokenize_Object(config.PATH)
    data = tokenizer_object.tokenize(train)

    # save tokenizer
    tokenizer_object.save_tokenizer()

    # apply embeddings to layer weights
    embedding_layer = word_embeddings.apply_embeddings(word2vec)

    # build model
    m = model.build_model(embedding_layer)

    # apply class weights via custom loss
    m.compile(loss=custom_loss.get_weighted_loss(class_weights),
              optimizer='adam',
              metrics=['accuracy'])

    X_train, X_valid, y_train, y_valid = preprocess.split_train_valid(
        data, train[config.POSSIBLE_LABELS])

    # fitting the model
    print('Training model...')
    r = m.fit(
        X_train,
        y_train,
        epochs=config.EPOCHS,
        batch_size=config.BATCH_SIZE,
        validation_data=(X_valid, y_valid),
        verbose=2,
        callbacks=[config.callback_checkpoint, config.callback_earlystop])
def conv_net(x, keep_prob):
    """
    Create a convolutional neural network model
    : x: Placeholder tensor that holds image data.
    : keep_prob: Placeholder tensor that holds dropout keep probability.
    : return: Tensor that represents logits
    """
    # TODO: Apply 1, 2, or 3 Convolution and Max Pool layers
    #    Play around with different number of outputs, kernel size and stride
    # Function Definition from Above:
    #    conv2d_maxpool(x_tensor, conv_num_outputs, conv_ksize, conv_strides, pool_ksize, pool_strides)
    layer = nl.conv2d_maxpool(x, 16, (4, 4), (1, 1), (2, 2), (2, 2))
    layer = tf.nn.dropout(layer, keep_prob=keep_prob)

    # TODO: Apply a Flatten Layer
    # Function Definition from Above:
    #   flatten(x_tensor)
    layer = nl.flatten(layer)

    # TODO: Apply 1, 2, or 3 Fully Connected Layers
    #    Play around with different number of outputs
    # Function Definition from Above:
    #   fully_conn(x_tensor, num_outputs)
    layer = nl.fully_conn(layer, 400)
    layer = tf.nn.dropout(layer, keep_prob)

    # TODO: Apply an Output Layer
    #    Set this to the number of classes
    # Function Definition from Above:
    #   output(x_tensor, num_outputs)
    categories = preprocess.load('data/categories.p')
    res = nl.output(layer, len(categories))

    # TODO: return output
    return res
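# Hypothetical TF1-style usage sketch: the 32x32 RGB input shape is an
# assumption, and it presumes the nl and preprocess helpers used by conv_net
# above are importable in the same scope.
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3), name='x')
keep_prob = tf.placeholder(tf.float32, name='keep_prob')
logits = conv_net(x, keep_prob)  # builds the graph defined above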
def main():
    ############## PRE PROCESS DATA (only once) #############################
    data = preprocess.load(filename)
    clean_data = preprocess.clean(data)

    ############## READ CLEANED DATA ########################################
    data = pd.read_csv(filename_clean)
    data = data.drop(columns=["Unnamed: 0"])
    print(data.head())

    ############## EXTRACT FEATURES #########################################
    unobtrusive = data

    ## Create dataset including obtrusive features ##
    # removing all redundant columns / keeping those that we want features for
    cols_to_keep = ["id", "time", "mood", "sun", "rain", "max_temp",
                    "total_appuse", "activity", "circumplex.arousal",
                    "circumplex.valence", "weekdaydummy0", "weekdaydummy1",
                    "weekdaydummy2", "weekdaydummy3", "weekdaydummy4",
                    "weekdaydummy5", "weekdaydummy6"]
    data = data[cols_to_keep]

    # creating lagged variables for the following columns (with defined durations)
    columns_to_lag = [
        "mood", "circumplex.arousal", "circumplex.valence", "total_appuse",
        "max_temp"
    ]
    lags = [4, 3, 3, 3, 3]
    for i, col in enumerate(columns_to_lag):
        data = pivot.create_lagged_vars(data, col, lags=lags[i])

    # many rows are unusable so we drop them
    data = data.dropna()
    data.to_csv("with_features.csv")

    ## Creating unobtrusive-only dataset ##
    # removing all redundant columns / keeping those that we want features for
    un_cols_to_keep = ["id", "time", "mood", "sun", "rain", "max_temp",
                       "total_appuse", "activity", "weekdaydummy0",
                       "weekdaydummy1", "weekdaydummy2", "weekdaydummy3",
                       "weekdaydummy4", "weekdaydummy5", "weekdaydummy6"]
    unobtrusive = unobtrusive[un_cols_to_keep]

    # creating lagged variables for the following columns (with defined durations)
    un_columns_to_lag = ["total_appuse", "max_temp"]
    lags = [4, 3]
    for i, col in enumerate(un_columns_to_lag):
        unobtrusive = pivot.create_lagged_vars(unobtrusive, col, lags=lags[i])

    # many rows are unusable so we drop them
    unobtrusive = unobtrusive.dropna()
    unobtrusive.to_csv("unobtrusive_with_features.csv")

    ## Correlations
    features = pd.read_csv('with_features.csv', index_col=0)
    correlations = calculate_pvalues(features)
    correlations.to_csv('correlations.csv')
    correlations = correlations.astype(float)
    correlations = correlations.drop(['time'], axis=1)
    correlations = correlations.drop(
        ['time', 'mood', 'total_appuse_lag2', 'total_appuse_lag3',
         'max_temp_lag2', 'max_temp_lag3', 'circumplex.arousal_lag2',
         'circumplex.arousal_lag3', 'circumplex.valence_lag2',
         'circumplex.valence_lag3'], axis=0)

    plt.figure()
    sns.heatmap(
        correlations[['mood', 'circumplex.arousal', 'circumplex.valence']],
        vmin=0, vmax=1, center=0.5, linewidth=3)
    plt.show()
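# Illustrative sketch only: pivot.create_lagged_vars is the project's own
# helper, but the general idea of per-subject lagged features can be shown
# with pandas shift(); the "id" and "mood" column names come from the code
# above, while grouping by "id" is an assumption.
import pandas as pd

def lag_column(df, col, lags):
    for k in range(1, lags + 1):
        df[col + "_lag" + str(k)] = df.groupby("id")[col].shift(k)
    return df

# e.g. lag_column(data, "mood", 4) would add mood_lag1 ... mood_lag4 per subject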
import time

from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import configparser
from preprocess import load

if __name__ == '__main__':
    p = [
        'mozilla4.arff', 'waveform-5000.arff', 'diabetes.arff', 'pc5.arff',
        'pc1.arff'
    ]
    cl = ['naive bayes', 'decision tree', 'KNN', 'MLP', 'LinearSVM']
    for fpath in p:
        print(fpath)
        X, y = load(fpath)
        n_classes = np.arange(np.unique(y).size)

        for j in range(2, 3):
            start_time = time.time()
            if j == 0:
                clf = BaggingClassifier(base_estimator=GaussianNB(),
                                        n_estimators=10,
                                        max_samples=0.5,
                                        max_features=0.5)
            elif j == 1:
                clf = BaggingClassifier(
                    base_estimator=DecisionTreeClassifier(random_state=0,
                                                          criterion='gini'),
                    n_estimators=10,
                    max_samples=0.5,
                    max_features=0.5)
            elif j == 2:
def write_sources(self):
    unix.cd(self.getpath)
    _, h = preprocess.load(dir='traces/obs')
    solvertools.write_sources(vars(PAR), h)
print("Bot" + sentence_out) except: print("Error") def chatBegin(){ #存放用于测试聊天用的代码 SOS_token = 2 EOS_token = 3 train_save_path = 'D:/1000.pth' # corpus_paris_path = 'D:/corpus_paris.csv' corpus_paris_path = "D:/clean_chat_corpus/xiaohuangji_processed.tsv" dict_path = "D:/clean_chat_corpus/xiaohuangji_dict.tsv" word2index = [] index2word = [] word2index, index2word = preprocess.load(dict_path) num_words = len(index2word) num_layers = 2 dropout = 0.1 hidden_size = 256 output_size = num_words embedding = torch.nn.Embedding(num_words, hidden_size) learning_rate = 0.0001 decoder_lr_ratio = 5 encoder_lr = learning_rate decoder_lr = learning_rate * decoder_lr_ratio total_gen = 5000 batch_size = 1024
    parser.error('norm must be 1 or 2')
    sys.exit(1)

print(__doc__)
parser.print_help()
print()

################################################################################
# core logic
################################################################################

if __name__ == '__main__':
    # load the data
    # TODO make the data to load a parameter
    data_obj = load('preprocessed_data/prd.pkl')

    # TODO choose whether to load data by year or by animal
    # if by year, select year, else select animal - print out options
    data_by_year = data_obj.get_data_by_year()
    year, indivs = data_by_year[0]
    print('year')
    print(year)
    print('num individuals for the year')
    print(len(indivs))
    print()

    # get the time series of interest from the loaded data
    tss = []
    for indiv_id, pts in indivs:
        tss.append(pts)
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
from preprocess import load
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NeighborhoodComponentsAnalysis

p = ['mozilla4.arff', 'waveform-5000.arff', 'diabetes.arff', 'pc5.arff',
     'pc1.arff']
path = p[0]
X, y = load(path)
n_classes = np.arange(np.unique(y).size)

# clf = DecisionTreeClassifier(random_state=0, criterion='gini')
# clf = GaussianNB()
# clf = KNeighborsClassifier(n_neighbors=3)
# clf = MLPClassifier(hidden_layer_sizes=(100),
#                     activation='relu',
#                     solver='adam',
#                     batch_size=128,
#                     alpha=1e-4,
#                     learning_rate_init=1e-3,
#                     learning_rate='adaptive',
#                     tol=1e-4,
#                     max_iter=200)
        print('Number of clusters: %d' % n_clusts)
        print('_' * 80)

        # create a clusterer
        clusterer = TsClusterer(n_clusts, dist_norm, max_iterations,
                                stopping_threshold)
        avg_err = cv.cross_validate(clusterer,
                                    distance_metric=dist_metric,
                                    window=window)
        errs.append((n_clusts, avg_err, clusterer))
        print('Average error of %f achieved using %d clusters' %
              (avg_err, n_clusts))
        print()

    return errs


if __name__ == '__main__':
    print('file path: %s' % args.source_path)
    print()

    # Load the clusterers
    clusterers = load(args.source_path)

    print_errors(clusterers)
    plot_errors(clusterers)

    if args.best_n_clusts:
        print('Getting the clusterer for %d clusters' % args.best_n_clusts)
        clusterer = get_clusterer_for_best_n_clusts(clusterers,
                                                    args.best_n_clusts)
        if not clusterer:
            print('No clusterer found for %d clusters' % args.best_n_clusts)
            sys.exit(1)
        print('Done')
        print()

        assignments, lowest_err = clusterer.get_best_assignment()
                    help='How big is each word vector')
parser.add_argument('--preprocess',
                    action='store_true',
                    help='Redo preprocessing.')
parser.add_argument('--embed',
                    action='store_true',
                    help='Redo embedding preprocessing.')

# Setup
params = parser.parse_args()
if params.debug:
    print('Running in debug mode.')
seed_everything()

if params.preprocess:
    x_train, x_test, y_train, features, test_features, word_index = preprocess.preprocess(
        params)
else:
    x_train, x_test, y_train, features, test_features, word_index = preprocess.load(
        params)

if params.embed:
    embedding_matrix = embeddings.process(params)
else:
    embedding_matrix = embeddings.load(params)

preds = run(x_train, y_train, features, test_features, x_test,
            embedding_matrix, params)
else:
    # try appending preprocessed_data/
    args.source_path = 'preprocessed_data/' + args.source_path
    if not os.path.exists(args.source_path):
        parser.error('The file %s does not exist' % args.source_path)
        sys.exit(1)

print(__doc__)
parser.print_help()
print()

if __name__ == '__main__':
    print('file path: %s' % args.source_path)
    print()

    data = load(args.source_path)
    data_by_year = data.get_data_by_year()
    print('num years')
    print(len(data_by_year))

    year_0, indivs = data_by_year[0]
    print('year')
    print(year_0)
    print('number of individuals')
    print(len(indivs))

    indiv_0, pts = indivs[0]
    print('individual')
    print(indiv_0)
    print('shape of path')
    print(pts.shape)
from preprocess import load
from sklearn.pipeline import Pipeline

if __name__ == '__main__':
    p = [
        'mozilla4.arff', 'waveform-5000.arff', 'diabetes.arff', 'pc5.arff',
        'pc1.arff'
    ]
    cl = [
        'naive bayes', 'decision tree', 'KNN', 'MLP', 'LinearSVM',
        'improve of bagging knn'
    ]
    for fpath in p:
        X, Y = load(fpath)
        for i in range(5, 6):
            start_time = time.time()
            if i == 0:
                clf = BaggingClassifier(base_estimator=GaussianNB(),
                                        n_estimators=10,
                                        max_samples=0.5,
                                        max_features=0.5)
            elif i == 1:
                clf = BaggingClassifier(
                    base_estimator=DecisionTreeClassifier(random_state=0,
                                                          criterion='gini'),
                    n_estimators=10,
                    max_samples=0.5,
                    max_features=0.5)
            elif i == 2:
                clf = BaggingClassifier(
def fix_near_field(self, path=''):
    """
    """
    import preprocess
    preprocess.setup()

    name = solver.check_source_names()[solver.getnode]
    fullpath = path + '/' + name
    #print 'DB: name=', name
    #print 'DB: fullpath=', fullpath

    g = solver.load(fullpath, suffix='_kernel')
    g_vec = solver.merge(g)
    nproc = solver.mesh.nproc
    #print 'DB: len(g_vec)=', len(g_vec)

    if not PAR.FIXRADIUS:
        return

    x, y, z = self.getcoords()
    #print 'DB: len(g)=', len(g)
    #print 'DB: len(g[vp][0])=', len(g['vp'][0])
    #print 'DB: x.shape=', x.shape
    #print 'DB: len(x)=', len(x)
    ##sys.exit("DB: stop from postporcess-regularize")

    lx = x.max() - x.min()
    ly = y.max() - y.min()
    lz = z.max() - z.min()
    nn = x.size
    nx = np.around(np.sqrt(nn * lx / (lz * ly)))
    ny = np.around(np.sqrt(nn * ly / (lx * lz)))
    nz = np.around(np.sqrt(nn * lz / (lx * ly)))
    dx = lx / nx * 1.25
    dy = ly / ny * 1.25
    dz = lz / nz * 1.25
    #print 'DB: lx=', lx
    #print 'DB: ly=', ly
    #print 'DB: lz=', lz
    #print 'DB: nn=', nn
    #print 'DB: nx=', nx
    #print 'DB: ny=', ny
    #print 'DB: nz=', nz
    #print 'DB: dx=', dx
    #print 'DB: dy=', dy
    #print 'DB: dz=', dz

    sigma = PAR.FIXRADIUS * (dx + dz + dy) / 3.0
    _, h = preprocess.load(solver.getpath + '/' + 'traces/obs')

    # mask sources
    mask = np.exp(-0.5 * ((x - h.sx[0])**2. + (y - h.sy[0])**2. +
                          (z - h.sz[0])**2.) / sigma**2.)

    # mask top
    # for matlab
    # z_sqrt=(abs(z).^(0.25)); depth_scale=1-z_sqrt/max(z_sqrt); figure; plot(depth_scale,z);
    z_factor = np.power(abs(z), 0.5)
    #max_z_factor = np.amax(z_factor)
    #scale_depth = 1.0 - z_factor/max_z_factor
    #print 'DB: max(z_factor)=',max_z_factor
    #print 'DB: max(scale_depth)=',np.amax(scale_depth)
    #print 'DB: min(scale_depth)=',np.amin(scale_depth)
    #mask *= scale_depth
    #mask_depth = solver.split(z)
    mask_depth = solver.split(z_factor)

    mask_d = solver.split(mask)
    ##print 'DB: sigma=',sigma
    ##print 'DB: mask=',mask
    #print 'DB: len(mask)=', len(mask)
    #print 'DB: len(mask_d)=', len(mask_d)
    ##print 'DB: len(g)=', len(g)
    ##print 'DB: len(g)[vp][0]=', len(g['vp'][0])

    for key in solver.parameters:
        for iproc in range(nproc):
            #print 'DB: key, iproc=', key, iproc
            #print 'DB: len(g[key][iproc])=', len(g[key][iproc])
            #print 'DB: len(mask_d[key][iproc])=', len(mask_d[key][iproc])
            weight = np.sum(mask_d['vp'][iproc] * g[key][iproc]) / np.sum(
                mask_d['vp'][iproc])
            #print 'DB: key, iproc, weight= ', key, iproc, weight
            g[key][iproc] *= 1. - mask_d['vp'][iproc]
            g[key][iproc] *= mask_depth['vp'][iproc]
            #g[key][iproc] += mask_d['vp'][iproc]*weight
            #weight = np.sum(mask_d['vp'][iproc]*g[key][iproc])/np.sum(mask_d['vp'][iproc])
            ##print 'DB: key, iproc, weight= ', key, iproc, weight
            #g[key][iproc] *= 1.-mask_d['vp'][iproc]
            #g[key][iproc] += mask_d['vp'][iproc]*weight

    # mask receivers
    #for ir in range(h.nr):
    #    mask = np.exp(-0.5*((x-h.rx[ir])**2.+(z-h.ry[ir])**2.)/sigma**2.)
    #    for key in solver.parameters:
    #        weight = np.sum(mask*g[key][0])/np.sum(mask)
    #        g[key][0] *= 1.-mask
    #        g[key][0] += mask*weight

    solver.save(fullpath, g, suffix='_kernel')
parser.add_argument('--lang', type=str, default='pt', help='pt or es')
parser.add_argument('--loss_fn',
                    default=torch.nn.BCEWithLogitsLoss(reduction='sum'),
                    help='How big is each word vector')
parser.add_argument('--preprocess',
                    action='store_true',
                    help='Redo preprocessing.')
parser.add_argument('--embed',
                    action='store_true',
                    help='Redo embedding preprocessing.')

# Setup
params = parser.parse_args()
if params.debug:
    print('Running in debug mode.')

if params.preprocess:
    x_train, y_train, x_test = preprocess.preprocess(params)
else:
    x_train, y_train, x_test = preprocess.load(params)

print('Loaded:')
print(x_train.describe())

"""if params.embed:
    embedding_matrix = embeddings.process(params)
else:
    embedding_matrix = embeddings.load(params)"""

preds = run(x_train, y_train, x_test, params)
class test_preprocess(object):
    """ Preprocess integration test

        Not yet implemented. The following is just a sketch. None of the
        methods work yet.
    """

    def check(self):
        """ Checks parameters and paths
        """
        #raise NotImplementedError

        # mute settings
        if 'MUTE' not in PAR:
            setattr(PAR, 'MUTE', False)

        if 'MUTESLOPE' not in PAR:
            setattr(PAR, 'MUTESLOPE', 0.)

        if 'MUTECONST' not in PAR:
            setattr(PAR, 'MUTECONST', 0.)

        # filter settings
        if 'BANDPASS' not in PAR:
            setattr(PAR, 'BANDPASS', False)

        if 'FREQLO' not in PAR:
            setattr(PAR, 'FREQLO', 0.)

        if 'FREQHI' not in PAR:
            setattr(PAR, 'FREQHI', 0.)

        # check paths
        if 'OBSERVATIONS' not in PATH:
            raise Exception

        if 'SYNTHETICS' not in PATH:
            raise Exception

        if 'OUTPUT' not in PATH:
            raise Exception

    def main(self):
        """ Tests data processing methods
        """
        try:
            preprocess.setup()
        except:
            print 'SETUP failed'
        else:
            print 'SETUP succeeded'

        try:
            d, h = preprocess.load(prefix=PATH.OBSERVATIONS)
            s, h = preprocess.load(prefix=PATH.SYNTHETICS)
        except:
            print 'LOAD failed'
        else:
            print 'LOAD succeeded'

        try:
            d = preprocess.multichannel(preprocess.process_traces, [d], [h])
            s = preprocess.multichannel(preprocess.process_traces, [s], [h])
        except:
            print 'PROCESS_TRACES failed'
        else:
            print 'PROCESS_TRACES succeeded'

        try:
            preprocess.save(d, h, prefix=PATH.OBSERVATIONS_PRE)
            preprocess.save(s, h, prefix=PATH.SYNTHETICS_PRE)
        except:
            print 'OUTPUT_TRACES failed'
        else:
            print 'OUTPUT_TRACES succeeded'
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS(sys.argv)
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = load(FLAGS.positive_data_file, FLAGS.negative_data_file)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)