def main(args): word_embeddings = p.load(open(args.word_embeddings, 'rb')) word_embeddings = np.array(word_embeddings) word2index = p.load(open(args.vocab, 'rb')) data = read_data(args.contexts, args.questions, args.ids, args.labels, args.max_post_len, args.max_ques_len) N = len(data) train_data = data[:int(0.8 * N)] test_data = data[int(0.8 * N):] print('No. of train_data %d' % len(train_data)) print('No. of test_data %d' % len(test_data)) ids_seqs, post_seqs, post_lens, ques_seqs, ques_lens, lab_seqs = \ preprocess_data(train_data, word2index, args.max_post_len, args.max_ques_len) q_train_data = ids_seqs, post_seqs, post_lens, ques_seqs, ques_lens, lab_seqs ids_seqs, post_seqs, post_lens, ques_seqs, ques_lens, lab_seqs = \ preprocess_data(test_data, word2index, args.max_post_len, args.max_ques_len) q_test_data = ids_seqs, post_seqs, post_lens, ques_seqs, ques_lens, lab_seqs run_classifier(q_train_data, q_test_data, word_embeddings, args, n_layers=2)
def eval(): test_dir = './eval/' checkpoint_dir = './logs' y = tf.placeholder(tf.float32, [BATCH_SIZE, 10], name='y') z = tf.placeholder(tf.float32, [None, 100], name='z') G = generator(z, y) data_x, data_y = read_data() sample_z = np.random.uniform(-1, 1, size=(BATCH_SIZE, 100)) sample_labels = data_y[120:184] print("Reading checkpoints...") ckpt = tf.train.get_checkpoint_state(checkpoint_dir) #saver saver = tf.train.Saver(tf.all_variables()) os.environ['CUDA_VISIBLE_DEVICES'] = str(0) config = tf.ConfigProto() config.gpu_options.per_process_gpu_memory_fraction = 0.2 sess = tf.InteractiveSession(config=config) if ckpt and ckpt.model_checkpoint_path: ckpt_name = os.path.basename(ckpt.model_checkpoint_path) saver.restore(sess, os.path.join(checkpoint_dir, ckpt_name)) test_sess = sess.run(G, feed_dict={z: sample_z, y: sample_labels}) save_image(test_sess, [8, 8], test_dir + 'test_%d.png' % 500) sess.close()
def review_list_view(request): business_url = '' if request.method == "GET": business_url = request.GET['url'] [review, user, business] = read_data() print(type(review)) print(type(user)) print(type(business)) return JsonResponse({ 'review': list(review), 'user': list(user), 'business': list(business), }) context = { 'business': { 'url': business_url, }, 'result': [], 'unrecommend': [] } # reviews if status == 'finished': return render(request, "review_list.html", context) return render(request, "review_list.html", context)
def main(): # 1.import data dataset = 'WBCD.csv' Data = read_data(dataset) # 2.Fill with median Data = fillMed(Data) # analyze by describing data #print(Data.columns.values) # preview the data #print(Data.head()) #print(Data.tail()) #print(Data.describe()) #print(Data.describe(include=['O'])) for i in range(1, (Data.shape[1] - 1)): #print(Data[[names[i], names[-1]]].groupby([names[i]], \ # as_index=False).mean().sort_values(by=names[-1], ascending=False)) g = sns.FacetGrid(Data, col=names[-1]) g.map(plt.hist, names[i], bins=20) if SAVE == True: plt.savefig(directory + '/' + names[i] + '_' + names[-1] + '.png') plt.close() fig = plt.figure() ax = fig.add_subplot(111) cax = ax.matshow(Data.corr(), vmin=-1, vmax=1) fig.colorbar(cax) ticks = np.arange(0, Data.shape[1], 1) ax.set_xticks(ticks) ax.set_yticks(ticks) ax.set_xticklabels(names, fontsize=6) ax.set_yticklabels(names) if SAVE == True: plt.savefig(directory + '/' + 'CorrMat.png') plt.close() corrArray = Data.corr().values[-1, :] corrFrame = pd.DataFrame(data=corrArray, index=names, columns=['Correlation']) corrFrame_sorted = corrFrame.sort_values(['Correlation'], ascending=False) #print(corrFrame_sorted) indexBar = range(len(names) - 2) plt.bar(indexBar, (corrFrame_sorted.values)[1:-1], align='center') plt.xlabel('Features', fontsize=16) plt.ylabel('Correlation of tumor being malignant', fontsize=16) plt.xticks(indexBar, (corrFrame_sorted.index.values)[1:-1], fontsize=6, rotation=0) plt.ylim((0, 1)) if SAVE == True: plt.savefig(directory + '/' + 'CorrBar.png') plt.close()
def hoeffding(erisk,delta): # Get the whole dataset to find m (xvals,yvals) = read_data(1) m = xvals.shape[0] # Calculate Hoeffding's confidence interval from the empirical risk of the algorithm lower = erisk - sqrt(log(2/delta)/(2*m)) upper = erisk + sqrt(log(2/delta)/(2*m)) return (lower,upper)
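# Minimal usage sketch (added for illustration; the 0.12 risk value is hypothetical):
# hoeffding() returns the two-sided bound erisk +/- sqrt(log(2/delta) / (2*m)),
# where m is the number of examples returned by read_data(1).
ci_lower, ci_upper = hoeffding(0.12, 0.05)
print("95%% Hoeffding interval: [%.3f, %.3f]" % (ci_lower, ci_upper))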
def logit(train_x, train_y, test_x, test_y): print("\nLogistic Regression Outputs") print("========================================================") # Create classifier for logistic regression clf = LogisticRegression(class_weight={-1: 1.5, 1: 1}) # Fitting train data to train the model model = clf.fit(train_x, train_y) # Use test data to test the accuracy of our model score = model.predict(test_x) testerror = 1 - model.score(test_x, test_y) trainerror = 1 - model.score(train_x, train_y) print('Training Error: %2.3f' % trainerror) print('Testing Error: %2.3f' % testerror) # Create a table to visualize our accuracy and scores print("\nScore matrix: ") print(confusion_matrix(test_y, score)) # Create report to show precision recall f-score and support print("Classification Report:") print(classification_report(test_y, score)) # Try k-fold cross validation on 10 folds (xvals, yvals) = read_data(1) score_accuracy = cross_validate(clf, xvals, yvals, cv=10, scoring='accuracy', return_train_score=True) #score_auc = cross_val_score(clf,xvals,yvals,cv=10,scoring='roc_auc') print('K-fold cross validation results:') print("Average train accuracy is %2.3f" % score_accuracy['train_score'].mean()) print("Average test accuracy is %2.3f \n " % score_accuracy['test_score'].mean()) # Hoeffding's CI_L, CI_U = hoeffding(testerror, 0.05) print("Hoeffding's Confidence interval for Logistic Regression is:") print(CI_L, CI_U) # For k-folds kerisk = 1 - score_accuracy['test_score'].mean() kCI_L, kCI_U = hoeffding(kerisk, 0.05) print("\nHoeffding's Confidence interval for LR after k-folds is:") print(kCI_L, kCI_U) return (CI_L, CI_U)
def test_model(sess, model, label): if label == 't': user_ratings = sess.run(model.all_ratings_t) DIR = '../dataset/' + DATASET_T + '/' else: user_ratings = sess.run(model.all_ratings_s) DIR = '../dataset/' + DATASET_S + '/' train_path = DIR + 'train_data.json' test_path = DIR + 'test_data.json' validation_path = DIR + 'validation_data.json' ## load data train_data = read_data(train_path, 1)[0] test_vali_path = validation_path if operator.eq( TEST_VALIDATION, 'Validation') == 1 else test_path test_data = read_data(test_vali_path, 1)[0] result = [] for u in range(len(user_ratings)): if len(test_data[u]) > 0: result.append( test_one_user([user_ratings[u], train_data[u], test_data[u]])) result = np.array(result) F1, NDCG = np.mean(np.array(result), axis=0) return F1, NDCG
def train(path, path_out, lr, epochs): """ Create and train a bidirectional LSTM RNN. :param path: path to files :param lr: learning rate :param epochs: number of epochs """ # create dictionaries w2i, i2w, t2i, i2t, l2i, i2l = create_dictionaries(path) # create model model = LSTM_RNN(w2i, i2w, t2i, i2t, l2i, i2l) # make data ready for usage train_input, golden_scores, golden_labels, data_dim = read_data(path, w2i, i2w, t2i, i2t, l2i, i2l) # define loss function: cross entropy loss cross_entropy_loss = nn.CrossEntropyLoss() # define optimize method: stochastic gradient descent optimizer = torch.optim.SGD(model.parameters(), lr=lr) for epoch in range(epochs): # sentence and target are matrix for j in range(data_dim): # zero gradient buffers optimizer.zero_grad() # clear hidden model.hidden = model.init_hidden() # find output of network scores, labels = model(train_input[j]) # calculate the loss score_loss = cross_entropy_loss(torch.transpose(scores, 0, 1), golden_scores[j]) label_loss = cross_entropy_loss(labels, golden_labels[j]) total_loss = score_loss + label_loss # backpropagate the error total_loss.backward() # update the weights optimizer.step() torch.save(model, path_out) return model
def __init__(self): self.input_choses = { "1": self.plot_band, "2": self.plot_dos, "3": self.run, } self.framework_choises = { "1": self.vasp, "2": self.qespresso, "0": self.quit, "4": self.input_data, "5": self.automatic } self.test_data = read_data()
def prepare_data(post_data_tsv, qa_data_tsv, train_ids_file, test_ids_file, sim_ques_filename): p_input_data, q_data, train_triples, test_triples = read_data(post_data_tsv, qa_data_tsv, train_ids_file, test_ids_file, sim_ques_filename) triples = train_triples + test_triples print("Indexing words...") for triple in triples: p_input_data.index_words(triple[0]) for q in triple[1]: q_data.index_words(q) q_data.index_words(triple[2]) print('Indexed %d words in post input, %d words in ques' % (p_input_data.n_words, q_data.n_words)) p_input_data.trim_using_tfidf() q_data.trim(MIN_COUNT) return p_input_data, q_data, train_triples, test_triples
def main(args): print('Enter main') word_embeddings = p.load(open(args.word_embeddings, 'rb')) print(('Loaded emb of size %d' % len(word_embeddings))) word_embeddings = np.array(word_embeddings) word2index = p.load(open(args.vocab, 'rb')) index2word = reverse_dict(word2index) index2kwd, kwd2index, index2cnt = read_kwd_vocab(args.kwd_vocab) test_data = read_data(args.test_context, args.test_question, args.test_ids, args.max_post_len, args.max_ques_len, mode='test') print('No. of test_data %d' % len(test_data)) if args.eval_kwd: run_eval_kwd(test_data, word_embeddings, word2index, index2word, kwd2index, index2kwd, args) else: run_model(test_data, word_embeddings, word2index, index2word, kwd2index, index2kwd, args)
def eval(): # directory for storing the generated test images test_dir = '/home/jackie/workspace/gan/dcgan/eval/' # load the model checkpoint from here checkpoint_dir = '/home/jackie/workspace/gan/dcgan/logs/' y = tf.placeholder(tf.float32, [BATCH_SIZE, 10], name='y') z = tf.placeholder(tf.float32, [None, 100], name='z') G = generator(z, y) data_x, data_y = read_data() sample_z = np.random.uniform(-1, 1, size=(BATCH_SIZE, 100)) sample_labels = data_y[120: 184] # reading the checkpoint requires a sess and a saver print("Reading checkpoints...") ckpt = tf.train.get_checkpoint_state(checkpoint_dir) # saver saver = tf.train.Saver(tf.all_variables()) # sess os.environ['CUDA_VISIBLE_DEVICES'] = str(0) config = tf.ConfigProto() config.gpu_options.per_process_gpu_memory_fraction = 0.2 sess = tf.InteractiveSession(config=config) # restore variables from the saved model if ckpt and ckpt.model_checkpoint_path: ckpt_name = os.path.basename(ckpt.model_checkpoint_path) saver.restore(sess, os.path.join(checkpoint_dir, ckpt_name)) # run the generator with the restored variables test_sess = sess.run(G, feed_dict = {z: sample_z, y: sample_labels}) # save the generated test images to the target folder save_images(test_sess, [8, 8], test_dir + 'test_%d.png' % 500) sess.close()
def build_alf_model(filename, tag='', pool_type='multiprocessing'): """ - based on alf.f90 - `https://github.com/cconroy20/alf/blob/master/src/alf.f90` Master program to fit the absorption line spectrum, or indices, # of a quiescent (>1 Gyr) stellar population # Some important points to keep in mind: # 1. The prior bounds on the parameters are specified in set_pinit_priors. # Always make sure that the output parameters are not hitting a prior. # 2. Make sure that the chain is converged in all relevant parameters # by plotting the chain trace (parameter vs. chain step). # 3. Do not use this code blindly. Fitting spectra is a # subtle art and the code can easily fool you if you don't know # what you're doing. Make sure you understand *why* the code is # settling on a particular parameter value. # 4. Wavelength-dependent instrumental broadening is included but # will not be accurate in the limit of modest-large redshift b/c # this is implemented in the model restframe at code setup time # 5. The code can fit for the atmospheric transmission function but # this will only work if the input data are in the original # observed frame; i.e., not de-redshifted. # 6. I've found that Nwalkers=1024 and Nburn=~10,000 seems to # generically yield well-converged solutions, but you should test # this yourself by fitting mock data generated with write_a_model # To Do: let the Fe-peak elements track Fe in simple mode """ ALFPY_HOME = os.environ['ALFPY_HOME'] for ifolder in [ 'alfvar_models', 'results_emcee', 'results_dynesty', 'subjobs' ]: if os.path.exists(ALFPY_HOME + ifolder) is not True: os.makedirs(ALFPY_HOME + ifolder) pickle_model_name = '{0}alfvar_models/alfvar_model_{1}_{2}.p'.format( ALFPY_HOME, filename, tag) print('We will create one and pickle dump it to \n' + pickle_model_name) alfvar = ALFVAR() global use_keys use_keys = [k for k, (v1, v2) in tofit_params.items() if v1 == True] #---------------------------------------------------------------! #---------------------------Setup-------------------------------! #---------------------------------------------------------------! # ---- flag specifying if fitting indices or spectra alfvar.fit_indices = 0 #flag specifying if fitting indices or spectra # ---- flag determining the level of complexity # ---- 0=full, 1=simple, 2=super-simple. See sfvars for details alfvar.fit_type = 0 # do not change; use use_keys to specify parameters # ---- fit h3 and h4 parameters alfvar.fit_hermite = 0 # ---- type of IMF to fit # ---- 0=1PL, 1=2PL, 2=1PL+cutoff, 3=2PL+cutoff, 4=non-parametric IMF alfvar.imf_type = 3 # ---- are the data in the original observed frame? alfvar.observed_frame = 0 alfvar.mwimf = 0 #force a MW (Kroupa) IMF if alfvar.mwimf: alfvar.imf_type = 1 # ---- fit two-age SFH or not? (only considered if fit_type=0) alfvar.fit_two_ages = 1 # ---- IMF slope within the non-parametric IMF bins # ---- 0 = flat, 1 = Kroupa, 2 = Salpeter alfvar.nonpimf_alpha = 2 # ---- turn on/off the use of an external tabulated M/L prior alfvar.extmlpr = 0 # ---- set initial params, step sizes, and prior ranges _, prlo, prhi = set_pinit_priors(alfvar.imf_type) # ---- change the prior limits to kill off these parameters prhi.logm7g = -5.0 prhi.teff = 2.0 prlo.teff = -2.0 # ---- mass of the young component should always be sub-dominant prhi.logfy = -0.5 # ---------------------------------------------------------------! # --------------Do not change things below this line-------------! # ---------------unless you know what you are doing--------------! 
# ---------------------------------------------------------------! # ---- regularize non-parametric IMF (always do this) alfvar.nonpimf_regularize = 1 # ---- dont fit transmission function in cases where the input # ---- spectrum has already been de-redshifted to ~0.0 if alfvar.observed_frame == 0 or alfvar.fit_indices == 1: alfvar.fit_trans = 0 prhi.logtrans = -5.0 prhi.logsky = -5.0 else: alfvar.fit_trans = 1 # ---- extra smoothing to the transmission spectrum. # ---- if the input data has been smoothed by a gaussian # ---- in velocity space, set the parameter below to that extra smoothing alfvar.smooth_trans = 0.0 if (alfvar.ssp_type == 'cvd'): # ---- always limit the [Z/H] range for CvD since # ---- these models are actually only at Zsol prhi.zh = 0.01 prlo.zh = -0.01 if (alfvar.imf_type > 1): print('ALF ERROR, ssp_type=cvd but imf>1') if alfvar.fit_type in [1, 2]: alfvar.mwimf = 1 #---------------------------------------------------------------! if filename is None: print('ALF ERROR: You need to specify an input file') teminput = input("Name of the input file: ") if len(teminput.split(' ')) == 1: filename = teminput elif len(teminput.split(' ')) > 1: filename = teminput[0] tag = teminput[1] # ---- write some important variables to screen print(" ************************************") if alfvar.fit_indices == 1: print(" ***********Index Fitter*************") else: print(" **********Spectral Fitter***********") print(" ************************************") print(" ssp_type =", alfvar.ssp_type) print(" fit_type =", alfvar.fit_type) print(" imf_type =", alfvar.imf_type) print(" fit_hermite =", alfvar.fit_hermite) print("fit_two_ages =", alfvar.fit_two_ages) if alfvar.imf_type == 4: print(" nonpimf =", alfvar.nonpimf_alpha) print(" obs_frame =", alfvar.observed_frame) print(" mwimf =", alfvar.mwimf) print(" age-dep Rf =", alfvar.use_age_dep_resp_fcns) print(" Z-dep Rf =", alfvar.use_z_dep_resp_fcns) #print(" Ncores = ", ntasks) print(" filename = ", filename, ' ', tag) print(" ************************************") #print('\n\nStart Time ',datetime.now()) #---------------------------------------------------------------! # ---- read in the data and wavelength boundaries alfvar.filename = filename alfvar.tag = tag if alfvar.fit_indices == 0: alfvar = read_data(alfvar) # ---- read in the SSPs and bandpass filters # ------- setting up model arry with given imf_type ---- # if pool_type == 'multiprocessing': from multiprocessing import Pool as to_use_pool else: from schwimmbad import MPIPool as to_use_pool pool = to_use_pool() if pool_type == 'mpi': print('pool size', pool.size) if not pool.is_master(): pool.wait() sys.exit(0) print('\nsetting up model arry with given imf_type and input data\n') tstart = time.time() alfvar = setup(alfvar, onlybasic=False, pool=pool) ndur = time.time() - tstart print('\n Total time for setup {:.2f}min'.format(ndur / 60)) ## ---- This part requires alfvar.sspgrid.lam ---- ## lam = np.copy(alfvar.sspgrid.lam) # ---- interpolate the sky emission model onto the observed wavelength grid # ---- moved to read_data if alfvar.observed_frame == 1: alfvar.data.sky = linterp(alfvar.lsky, alfvar.fsky, alfvar.data.lam) alfvar.data.sky[alfvar.data.sky < 0] = 0. else: alfvar.data.sky[:] = tiny_number alfvar.data.sky[:] = tiny_number # ?? why? 
# ---- we only compute things up to 500A beyond the input fit region alfvar.nl_fit = min(max(locate(lam, alfvar.l2[-1] + 500.0), 0), alfvar.nl - 1) ## ---- define the log wavelength grid used in velbroad.f90 alfvar.dlstep = (np.log(alfvar.sspgrid.lam[alfvar.nl_fit]) - np.log(alfvar.sspgrid.lam[0])) / (alfvar.nl_fit + 1) for i in range(alfvar.nl_fit): alfvar.lnlam[i] = i * alfvar.dlstep + np.log(alfvar.sspgrid.lam[0]) # ---- convert the structures into their equivalent arrays prloarr = str2arr(switch=1, instr=prlo) prhiarr = str2arr(switch=1, instr=prhi) # ---- this is the master process # ---- estimate velz ---- # print(" Fitting ", alfvar.nlint, " wavelength intervals") nlint = alfvar.nlint l1, l2 = alfvar.l1, alfvar.l2 print('wavelength boundaries: ', l1, l2) if l2[-1] > np.nanmax(lam) or l1[0] < np.nanmin(lam): print('ERROR: wavelength boundaries exceed model wavelength grid') print(l2[nlint - 1], lam[alfvar.nl - 1], l1[0], lam[0]) global global_alfvar, global_prloarr, global_prhiarr global_alfvar = copy.deepcopy(alfvar) global_prloarr = copy.deepcopy(prloarr) global_prhiarr = copy.deepcopy(prhiarr) # -------- optimize the first four parameters -------- # len_optimize = 4 prior_bounds = list( zip(global_prloarr[:len_optimize], global_prhiarr[:len_optimize])) if not alfvar.observed_frame: prior_bounds[0] = (-200, 200) optimize_res = differential_evolution(func_2min, bounds=prior_bounds, disp=True, polish=False, updating='deferred', workers=1) print('optimized parameters', optimize_res) # -------- getting priors for the sampler -------- # global global_all_prior # ---- note it's for all parameters all_key_list = list(tofit_params.keys()) # ---------------- update priors ----------------- # prrange = [10, 10, 0.1, 0.1] global_all_prior = [ClippedNormal(np.array(optimize_res.x)[i], prrange[i], global_prloarr[i], global_prhiarr[i]) for i in range(len_optimize)] + \ [TopHat(global_prloarr[i+len_optimize], global_prhiarr[i+len_optimize]) for i in range(len(all_key_list)-len_optimize)] pickle.dump([alfvar, prloarr, prhiarr, global_all_prior, optimize_res.x], open(pickle_model_name, "wb")) pool.close()
def train_model(para, path_excel): [_,MODEL,LR,LAMDA,LAYER,EMB_DIM,FREQUENCY_USER, FREQUENCY_ITEM, BATCH_SIZE, SAMPLE_RATE,IF_PRETRAIN,N_EPOCH,_,TOP_K,OPTIMIZATION] = para ## Paths of data train_path = DIR+'train_data.json' transformation_bases_path = DIR+'hypergraph_embeddings.json' # transformation bases for graph convolution pre_train_feature_path = DIR+'pre_train_feature'+str(EMB_DIM)+'.json' # to pretrain latent factors for user-item interaction ## Load data # load training data [train_data, train_data_interaction, user_num, item_num] = read_data(train_path) # load pre-trained embeddings for all deep models try: pre_train_feature = read_bases(pre_train_feature_path, EMB_DIM, EMB_DIM) except: print('There is no pre-trained feature found!!') pre_train_feature = [0, 0] IF_PRETRAIN = 0 # load pre-trained transform bases for LCFN if MODEL == 'LCFN': transformation_bases = read_bases(transformation_bases_path, FREQUENCY_USER, FREQUENCY_ITEM) ## Define the model if MODEL == 'BPR': model = model_BPR(n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, optimization=OPTIMIZATION) if MODEL == 'NCF': model = model_NCF(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, optimization=OPTIMIZATION, pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN) if MODEL == 'GCMC': model = model_GCMC(layer=LAYER, graph=train_data_interaction, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, optimization=OPTIMIZATION, pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN) if MODEL == 'NGCF': model = model_NGCF(layer=LAYER, graph=train_data_interaction, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, optimization=OPTIMIZATION, pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN) if MODEL == 'SCF': model = model_SCF(layer=LAYER, graph=train_data_interaction, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, optimization=OPTIMIZATION, pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN) if MODEL == 'CGMC': model = model_CGMC(layer=LAYER, graph=train_data_interaction, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, optimization=OPTIMIZATION, pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN) if MODEL == 'LCFN': model = model_LCFN(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, graph_embeddings=transformation_bases, lr=LR, lamda=LAMDA, optimization=OPTIMIZATION, pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN) config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) ## Split the training samples into batches batches = list(range(0, len(train_data_interaction), BATCH_SIZE)) batches.append(len(train_data_interaction)) ## Training iteratively F1_max = 0 F1_df = pd.DataFrame(columns=TOP_K) NDCG_df = pd.DataFrame(columns=TOP_K) for epoch in range(N_EPOCH): t1 = time.clock() for batch_num in range(len(batches)-1): train_batch_data = [] for sample in range(batches[batch_num], batches[batch_num+1]): (user, pos_item) = train_data_interaction[sample] sample_num = 0 while sample_num < SAMPLE_RATE: neg_item = int(random.uniform(0, item_num)) if not (neg_item in train_data[user]): sample_num += 1 train_batch_data.append([user, pos_item, neg_item]) train_batch_data = np.array(train_batch_data) _, loss = sess.run([model.updates, model.loss], feed_dict={model.users: train_batch_data[:,0], 
model.pos_items: train_batch_data[:,1], model.neg_items: train_batch_data[:,2]}) # test the model each epoch F1, NDCG = test_model(sess, model) t2 = time.clock() F1_max = max(F1_max, F1[0]) # print performance print_value([epoch + 1, loss, F1_max, F1, NDCG]) # save performance F1_df.loc[epoch + 1] = F1 NDCG_df.loc[epoch + 1] = NDCG save_value([[F1_df, 'F1'], [NDCG_df, 'NDCG']], path_excel, first_sheet=False) if not loss < 10**10: break del model, loss, _, sess gc.collect()
from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.metrics.functional import f1 from pytorch_lightning.metrics.functional.classification import accuracy, multiclass_roc, auc, confusion_matrix from transformers import AutoModel, AutoTokenizer, BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel from transformers import AdamW, get_linear_schedule_with_warmup import matplotlib.pyplot as plt import seaborn as sn from read_data import * _, _, _, slope_df = read_data('boundary', dir_path='data/', existing_company_only=False, sample=None) _, _, _, test_slope_df = read_test_data('boundary', dir_path='data/', existing_company_only=False, sample=None) slope_df = pd.concat([slope_df, test_slope_df]) company_embedding_df = pd.read_csv('data/company_embedding_centered.csv') class BertPooler(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.activation = nn.Tanh()
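    # Hedged completion (the forward pass is not shown in the excerpt above): the
    # standard BERT pooler takes the hidden state of the first ([CLS]) token and
    # applies the dense layer and tanh activation defined in __init__.
    def forward(self, hidden_states):
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output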
def evaluate_model(model, tokenizer, trainer, existing_company_only, dir_path='data/', batch_size=16, num_workers=4): (train_ids, X_train, y_train), (val_ids, X_val, y_val), (test_ids, X_test, y_test), _ = read_data( 'correct', dir_path=dir_path, existing_company_only=existing_company_only, sample=None) correct_test_dataloader = tokenize_and_dataloader(X_test, y_test, tokenizer, test_ids, batch_size, num_workers, random=False) (train_ids, X_train, y_train), (val_ids, X_val, y_val), (test_ids, X_test, y_test), _ = read_data( 'inverse', dir_path=dir_path, existing_company_only=existing_company_only, sample=None) inverse_test_dataloader = tokenize_and_dataloader(X_test, y_test, tokenizer, test_ids, batch_size, num_workers, random=False) (train_ids, X_train, y_train), (val_ids, X_val, y_val), (test_ids, X_test, y_test), _ = read_data( 'boundary', dir_path=dir_path, existing_company_only=existing_company_only, sample=None) boundary_test_dataloader = tokenize_and_dataloader(X_test, y_test, tokenizer, test_ids, batch_size, num_workers, random=False) model.incorrect_type = 'correct' trainer.test(model, correct_test_dataloader) model.incorrect_type = 'inverse' trainer.test(model, inverse_test_dataloader) model.incorrect_type = 'boundary' trainer.test(model, boundary_test_dataloader)
mean_y=mean_y_irr, std_y=std_y_irr) file.write(f"Validation Set Error Irr-from-3: {error} \n") error = test_model_from_path(Xs, y_irr, "models/test3/Irr-from-3/", reg=1, mean_y=mean_y_irr, std_y=std_y_irr, Time=list(range(N))) file.write(f"Total Set Error Irr-from-3: {error} \n") if __name__ == "__main__": G1_AS_good, G1_AS_positive_offset, G1_AS_negative_offset, G1_DES_good, G1_DES_positive_offset, G1_DES_negative_offset, G2_good, G2_positive_offset, G2_negative_offset = read_data( ) data_list = [ G1_AS_good[list(G1_AS_good.keys())[-1]], G1_AS_positive_offset[list(G1_AS_positive_offset.keys())[-1]], G1_AS_negative_offset[list(G1_AS_negative_offset.keys())[-1]], G1_DES_good[list(G1_DES_good.keys())[-1]], G1_DES_positive_offset[list(G1_DES_positive_offset.keys())[-1]], G1_DES_negative_offset[list(G1_DES_negative_offset.keys())[-1]], G2_good[list(G2_good.keys())[-1]], G2_positive_offset[list(G2_positive_offset.keys())[-1]], G2_negative_offset[list(G2_negative_offset.keys())[-1]] ] #study_plot(*merge_data(*data_list))
def train_model(para, path_excel): [ DATASET_T, DATASET_S, MODEL, LR_REC, LR_DOM_pos, LR_DOM_neg, LAMDA, LR_REC_s, LAMDA_s, LAYER, EMB_DIM, BATCH_SIZE, SAMPLE_RATE, N_EPOCH, _, TOP_K, OPTIMIZATION, IF_PRETRAIN ] = para ## paths of data train_path_t = '../dataset/' + DATASET_T + '/train_data.json' train_path_s = '../dataset/' + DATASET_S + '/train_data.json' pretrain_path_t = '../dataset/' + DATASET_T + '/latent_embeddings.json' pretrain_path_s = '../dataset/' + DATASET_S + '/latent_embeddings.json' review_path_t = '../dataset/' + DATASET_T + '/review_embeddings.json' review_path_s = '../dataset/' + DATASET_S + '/review_embeddings.json' ## load train data [train_data_t, train_data_interaction_t, user_num_t, item_num_t] = read_data(train_path_t, BATCH_SIZE) [train_data_s, train_data_interaction_s, user_num_s, item_num_s] = read_data(train_path_s, BATCH_SIZE) pretrain_s = read_bases(pretrain_path_s) review_s = read_bases(review_path_s) try: pretrain_t = read_bases(pretrain_path_t) except: print('\n There is no pre-trained feature found !! \n') pretrain_t = [0, 0] IF_PRETRAIN = 0 review_t = read_bases(review_path_t) ## define the model model = model_TDAR(layer=LAYER, n_users_t=user_num_t, n_items_t=item_num_t, n_users_s=user_num_s, n_items_s=item_num_s, emb_dim=EMB_DIM, lr_rec=LR_REC, lr_dom_pos=LR_DOM_pos, lr_dom_neg=LR_DOM_neg, lamda=LAMDA, lr_rec_s=LR_REC_s, lamda_s=LAMDA_s, optimization=OPTIMIZATION, pretrain_t=pretrain_t, pretrain_s=pretrain_s, review_embeddings_t=review_t, review_embeddings_s=review_s, if_pretrain=IF_PRETRAIN) config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) ## split the training samples into batches batches = list(range(0, len(train_data_interaction_t), BATCH_SIZE)) batches.append(len(train_data_interaction_t)) ## training iteratively F1_max_t = 0 F1_max_s = 0 pre_i_max = 0 pre_u_max = 0 F1_df = pd.DataFrame(columns=TOP_K) NDCG_df = pd.DataFrame(columns=TOP_K) loss_rec_s = 0 loss_rec_t = 0 for epoch in range(N_EPOCH): rd.shuffle(train_data_interaction_t) rd.shuffle(train_data_interaction_s) for batch_num in range(len(batches) - 1): train_batch_data_t = [] train_batch_data_s = [] for sample_t in range(batches[batch_num], batches[batch_num + 1]): (user_t, pos_item_t) = train_data_interaction_t[sample_t] sample_s = random.randint(0, len(train_data_interaction_s) - 1) (user_s, pos_item_s) = train_data_interaction_s[sample_s] train_batch_data_t.append([user_t, pos_item_t, 1, 1]) train_batch_data_s.append([user_s, pos_item_s, 1, 0]) # add negatives to the target domain sample_num = 0 while sample_num < SAMPLE_RATE[0]: neg_item_t = int(random.uniform(0, item_num_t)) if not (neg_item_t in train_data_t[user_t]): sample_num += 1 train_batch_data_t.append([user_t, neg_item_t, 0, 1]) # add negatives to the source domain sample_num = 0 while sample_num < SAMPLE_RATE[1]: neg_item_s = int(random.uniform(0, item_num_s)) if not (neg_item_s in train_data_s[user_s]): sample_num += 1 train_batch_data_s.append([user_s, neg_item_s, 0, 0]) train_batch_data_t = np.array(train_batch_data_t) train_batch_data_s = np.array(train_batch_data_s) try: [ update1_t, update1_s, update2_u, update2_i, update3_u, update3_i, loss_rec_t, loss_rec_s, loss_domain_u, loss_domain_i ] = sess.run( [ model.update1_t, model.update1_s, model.update2_u, model.update2_i, model.update3_u, model.update3_i, model.loss_rec_t, model.loss_rec_s, model.loss_domain_u, model.loss_domain_i ], feed_dict={ model.users_t: 
train_batch_data_t[:, 0], model.items_t: train_batch_data_t[:, 1], model.rec_label_t: train_batch_data_t[:, 2], model.domain_label_t: train_batch_data_t[:, 3], model.users_s: train_batch_data_s[:, 0], model.items_s: train_batch_data_s[:, 1], model.rec_label_s: train_batch_data_s[:, 2], model.domain_label_s: train_batch_data_s[:, 3] }) except: update, loss_rec = sess.run( [model.updates, model.loss], feed_dict={ model.users: train_batch_data_t[:, 0], model.items: train_batch_data_t[:, 1], model.labels: train_batch_data_t[:, 2] }) F1_t, NDCG_t = test_model(sess, model, 't') F1_max_t = max(F1_max_t, F1_t[0]) F1_s, NDCG_s = test_model(sess, model, 's') F1_max_s = max(F1_max_s, F1_s[0]) pre_u, pre_i = test_domain(sess, model) ## print performance print_value([ epoch + 1, loss_rec_t, loss_rec_s, loss_domain_u, loss_domain_i, F1_max_t, F1_max_s, pre_u, pre_i, F1_t, NDCG_t, F1_s, NDCG_s ]) ## save performance F1_df.loc[epoch + 1] = F1_t NDCG_df.loc[epoch + 1] = NDCG_t save_value([[F1_df, 'F1'], [NDCG_df, 'NDCG']], path_excel, first_sheet=False) if not (loss_rec_s + loss_rec_t) < 10**10: break
minEntries.append(entry) elif d == minDist: minEntries.append(entry) total = 0 print "nearest neighbors : ", minEntries for entry in minEntries: total += mood_rating(entry.get_mood()) return mood(ceil(total / len(minEntries))) def dist(list1, list2): dist = 0 total = 0 same = 0 for item in list1: if item not in list2: dist += 1 else: same += 1 total += 1 dist += (len(list2) - same) total += len(list2) return dist/float(total) entries = read_data(sys.argv[1]) inputs = read_data(sys.argv[2]) for item in inputs: print item print nearest_neighbor(entries, item)
prev_cost = _cost nth += 1 return weights def train(X_train, Y_train): print('Start training model...') W = gradient_descent_SGD(X_train, Y_train, learning_rate, max_epochs, cost_threshold) print('Train finished!') return W if __name__ == "__main__": # read data X, Y = read_data('data.csv') # split data for training and testing 80:20 # first insert 1 in every row for intercept b X.insert(loc=len(X.columns), column='intercept', value=1) print("splitting dataset into train and test sets...") X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) # regularization parameter C = 10000 learning_rate = 0.000001 max_epochs = 5000
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, dataset='mnist.pkl.gz', batch_size=20, n_hidden=500): """ Demonstrate stochastic gradient descent optimization for a multilayer perceptron This is demonstrated on MNIST. :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient :type L1_reg: float :param L1_reg: L1-norm's weight when added to the cost (see regularization) :type L2_reg: float :param L2_reg: L2-norm's weight when added to the cost (see regularization) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: the path of the MNIST dataset file from http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz """ datasets, user_ids = load_data("test_feature1") train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size + 1 params, nerual_num = read_param(sys.argv[1]) ###################### # BUILD ACTUAL MODEL # ###################### print('... building the model') # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels rng = numpy.random.RandomState(1234) # construct the MLP class classifier = MLP( rng=rng, input=x, n_in=nerual_num[0], n_hidden=nerual_num[1:-1], n_out=nerual_num[-1], params = params ) # start-snippet-4 # the cost we minimize during training is the negative log likelihood of # the model plus the regularization terms (L1 and L2); cost is expressed # here symbolically cost = ( classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr ) # end-snippet-4 # compiling a Theano function that computes the mistakes that are made # by the model on a minibatch test_model = theano.function( inputs=[index], outputs=classifier.Y_pred(), givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }, on_unused_input='ignore' ) ############### # TEST MODEL # ############### print('... testing') # early-stopping parameters patience = 100000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience // 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = timeit.default_timer() epoch = 0 done_looping = False # test it on the test set y_pred = [] for i in range(n_test_batches): y_pred += list(test_model(i)) #y_pred = [test_model(i) for i in range(n_test_batches)] print(len(y_pred)) ##test wb_Embedfile = "test_feature"#"../jd_wfeature" jd_wb_linkfile = "network"#"../../net_work10w" test_user = user_ids # get_user(wb_Embedfile, 0) user_pro = read_data("../../user_product_list10w", test_user) print(len(user_pro)) test_nega = readpro_nega("test_nega.txt") cate_users = get_cate_users(jd_wb_linkfile) test_itemPre(test_user, test_nega, cate_users, user_pro, y_pred)
We embedded the POS information to improve the model and provide some penalty options ''' from read_data import * import collections as cc from HMM import * import numpy as np import time from add_pos import * import matplotlib.pyplot as plt train_arg_sent,train_tri_sent,test_arg_sent,\ test_tri_sent,train_trigger,train_argument,set_arg,set_tri, \ argument_tuple, trigger_tuple,vocab_arg,vocab_tri=read_data() param_arg = { 'type': 'arg', 'conf_prob': cc.Counter(argument_tuple), 'arg': np.array(list(set_arg)), 'vocab': np.array(list(vocab_arg)), 'Naive_bayes': 0, 'k': 10**5, 'O_punish': 0.5, 'diag_punish': 0.5, 'conf_mat': np.zeros([(len(vocab_arg)), (len(set_arg))]), 'k_conf_mat': 10**-5, 'lambda': 5, 'lambda2': 2 }
def train_model(para, path_excel, if_save_emb): [ _, MODEL, LR, LAMDA, EMB_DIM, BATCH_SIZE, SAMPLE_RATE, N_EPOCH, _, _, ] = para ## paths of data train_path = DIR + 'train_data.json' save_text_embeddings_path = DIR + 'review_embeddings.json' save_latant_embeddings_path = DIR + 'latent_embeddings.json' text_embeddings_path = DIR + 'text.json' user_review_path = DIR + 'user_text.json' item_review_path = DIR + 'item_text.json' ## load train data [train_data, train_data_interaction, user_num, item_num] = read_data(train_path, BATCH_SIZE) if MODEL == 'TMN': text_matrix = load_features(text_embeddings_path) text_matrix = text_matrix.astype(np.float32) user_review = load_data(user_review_path) item_review = load_data(item_review_path) user_word_num = 0 for review in user_review: user_word_num = max(len(review), user_word_num) item_word_num = 0 for review in item_review: item_word_num = max(len(review), item_word_num) user_word_num = min(user_word_num, 200) item_word_num = min(item_word_num, 200) user_review_feature = np.ones((user_num, user_word_num)) item_review_feature = np.ones((item_num, item_word_num)) for user in range(user_num): user_review_feature[user] = assignment(user_review_feature[user], user_review[user]) for item in range(item_num): item_review_feature[item] = assignment(item_review_feature[item], item_review[item]) ## define the model if MODEL == 'MF': model = model_MF(n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA) if MODEL == 'TMN': model = model_TMN(n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, text_embeddings=text_matrix, user_word_num=user_word_num, item_word_num=item_word_num) if MODEL == 'TMF': review_embeddings = read_features(save_text_embeddings_path) model = model_TMF(n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, review_embeddings=review_embeddings) config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) ## split the training samples into batches batches = list(range(0, len(train_data_interaction), BATCH_SIZE)) batches.append(len(train_data_interaction)) ## training iteratively F1_max = -10 F1_df = pd.DataFrame(columns=TOP_K) NDCG_df = pd.DataFrame(columns=TOP_K) for epoch in range(N_EPOCH): for batch_num in range(len(batches) - 1): train_batch_data = [] if MODEL == 'TMN': user_review_batch = np.ones( ((1 + SAMPLE_RATE) * (batches[batch_num + 1] - batches[batch_num]), user_word_num)) item_review_batch = np.ones( ((1 + SAMPLE_RATE) * (batches[batch_num + 1] - batches[batch_num]), item_word_num)) num = 0 for sample in range(batches[batch_num], batches[batch_num + 1]): user, pos_item = train_data_interaction[sample] sample_num = 0 train_batch_data.append([user, pos_item, 1]) if MODEL == 'TMN': user_review_batch[num] = user_review_feature[user] item_review_batch[num] = item_review_feature[pos_item] num += 1 while sample_num < SAMPLE_RATE: neg_item = int(random.uniform(0, item_num)) if not (neg_item in train_data[user]): sample_num += 1 train_batch_data.append([user, neg_item, 0]) if MODEL == 'TMN': user_review_batch[num] = user_review_feature[user] item_review_batch[num] = item_review_feature[ neg_item] num += 1 train_batch_data = np.array(train_batch_data) try: _, loss = sess.run( [model.updates, model.loss], feed_dict={ model.users: train_batch_data[:, 0], model.items: train_batch_data[:, 1], model.label: train_batch_data[:, 2], model.user_word: user_review_batch, model.item_word: item_review_batch }) except: 
_, loss = sess.run( [model.updates, model.loss], feed_dict={ model.users: train_batch_data[:, 0], model.items: train_batch_data[:, 1], model.label: train_batch_data[:, 2] }) if MODEL == 'TMN': F1, NDCG = test_model(sess, model, user_review_feature, item_review_feature) else: F1, NDCG = test_model(sess, model, 0, 0) if F1_max < F1[0]: F1_max = F1[0] if if_save_emb == 1: try: user_text_embedding = np.zeros( (user_num, np.shape(text_matrix)[1])) item_text_embedding = np.zeros( (item_num, np.shape(text_matrix)[1])) user_batch_list = list(range(0, user_num, 500)) user_batch_list.append(user_num) item_batch_list = list(range(0, item_num, 500)) item_batch_list.append(item_num) for u in range(len(user_batch_list) - 1): u1, u2 = user_batch_list[u], user_batch_list[u + 1] user_batch = np.array(range(u1, u2)) user_review_batch = user_review_feature[u1:u2] u_text_embedding = sess.run( [model.u_text_embeddings], feed_dict={ model.users: user_batch, model.user_word: user_review_batch }) user_text_embedding[u1:u2] = u_text_embedding[0] for i in range(len(item_batch_list) - 1): i1, i2 = item_batch_list[i], item_batch_list[i + 1] item_batch = np.array(range(i1, i2)) item_review_batch = item_review_feature[i1:i2] i_text_embedding = sess.run( [model.i_text_embeddings], feed_dict={ model.items: item_batch, model.item_word: item_review_batch }) item_text_embedding[i1:i2] = i_text_embedding[0] except: user_embedding, item_embedding = sess.run( [model.user_embeddings, model.item_embeddings]) ## print performance print_value([epoch + 1, loss, F1_max, F1, NDCG]) F1_df.loc[epoch + 1] = F1 NDCG_df.loc[epoch + 1] = NDCG save_value([[F1_df, 'F1'], [NDCG_df, 'NDCG']], path_excel, first_sheet=False) if not loss < 10**10: break if if_save_emb == 1: if MODEL == 'TMN': save_embeddings( [user_text_embedding.tolist(), item_text_embedding.tolist()], save_text_embeddings_path) if MODEL == 'TMF': save_embeddings([user_embedding.tolist(), item_embedding.tolist()], save_latant_embeddings_path) try: del u_text_embedding, i_text_embedding, user_text_embedding, item_text_embedding except: del user_embedding, item_embedding del model, loss, _, sess gc.collect()
# and its values are dicts of stems. We don't need both this and Signatures! #print config_lxa["affix_type"], 51 Lexicon = CLexicon() Lexicon.infolder = config_lxa["complete_infilename"] Lexicon.outfolder = config_lxa["outfolder"] Lexicon.graphicsfolder = config_lxa["graphicsfolder"] if config_lxa["affix_type"] == "prefix": Lexicon.FindSuffixesFlag = False else: Lexicon.FindSuffixesFlag = True # --------------------------------------------------------------------## # read wordlist (dx1) # --------------------------------------------------------------------## infile = open(config_lxa["complete_infilename"]) read_data(config_lxa["datatype"], infile, Lexicon, config_lxa["BreakAtHyphensFlag"], config_lxa["word_count_limit"]) print("\n1. Finished reading word list. Word count:", len( Lexicon.Word_counts_dict), "\n") # --------------------------------------------------------------------## # Initialize some output files # --------------------------------------------------------------------## if not os.path.isdir(config_lxa["outfolder"]): try: os.mkdir(config_lxa["outfolder"]) except OSError: print("Creation of the directory %s failed." % config_lxa["outfolder"]) else: print("Successfully created the directory %s ." % config_lxa["outfolder"])
# Module 2: Interpolate C-Content & d13C Ratios on cm scale # Assuming: C(z) = a*np.exp(-b*z) + c *** from read_data import * import matplotlib.pyplot as plt import numpy as np import csv #read_data(Folder_File = 'Raw_Data/forest.csv') plot_title = "primary forest" depth, C_content, C_se, d13C, d13Cse = read_data(Folder_File = 'Raw_Data/forest.csv') from scipy.optimize import curve_fit def func(x, a, b, c): return a*np.exp(-b*x) + c def interpolation(): global plot_title x = depth yC = C_content popt, pcov = curve_fit(func, x, yC) print() print("interpolation module ran successfully") print("further calculations assume:") print()
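# Hedged sketch (not part of the original module): once curve_fit has returned
# popt for C(z) = a*exp(-b*z) + c, the fit can be evaluated on a 1-cm depth grid
# to interpolate C-content between the measured depths; z_max is an assumed
# maximum depth in cm.
def interpolate_on_cm_grid(popt, z_max):
    z_cm = np.arange(0, z_max + 1, 1)  # 1-cm spaced depth grid
    return z_cm, func(z_cm, *popt)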
from read_data import * from preprocessing import * from run_gams_model import * import numpy as np import pandas as pd data_raw,features,features_all=read_data() features_selected=features_all.drop(features_all.index[[37,50,51,72,73]]) # run for CV complication proc_data=gen_proc_data(data_raw,features_selected,data_raw['cv_comp_new']) models_cvcomp=get_all_gams_model(proc_data,features_selected) best_model_cvcomp=get_best_model(models_cvcomp) r_df_cvcomp = com.convert_to_r_dataframe(proc_data) pred_score_cvcomp= statsf.predict(best_model_cvcomp,r_df_cvcomp, type="response") pred_obser_cvcomp=proc_data['outcome']; best_model_metric=calculate_metric(pred_obser_cvcomp,pred_score_cvcomp) cutoff1_cvcomp=best_model_metric['thres'].iloc[best_model_metric['yod_index'].idxmax()-1] cutoff2_cvcomp=cal_cutoff2(best_model_metric) predicted_values_cvcomp =pd.DataFrame({'prediction':pred_score_cvcomp}, index=range(0,len(pred_score_cvcomp))) predicted_values_cvcomp.ix[predicted_values_cvcomp.prediction <= cutoff1_cvcomp, 'category'] = 'low' predicted_values_cvcomp.ix[predicted_values_cvcomp.prediction >cutoff2_cvcomp, 'category'] = 'high' predicted_values_cvcomp.ix[(predicted_values_cvcomp.prediction <=cutoff2_cvcomp) & (predicted_values_cvcomp.prediction>cutoff1_cvcomp), 'category'] = 'moderate' # run for MV complication proc_data=gen_proc_data(data_raw,features_selected,data_raw['MV_comp']) models_mvcomp=get_all_gams_model(proc_data,features_selected) best_model_mvcomp=get_best_model(models_mvcomp) r_df_mvcomp = com.convert_to_r_dataframe(proc_data)
def train(): global_step = tf.Variable(0, name='global_step', trainable=False) train_dir = './logs' # z is the random noise, y is the conditioning label y = tf.placeholder(tf.float32, [BATCH_SIZE, 10], name='y') images = tf.placeholder(tf.float32, [64,28,28,1], name='real_images') z = tf.placeholder(tf.float32, [None, 100], name='z') # the generator produces the image G with tf.variable_scope("for_reuse_scope"): G = generator(z, y) # feed the real images into the discriminator D, D_logits = discriminator(images, y) samples = sampler(z,y) D_, D_logits_ = discriminator(G, y, reuse=True) d_loss_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_logits, labels=tf.ones_like(D))) d_loss_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_logits_, labels=tf.zeros_like(D_))) d_loss = d_loss_real + d_loss_fake g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_logits_, labels=tf.ones_like(D_))) z_sum = tf.summary.histogram("z",z) d_sum = tf.summary.histogram("d",D) d__sum = tf.summary.histogram("d_", D_) G_sum = tf.summary.image("G",G) d_loss_real_sum = tf.summary.scalar("d_loss_real", d_loss_real) d_loss_fake_sum = tf.summary.scalar("d_loss_fake", d_loss_fake) d_loss_sum = tf.summary.scalar("d_loss", d_loss) g_loss_sum = tf.summary.scalar("g_loss", g_loss) # merge the respective summaries g_sum = tf.summary.merge([z_sum, d__sum, G_sum, d_loss_fake_sum, g_loss_sum]) d_sum = tf.summary.merge([z_sum, d_sum, d_loss_real_sum, d_loss_sum]) # variables to be updated for the generator and discriminator, passed via the var_list of tf.train.Optimizer t_vars = tf.trainable_variables() d_vars = [var for var in t_vars if 'd_' in var.name] g_vars = [var for var in t_vars if 'g_' in var.name] saver = tf.train.Saver() # use Adam as the optimizer d_optim = tf.train.AdamOptimizer(0.0002, beta1=0.5).minimize(d_loss, var_list=d_vars, global_step=global_step) g_optim = tf.train.AdamOptimizer(0.0002, beta1=0.5).minimize(g_loss, var_list=g_vars, global_step=global_step) os.environ['CUDA_VISIBLE_DEVICES'] = str(0) config = tf.ConfigProto() config.gpu_options.per_process_gpu_memory_fraction=0.2 sess = tf.InteractiveSession(config=config) init = tf.initialize_all_variables() writer = tf.summary.FileWriter(train_dir, sess.graph) data_x, data_y = read_data() sample_z = np.random.uniform(-1, 1,size=(BATCH_SIZE, 100)) sample_labels = data_y[0:64] sess.run(init) for epoch in range(25): batch_idxs = 1093 for idx in range(batch_idxs): batch_images = data_x[idx * 64 : (idx + 1) *64] batch_labels = data_y[idx * 64 : (idx + 1) * 64] batch_z = np.random.uniform(-1, 1, size=(BATCH_SIZE, 100)) # uniform sampling in (-1, 1) # update D's parameters _, summary_str = sess.run([d_optim, d_sum], feed_dict={images:batch_images, z:batch_z, y:batch_labels}) writer.add_summary(summary_str, idx+1) # update G's parameters _, summary_str = sess.run([g_optim, g_sum], feed_dict={z:batch_z, y:batch_labels}) writer.add_summary(summary_str, idx+1) # update G twice to keep training stable _, summary_str = sess.run([g_optim, g_sum], feed_dict={z:batch_z, y:batch_labels}) writer.add_summary(summary_str, idx+1) # compute the losses during training and print them errD_fake = d_loss_fake.eval({z:batch_z, y:batch_labels}) # evaluate the loss tensors with the current batch errD_real = d_loss_real.eval({images:batch_images, y:batch_labels}) errG = g_loss.eval({z:batch_z, y:batch_labels}) if idx %20 ==0: print("Epoch : [%2d] [%4d/%4d] d_loss: %.8f, g_loss:%.8f"\ % (epoch, idx, batch_idxs, errD_fake+errD_real, errG)) if idx %100 ==1: sample = sess.run(samples, feed_dict={z:sample_z, y:sample_labels}) samples_path = os.getcwd()+'/samples/' save_image(sample, [8,8], samples_path + 'test_%d_epoch_%d.png' %(epoch, idx)) print('save done') if idx % 500 ==2: checkpoint_path = os.path.join(train_dir, 'DCGAN_model.ckpt') 
saver.save(sess, checkpoint_path, global_step=idx+1) sess.close()
# 1- https://www.kaggle.com/charma69/titanic-data-science-solutions/edit import pandas as pd import numpy as np from sklearn import preprocessing from sklearn.linear_model import Perceptron from sklearn.model_selection import cross_val_score, cross_val_predict from sklearn.metrics import roc_auc_score, accuracy_score from matplotlib import pyplot as plt from read_data import * from sklearn import metrics # 1.import data dataset = 'WBCD.csv' Data = read_data(dataset) # 2.Fill with median Data = fillMed(Data) #3.data preprocessing # select features and Normalization x = Data.loc[:, [ 'ClumpTkns', 'UnofCSize', 'UnofCShape', 'MargAdh', 'SngEpiCSize', 'BareNuc', 'BlandCrmtn', 'NrmlNuc', 'Mitoses' ]] y = Data['Malignant'] # TRANSFORM GIVES 1 % LESS ACCURATE RESULT!!!! min_max_scaler = preprocessing.MaxAbsScaler() x = min_max_scaler.fit_transform(x)
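# Hedged sketch (not shown in the original excerpt) of how the imports above
# could be used next: 10-fold cross-validation of a Perceptron on the scaled
# features; the fold count and random_state are illustrative choices.
clf = Perceptron(max_iter=1000, random_state=42)
cv_scores = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
print("Perceptron 10-fold CV accuracy: %.3f" % cv_scores.mean())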
def main(args): word_embeddings = p.load(open(args.word_embeddings, 'rb')) word_embeddings = np.array(word_embeddings) word2index = p.load(open(args.vocab, 'rb')) index2kwd, kwd2index, index2cnt = read_kwd_vocab(args.kwd_vocab) if hparams.BALANCE_KWD_CLASS: # adjust weight for different kwd class based on median frequency index2cnt = np.array(index2cnt) base_freq = np.median(index2cnt) kwd_weight = np.sqrt(base_freq / index2cnt) kwd_weight = torch.FloatTensor(kwd_weight) if hparams.USE_CUDA: kwd_weight = kwd_weight.cuda() else: kwd_weight = None subset_count = args.subset_count if args.subset_count > 0 else None train_data = read_data(args.train_context, args.train_question, args.train_ids, args.max_post_len, args.max_ques_len, subset_count) test_data = read_data(args.tune_context, args.tune_question, args.tune_ids, args.max_post_len, args.max_ques_len, subset_count) if args.kwd_data_dir: # load pre-extracted kwd, save time in training print(f"load kwds from {args.kwd_data_dir}") train_kwds = read_kwds(os.path.join(args.kwd_data_dir, "train.kwds"), kwd2index, subset_count) test_kwds = read_kwds(os.path.join(args.kwd_data_dir, "tune.kwds"), kwd2index, subset_count) assert len(train_kwds) == len(train_data), print( len(train_kwds), len(train_data)) assert len(test_kwds) == len(test_data) else: train_kwds, test_kwds = None, None print('No. of train_data %d' % len(train_data)) print('No. of test_data %d' % len(test_data)) print("Preprocessing train") q_train_data = preprocess_data(train_data, word2index, kwd2index, args.max_post_len, args.max_ques_len, args.kwd_data_dir, extract_kwd=False) q_train_data = [np.array(x) for x in q_train_data] print("Preprocessing val") q_test_data = preprocess_data(test_data, word2index, kwd2index, args.max_post_len, args.max_ques_len, args.kwd_data_dir, extract_kwd=False) q_test_data = [np.array(x) for x in q_test_data] if args.pretrain_ques: run_seq2seq(q_train_data, q_test_data, word2index, word_embeddings, hparams.MAX_QUES_LEN, kwd_weight, not args.freeze_kwd_model, train_kwds, test_kwds, kwd2index, args.kwd_model_dir, args.save_dir, args.load_models_dir) elif args.pretrain_kwd: run_kwd(q_train_data, q_test_data, index2kwd, word_embeddings, kwd_weight, train_kwds, test_kwds, kwd2index, args.save_dir) else: print('Please specify model to pretrain') return
>>> A = np.array([(2,4,1,3),(-1,-2,1,0),(0,0,2,2),(3,6,2,5)]) >>> independent_columns(A) np.array([[1, 4], [2, 5], [3, 6]]) """ Q, R = linalg.qr(A) independent = np.where(np.abs(R.diagonal()) > tol)[0] #print independent return independent #return A[:, independent] if __name__ == "__main__": data = read_data(_TRAINING_FILE_NAME_) #print data.describe() train_labels = data.label train_labels = train_labels.reshape(train_labels.size, 1) train_data = data.drop("label", 1) train_data = train_data.drop(BLACKLIST, axis=1) ft = open(_FIELDS_FILE_, "r") categ = [] # list of categorical variables for transform non_categ = [] categ = [] for line in ft.readlines(): splits = line.split() if splits[1] == "numeric": non_categ.append(splits[0])
from matplotlib import pyplot from read_data import * from Scratch import * from numpy import * scratchList = read_data( "C:\\Users\\Michał\\Desktop\\Aleksander\\Scratch\\cer biala\\wynik-ce-bi.txt" ) scratchList1 = read_data( "C:\\Users\\Michał\\Desktop\\Aleksander\\Scratch\\cer czarna\\wynik-ce-cz.txt" ) scratchList2 = read_data( "C:\\Users\\Michał\\Desktop\\Aleksander\\Scratch\\alu cylinder\\wynik-al-cy.txt" ) scratchList3 = read_data( "C:\\Users\\Michał\\Desktop\\Aleksander\\Scratch\\alu gwint\\wynik-al-gw.txt" ) # sampleList = [scratchList, scratchList1] # sampleList = [scratchList2, scratchList3] sampleList = [scratchList, scratchList1, scratchList2, scratchList3] labelList = [ 'white coating', 'black coating', '5056 aluminium', '7075 aluminium' ] colorList = ['red', 'black', 'blue', 'green'] avgMaxDepth = [] maxDepth = [] #for each sample for scratchList in sampleList: sum1 = 0
from __future__ import print_function import numpy as np import sys sys.path.append('../geom') from point import * from fitsemivariance import * from semivariance import * from covariance import * from read_data import * from prepare_interpolation_data import * from okriging import * Z = read_data('../data/necoldem250.dat') hh = 50 lags = np.arange(0, 3000, hh) test_results = [] N = len(Z) mask = [True for i in range(N)] numNeighbors = 10 for i in range(N): mask[i] = False x = Point(Z[i][0], Z[i][1]) P = [Z[j] for j in range(N) if mask[j] == True] P1 = prepare_interpolation_data(x, P, numNeighbors)[0] P1 = np.array(P1) gamma = semivar(P1, lags, hh) #*@\label{krig:cross:semivar} if len(gamma) == 0: continue semivariogram = fitsemivariogram(P1, gamma, spherical)
from matplotlib import pyplot from read_data import * from Scratch import * from numpy import * scratchList = read_data("C:\\Users\\ja\\Desktop\\Aleksander\\Scratch\\cer biala\\wynik-ce-bi.txt") scratchList1 = read_data("I:\OneDrive\\doktorat\\praca doktorska\\badania\\scratch\\1\\krzywe.txt") scratchList2 = read_data("I:\OneDrive\\doktorat\\praca doktorska\\badania\\scratch\\2\\krzywe.txt") scratchList3 = read_data("I:\OneDrive\\doktorat\\praca doktorska\\badania\\scratch\\3\\krzywe.txt") scratchList4 = read_data("I:\OneDrive\\doktorat\\praca doktorska\\badania\\scratch\\4\\krzywe.txt") sampleList = [scratchList,scratchList1,scratchList2,scratchList3,scratchList4] avgMaxDepth = [] maxDepth = [] #for each sample for scratchList in sampleList: sum1 = 0 maxList = [] #for each scratch for scratchObject in scratchList: #adding base line scratchObject.addBaseline() #moving average for depth for topography 2 (after scratch) scratchObject.topo2.depth = convolve(scratchObject.topo2.depth, ones(30,)/30, mode='full') #computing max dephts sum1 = sum1 + scratchObject.maxDepthOfTopo2() maxList.append(scratchObject.maxDepthOfTopo2())
from read_data import * import numpy as np import pandas as pd from tensorflow.keras.models import load_model from tensorflow.keras.layers import Dense, Input, Dropout from tensorflow.keras.models import Model from tensorflow.keras.optimizers import Adam model_name = "FeedForward" data = read_data('test') X_test = data.to_numpy() model = load_model('Models/' + model_name + '.hdf5') predictions = model.predict(X_test) predictions = np.around(predictions).flatten() predictions = predictions.astype(int) d = {'PassengerId': range(892, 1310), 'Survived': predictions} df = pd.DataFrame(data=d) df.to_csv('Results/' + model_name + '.csv', index=False)
def get_cleaned_data(path): df = read_data(path) df['review_cleaned'] = df['review'].apply(lambda x: clean_text(x)) return df
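# Hedged sketch of a clean_text helper (the real implementation used above is
# not shown): a minimal version that lowercases the review and strips
# punctuation, using only the standard-library re module.
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)   # drop punctuation/special chars
    return re.sub(r'\s+', ' ', text).strip()   # collapse repeated whitespace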