def train_modelA(target_dirs, non_target_dirs, tgauss=4, ngauss=11):
    target_coef = []
    target_frequency = []
    print("Target data:")
    for dir_name in target_dirs:
        for f in glob(dir_name + '/*.wav'):
            print('Processing file: ', f)
            fs, data = wavfile.read(f)
            mfcc, freq = ft.features(data, fs)
            target_coef.append(mfcc)
            target_frequency.append(freq)
    target_coef = np.vstack(target_coef)
    target_frequency = np.array(target_frequency)

    non_target_coef = []
    non_target_frequency = []
    print("Non target data:")
    for dir_name in non_target_dirs:
        for f in glob(dir_name + '/*.wav'):
            print('Processing file: ', f)
            fs, data = wavfile.read(f)
            mfcc, freq = ft.features(data, fs)
            non_target_coef.append(mfcc)
            non_target_frequency.append(freq)
    non_target_coef = np.vstack(non_target_coef)
    non_target_frequency = np.array(non_target_frequency)

    print("Training gaussian distribution for frequency")
    mu_freq1 = target_frequency.mean()
    mu_freq2 = non_target_frequency.mean()
    cov_freq1 = target_frequency.var()
    cov_freq2 = non_target_frequency.var()

    # Initialize mean vectors to randomly selected data points from the corresponding class.
    # Initialize all covariance matrices to the same covariance matrix computed using
    # all the data from the given class.
    print("Training GMM for mfcc")
    m1 = tgauss
    # pick m1 random frames (rows) as the initial component means
    mus1 = target_coef[np.random.randint(0, target_coef.shape[0], m1)]
    cov1 = np.cov(target_coef.T, bias=True)
    covs1 = [cov1] * m1
    ws1 = np.ones(m1) / m1

    m2 = ngauss
    mus2 = non_target_coef[np.random.randint(0, non_target_coef.shape[0], m2)]
    cov2 = np.cov(non_target_coef.T, bias=True)
    covs2 = [cov2] * m2
    ws2 = np.ones(m2) / m2

    for i in range(30):
        ws1, mus1, covs1, ttl1 = ft.train_gmm(target_coef, ws1, mus1, covs1)
        ws2, mus2, covs2, ttl2 = ft.train_gmm(non_target_coef, ws2, mus2, covs2)
        print("target error:", ttl1, "non target error: ", ttl2)

    return (mu_freq1, mu_freq2), (cov_freq1, cov_freq2), (mus1, mus2), (covs1, covs2), (ws1, ws2)
def if_c():
    if language == 'EN':
        print(c.BLUE + 'clear ' + c.RED + 'screen' + c.BLUE + ' .' + c.GREEN + '.' + c.RED + '.')
        sleep(2)
        clear()
        features()
    elif language == 'FA':
        # Persian for "clearing the screen ..."
        print(c.RED + 'tamiz' + c.BLUE + ' kardan' + c.RED + ' safhe ' + c.BLUE + ' .' + c.GREEN + '.' + c.RED + '.')
        sleep(2)
        clear()
        features()
def featpyramid(pic, model):
    pyra = {}
    padx = math.ceil(model["maxsize"][0][0][0][1])
    pady = math.ceil(model["maxsize"][0][0][0][0])
    sbin = model["sbin"][0][0][0][0]
    interval = model["interval"][0][0][0][0]
    sc = 2.0 ** (1.0 / interval)
    imsize = [pic.shape[1], pic.shape[2]]
    max_scale = int(1 + np.floor(math.log(min(imsize) / (5.0 * sbin)) / math.log(sc)))
    pyra["feat"] = list(range(int(max_scale + interval)))
    pyra["scales"] = np.zeros((max_scale + interval, 1))
    pyra["imsize"] = imsize
    time = 0
    for i in range(interval):
        starttime = datetime.datetime.now()
        scaled = resize.resize(pic, 1.0 / sc ** i)
        endtime = datetime.datetime.now()
        tmp = features.features(scaled, sbin / 2.0)
        time += (endtime - starttime).seconds
        size = [tmp.shape[0], tmp.shape[1] + 2 * pady + 2, tmp.shape[2] + 2 * padx + 2]
        pyra["feat"][i] = np.zeros(size)
        pyra["feat"][i][:, pady + 1:size[1] - pady - 1, padx + 1:size[2] - padx - 1] = tmp
        pyra["scales"][i] = 2.0 / sc ** (i)

        #starttime = datetime.datetime.now()
        tmp = features.features(scaled, sbin)
        #endtime = datetime.datetime.now()
        #time += (endtime - starttime).seconds
        size = [tmp.shape[0], tmp.shape[1] + 2 * pady + 2, tmp.shape[2] + 2 * padx + 2]
        pyra["feat"][i + interval] = np.zeros(size)
        pyra["feat"][i + interval][:, pady + 1:size[1] - pady - 1, padx + 1:size[2] - padx - 1] = tmp
        pyra["scales"][i + interval] = 1.0 / sc ** (i - 1)

        for j in range(i + interval, max_scale, interval):
            starttime = datetime.datetime.now()
            scaled = resize.resize(scaled, 0.5)
            endtime = datetime.datetime.now()
            tmp = features.features(scaled, sbin)
            time += (endtime - starttime).seconds
            size = [tmp.shape[0], tmp.shape[1] + 2 * pady + 2, tmp.shape[2] + 2 * padx + 2]
            pyra["feat"][j + interval] = np.zeros(size)
            pyra["feat"][j + interval][:, pady + 1:size[1] - pady - 1, padx + 1:size[2] - padx - 1] = tmp
            pyra["scales"][j + interval] = 0.5 / sc ** (i - 1)

    for i in range(len(pyra["feat"])):
        pyra["feat"][i][31, 0:pady + 1, :] = 1
        end = pyra["feat"][i].shape
        pyra["feat"][i][31, end[1] - padx - 1:end[1], :] = 1
        pyra["feat"][i][31, :, 0:padx + 1] = 1
        pyra["feat"][i][31, :, end[2] - pady - 1:end[2]] = 1

    print time
    pyra["padx"] = padx
    pyra["pady"] = pady
    return pyra
def dx(dx_features_fname, dx_features_split_fname, split_fname, feature_diseases, db, training_data_fname, time_scale_days, verbose=True):
    feature_loincs = []
    feature_drugs = []

    training_data = pd.read_csv(training_data_fname, sep='\t', dtype=str)

    # We want to relate the presence or absence of diagnoses in the outcome window
    # to the presence or absence of the label, which is calculated based on codes
    # in the outcome window.
    training_data = training_data[['person', 'y', 'outcome_start_date', 'outcome_end_date', 'age', 'gender']]
    training_data.columns = ['person', 'y', 'training_start_date', 'training_end_date', 'age', 'gender']

    features.features(db, training_data, feature_loincs, feature_diseases, feature_drugs, time_scale_days,
                      dx_features_fname, calc_gfr=False, verbose=verbose, add_age_sex=False)
    features.split(dx_features_fname, dx_features_split_fname, split_fname, verbose)
def processdata(urllists, word_count_threshold, depth):
    content = []
    nums = []
    nums.append(0)
    for url in urllists:
        crawler = webCrawler(url, depth)
        crawler.crawl()
        nums.append(len(crawler.data))
        content.extend(crawler.data)
    instance = features(word_count_threshold)
    word_counts, wordtoix = instance.extractwords(content)
    N = len(word_counts)
    # turn per-class document counts into cumulative offsets
    for i in range(1, len(nums)):
        nums[i] = nums[i-1] + nums[i]
    cid = 0
    output = np.zeros((nums[len(nums)-1], N+1))
    for url in urllists:
        crawler = webCrawler(url, depth)
        crawler.crawl()
        currlen = len(crawler.data)
        feats = instance.bagofwords(crawler.data, word_counts, wordtoix)
        print feats.shape
        b = np.zeros((currlen, N+1))
        print b[:, :-1].shape
        b[:, 0:N] = feats
        b[:, N] = cid + 1
        output[nums[cid]:nums[cid+1], :] = b
        cid = cid + 1
    np.savetxt('test.out', output, delimiter=',')  # X is an array
def eval(rules, hand):
    score = 0
    for feat in features.features(hand):
        score += rules[feat]
    return score
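A minimal usage sketch for the scorer above; the rule weights and the hand value here are placeholders, not taken from the real features module:

# Hypothetical example: assumes features.features(hand) yields string labels
# that appear as keys in `rules`.
rules = {"pair": 2, "suited": 1}
hand = ["Ah", "Ad"]        # placeholder hand encoding
print(eval(rules, hand))   # sum of the weights of the features found in the hand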
def featureExtraction():
    '''
    Extract features and save
    :param recordings:
    :param varin:
    :return:
    '''
    recordings = getRecordings(wav_path)

    for recording in recordings:
        wav_file = os.path.join(wav_path, recording + '.wav')
        energy_filename = os.path.join(feature_path, 'energy' + '_' + recording + '_'
                                       + str(varin['framesize']) + '_' + str(varin['hopsize']) + '.npy')
        spec_filename = os.path.join(feature_path, 'spec' + '_' + recording + '_'
                                     + str(varin['framesize']) + '_' + str(varin['hopsize']) + '.npy')
        for featurename in feature_set:
            print 'saving feature for ', recording, ', feature ', featurename
            feature_filename = os.path.join(feature_path, featurename + '_' + recording + '_'
                                            + str(varin['framesize']) + '_' + str(varin['hopsize']) + '.npy')
            varin['feature_select'] = featurename
            feature, energy, spec = features.features(wav_file, varin)
            np.save(feature_filename, feature)
            if featurename == feature_set[-1]:
                np.save(energy_filename, energy)
                np.save(spec_filename, spec)
def read_dataset():
    df = pd.read_csv("yahoostock.csv")
    X, y = features.features("yahoostock.csv")
    df = df.iloc[::-1]
    date = df[df.columns[0]]
    close = df[df.columns[5]].values
    close = close[1:]
    y = close
    date = date.transpose()
    date = date[27:5010]
    X = X[27:5010]
    print(X.shape)
    X = pd.DataFrame(X)
    # normalize data
    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(X)
    X = pd.DataFrame(np_scaled)
    X = X.values
    y = y[27:5010]
    print(y.shape)
    return (X, y, date)
def __init__(self, path):
    obj = features()
    STEPS = 47
    self.data = []  # (total, 2, 47, 60)
    self.batch_id = 0
    self.data_label = []
    self.data_length = []
    file = open(path, 'r')
    num = 0
    for line in file.readlines():
        li = line.strip().split('\t')
        if not len(li) > 0:
            continue
        if li[0] == '1':
            self.data_label.append(1)
        else:
            self.data_label.append(0)
        _t, _seq_len = obj.word2vec_list(li[1], li[2])
        t1 = _t[0]
        t2 = _t[1]
        while len(t1) < STEPS:
            t1.append(np.zeros((60), dtype=np.float32))
        while len(t2) < STEPS:
            t2.append(np.zeros((60), dtype=np.float32))
        self.data.append([t1, t2])
        self.data_length.append(_seq_len)
        num += 1
def get_featureset(self, promo=None):
    """ Grabs an appropriate featureset. """
    # Short-circuit if a featureset has already been established
    if hasattr(self, "fs"):
        return getattr(self, "fs")
    allfeatures = features()

    # Parse out the first path segment and use it if it's a legit promo
    m = re.compile("^/?([^/\\?]*)").findall(self.request.path)
    path = m[0] or "default" if m and len(m) else None
    promo = path if path in allfeatures else None

    # If the path doesn't indicate a promotion, check params and cookies
    promo = promo or self.request.params.get("promo") or self.request.cookies.get("promo")

    # Retrieve featureset
    key = self.request.params.get("key") or self.request.cookies.get("key")
    fs = featureset.featureset(key, promo)

    # Set HTTP cookies to remember the key and promotion
    self.set_cookie("key", fs.key().id_or_name())
    if hasattr(fs, "promo"):
        self.set_cookie("promo", fs.promo)

    # Set local reference to featureset and return
    setattr(self, "fs", fs)
    return fs
def test_modelA(test_dirs, muf, covf, mug, covg, ws, fs=16000):
    freq_posterior = []
    mfcc_posterior = []
    result = {}
    for dir_name in test_dirs:
        for file in glob(dir_name + '/*.wav'):
            print('Processing file: ', file)
            fs, f = wavfile.read(file)
            mfcc, freq = ft.features(f, fs)
            freq_posterior = scipy.stats.norm.logpdf(freq, muf[0], covf[0]) + np.log(0.5) - (
                scipy.stats.norm.logpdf(freq, muf[1], covf[1]) - np.log(0.5))
            mfcc = np.vstack(mfcc)
            tmp = []
            for coef in mfcc:
                tmp.append(
                    ft.logpdf_gmm(coef, ws[0], mug[0], covg[0]) + np.log(0.5)
                    - ft.logpdf_gmm(coef, ws[1], mug[1], covg[1]) - np.log(0.5))
            hard = np.mean(tmp) + freq_posterior
            soft = hard > 8.25
            result[file] = (hard, soft)
            print(file, hard, soft)
    return result
def train(mymodel, myoptimizer, output_dir, epoch, train_dataloader, eval_dataloader,
          UsingGPU=True, min_f1score=0.8, maxtokeep=3, CVAfterEpoch=2, classnum=3):
    featuremodel = features.features()
    if UsingGPU:
        mymodel = mymodel.cuda()
        featuremodel = featuremodel.cuda()
    num_train_steps = int(epoch * len(train_dataloader.dataset) / train_dataloader.batch_size)
    logger.info("***** Do train *****")
    logger.info("  Num examples = %d", len(train_dataloader.dataset))
    logger.info("  Batch size = %d", train_dataloader.batch_size)
    logger.info("  Num steps = %d", num_train_steps)
    global_step = 0  # total number of optimization steps taken so far
    maxf1score = min_f1score
    for i in range(1, epoch + 1):
        logger.info("********epoch:{}********".format(i))
        for p in myoptimizer.param_groups:
            p['lr'] = p['lr'] * 0.8
        for batch in train_dataloader:
            global_step += 1
            _features, labels = batch
            if UsingGPU:
                _features = _features.cuda()
                labels = labels.cuda()
            logist, loss = mymodel(featuremodel(_features), labels)
            loss.backward()
            # fgm.attack()         # add adversarial perturbation to the embeddings
            # logist, loss_adv = mymodel(input_ids, segment_ids, input_mask, label_ids)
            # loss_adv.backward()  # backprop, accumulating the adversarial gradient on top of the normal one
            # fgm.restore()        # restore the embedding parameters
            myoptimizer.step()
            myoptimizer.zero_grad()
            if global_step % 100 == 0:
                logger.info("step:{}, loss:{:.5f}".format(global_step, loss.data))
            if global_step % 500 == 0 and i >= CVAfterEpoch:
                mymodel.eval()
                precision, recall, f1 = eval(mymodel, eval_dataloader, classnum, UsingGPU)
                mymodel.train()
                logger.info("step:{}, precision:{:.5f}, recall:{:.5f}, f1:{:.5f}".format(
                    global_step, precision, recall, f1))
                if f1 > maxf1score:
                    maxf1score = f1
                    model.save(mymodel, global_step, output_dir, MaxModelCount=maxtokeep)
def train(data, targets, filenames):
    targets = [val == "INFEC" for val in targets]  # Set INFEC as positive val

    # Choose training mode
    options = ["Cross validation", "Build and test model"]
    res = ui.prompt(options=options)
    mode = options[int(res)]

    # Choose ML algorithm
    options = ["Support Vector Machine", "Random Forest", "Decision Tree Classifier", "KNN"]
    res = ui.prompt("Choose a ML algorithm:", options)
    switch = {
        0: svm.SVC(C=100., random_state=0),
        1: RandomForestClassifier(n_estimators=50, max_depth=None, random_state=0),
        2: DecisionTreeClassifier(random_state=0),
        3: KNeighborsClassifier()
    }
    clf = switch.get(int(res))

    if mode == "Cross validation":
        model_evaluation(data, targets, clf)
    elif mode == "Build and test model":
        # Train model
        clf.fit(data, targets)

        # Get test dir
        while True:
            dirname = ui.prompt("Which directory are the test files in?")
            if os.path.isdir(dirname):
                break
            print("ERROR: Directory not found.")

        # Set up data/targets for test model
        print("\n************************************")
        print("*  PREPARING MODEL FOR EVALUATION  *")
        print("************************************")
        pageNames, y_true, filenames = pproc.process(dirname)
        y_true = [val == "INFEC" for val in y_true]  # Set INFEC as positive val
        test_data = ft.features(pageNames)

        y_pred = clf.predict(test_data)
        save_filenames(y_true, y_pred, filenames)

        conf_matrix = skm.confusion_matrix(y_true, y_pred)
        accuracy = skm.accuracy_score(y_true, y_pred)
        precision = skm.precision_score(y_true, y_pred, average=None)
        recall = skm.recall_score(y_true, y_pred, average=None)
        f1 = skm.f1_score(y_true, y_pred, average=None)

        print("\n{}".format(conf_matrix))
        print("Accuracy:  {}".format(accuracy))
        print("Precision: {}".format(precision[1]))
        print("Recall:    {}".format(recall[1]))
        print("F1:        {}".format(f1[1]))
def p_feature(f, trainingsteksten):
    """
    Calculates the probability of a feature, based on a list of training texts
    Args: String (feature), List of Tuples (String, String) (training texts (text, category))
    Returns: Float
    """
    voorkomens = 0  # number of texts in which the feature occurs
    for (tekst, cat) in trainingsteksten:
        if features(f, tekst):
            voorkomens += 1
    return float(voorkomens) / len(trainingsteksten)
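A small hedged example of calling p_feature; it assumes features(f, tekst) returns a truthy value when the feature fires on the text, as the loop above does:

# Hypothetical training texts; the real features() predicate is assumed importable.
trainingsteksten = [("the cat sat on the mat", "animals"),
                    ("the market closed lower", "finance")]
print(p_feature("cat", trainingsteksten))  # 0.5 if the feature fires on exactly one of the two texts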
def scores(samples1, freq1, samples2, freq2, samples3, freq3, q1=None, q2=None, return_score=False):
    sentence = features(samples1, freq1).transpose()
    query1 = features(samples2, freq2).transpose()
    query2 = features(samples3, freq3).transpose()

    score_list1 = []
    for pp in range(0, sentence.shape[0] - query1.shape[0], 5):
        score = 0
        for n in range(query1.shape[0]):
            score += pearsonr(query1[n], sentence[pp + n])[0]
        score_list1.append(score / query1.shape[0])

    score_list2 = []
    for pp in range(0, sentence.shape[0] - query2.shape[0], 5):
        score = 0
        for n in range(query2.shape[0]):
            score += pearsonr(query2[n], sentence[pp + n])[0]
        score_list2.append(score / query2.shape[0])

    #print(len(score_list1), '\t', len(score_list2))
    t1 = np.arange(len(score_list1)) / 100 * 5
    t2 = np.arange(len(score_list2)) / 100 * 5
    fig = plt.figure(figsize=(8, 2))
    plt.plot(t1, score_list1, t2, score_list2)
    if q1 is not None and q2 is not None:
        plt.legend([q1, q2])
    else:
        plt.legend(['query1', 'query2'])
    plt.gca().set_xlabel('t')
    plt.gca().set_ylabel('scores')
    plt.gca().set_xlim(left=0)
    plt.gca().set_ylim(bottom=0)
    plt.tight_layout()
    if __name__ == '__main__':
        plt.savefig(path1.stem + '_score.pdf')
    elif return_score == True:
        return score_list1, score_list2
    else:
        return fig
def run_secretory(filename):
    conn = sqlite3.connect('database.db')
    c = conn.cursor()
    parameter_file = open(filename + "_parameters.txt", 'w')
    seqID_list = []
    result_file = open(filename + "_result.txt", 'w')
    result_file.write("Sequence_ID\tPrediction\n")
    records = SeqIO.parse(filename, "fasta")
    for record in records:
        i = 0
        hash_sequence = hashlib.md5(str(record.seq)).hexdigest()
        c.execute("SELECT * FROM secretory WHERE sequence='" + hash_sequence + "'")
        data = c.fetchone()
        if data is None:
            parameter_file.write(features(record.id, str(record.seq)) + "\n")
            seqID_list.append(record.id)
        else:
            c.execute("UPDATE secretory SET access=access+1, time=CURRENT_TIMESTAMP WHERE sequence='" + hash_sequence + "'")
            conn.commit()
            c.execute("SELECT prediction FROM secretory WHERE sequence='" + hash_sequence + "'")
            data1 = c.fetchone()
            result_file.write(str(record.id) + "\t" + data1[0] + "\n")
    parameter_file.close()

    paraFile = filename + "_parameters.txt"
    libsvm_secretory(paraFile)
    predicted = open(paraFile + ".predict", "r")
    fasta_rec = SeqIO.index(filename, "fasta")
    print predicted
    i = 0
    for pred in predicted:
        print pred
        if int(pred) == 1:
            pred = 'Secretory Protein'
        elif int(pred) == 0:  # elif: pred may already have been replaced by a label string
            pred = 'Non-Secretory Protein'
        result_file.write(seqID_list[i] + "\t" + pred + "\n")
        c.execute("INSERT INTO secretory VALUES ('" + hashlib.md5(str(fasta_rec[seqID_list[i]].seq)).hexdigest() + "', '" + pred + "', 0, CURRENT_TIMESTAMP)")
        i = i + 1
    conn.commit()
    predicted.close()
    result_file.close()

    if secretory_email != "":
        command = ("echo 'Your SchistoProt Prediction Result is attached for job ID: '" + filename +
                   "'\n\n\nKind regards,\n\nLutz Krause & Shihab Hasan\nBioinformatics Lab, QIMR Berghofer Medical Research Institute'" +
                   " | EMAIL='Shihab Hasan <*****@*****.**>' mutt -a " + filename +
                   "'_result.txt' -s 'SchistoProt Prediction Result' -- " + secretory_email)
        subprocess.call(command, shell=(sys.platform != "Linux"))
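The queries above splice values straight into the SQL text; a minimal sketch of the same lookup and update using sqlite3 parameter placeholders (same secretory table and columns as assumed above) would look like this:

# Hedged sketch: `?` placeholders let sqlite3 handle quoting/escaping of the value.
c.execute("SELECT prediction FROM secretory WHERE sequence=?", (hash_sequence,))
row = c.fetchone()
c.execute("UPDATE secretory SET access=access+1, time=CURRENT_TIMESTAMP WHERE sequence=?",
          (hash_sequence,))
conn.commit()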
def eval(path):
    result = getData(path)
    result, src, dst = features(result)
    result = result.reshape(1, -1)
    model, scaler, df = readData()
    result = scaler.transform(result)
    preds = model.predict(result)
    if preds[0] == 1:
        write(src, dst)
    else:
        with open('result.txt', 'w') as f:
            f.write('benign')
def locateChild(self, ctx, segments):
    if segments[0] == "" or segments[0] == "index.html":
        return self, []
    if len(segments[0]) < 4 or len(segments[0]) > 20:
        return rend.NotFound
    elif segments[0] == "site":
        return dyn_site_root(), []
    elif segments[0] == "signup":
        return signup(), []
    elif segments[0] == "upgrade":
        return user_upgrade(), []
    elif segments[0] == "free_account":
        return free_account(), []
    elif segments[0] == "downloads":
        return downloads(), []
    elif segments[0] == "qoop":
        return qoop(), segments[1:]

    if "reset_password" in segments[0]:
        request = inevow.IRequest(ctx)
        if request.args.has_key('username') and request.args.has_key('hash'):
            return reset_password(), []
        else:
            return rend.NotFound
    if "quick_start_guide" in segments[0]:
        return quick_start_guide(), []
    if "features" in segments[0]:
        return features(), []
    if "developers" in segments[0]:
        return developers(), []
    if "publish" in segments[0]:
        return user_publish("unknown"), segments[1:]
    if "community" in segments[0] and "feeds" in segments[1]:
        obj = user_homepage("")
        obj.username = "******"  # have to hack this because the user_homepage ctor lowercases it
        return obj, segments[1:]

    def act_check(count):
        if count:
            return user_homepage(segments[0]), segments[1:]
        else:
            if segments[1] == "img":
                return dyn_image_handler("noserve", self.app, self.log), segments[2:]
            else:
                return rend.NotFound

    d = self.app.api.users.check_exists('username', segments[0])
    d.addCallback(act_check)
    return d
def showTree(self, tree, s='', depth=0):
    for _ in range(depth - 1):
        print('|  ', end="")
    if depth > 0:
        print('-', end="")
        print(s, end="")
        print('-> ', end="")
    if type(tree) == int:  # leaf
        print("class ", self.labelClass[tree])
    else:  # internal node
        feature, test = tree[0]
        feature = list(ft.features().keys())[feature]
        print(feature, " <= ", test, " ?")
        self.showTree(tree[1], s='Y', depth=depth + 1)
        self.showTree(tree[2], s='N', depth=depth + 1)
def vectorize(hands):
    featurelist = [features.features(h) for h in hands]
    (names, revnames) = feature_names(featurelist)
    ret = []
    for feats in featurelist:
        row = [0] * len(names)
        for f in feats:
            idx = names[f]
            row[idx] += 1
        ret.append(row)
    return (ret, names, revnames)
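A short usage sketch for vectorize; the hands are placeholders and feature_names/features are assumed importable as in the function above:

import numpy as np

some_hands = [hand_a, hand_b]            # placeholder hand objects
rows, names, revnames = vectorize(some_hands)
X = np.asarray(rows)                     # one row per hand, one column per feature
print(X.shape, revnames[0])              # revnames maps a column index back to its feature name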
def eval(mymodel, eval_dataloader, classnum, UsingGPU):
    pre_labels = []
    ground_true = []
    for batch in eval_dataloader:
        _features, labels = batch
        featuremodel = features.features()
        if UsingGPU:
            _features = _features.cuda()
            labels = labels.cuda()
            featuremodel = featuremodel.cuda()
        pre_label, _ = mymodel.inference(featuremodel(_features))
        pre_labels = pre_labels + pre_label
        ground_true = ground_true + list(labels.cpu().numpy())
    precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
        ground_true, pre_labels, labels=range(classnum))
    return sum(precision) / 3, sum(recall) / 3, sum(f1) / 3
def eval(mymodel, eval_dataloader, classnum, UsingGPU):
    pre_labels = []
    ground_true = []
    featuremodel = features.features()
    if UsingGPU:
        featuremodel = featuremodel.cuda()
    for batch in eval_dataloader:
        _features, labels = batch
        if UsingGPU:
            _features = _features.cuda()
            labels = labels.cuda()
        pre_label, _ = mymodel.inference(featuremodel(_features))
        pre_labels = pre_labels + pre_label
        ground_true = ground_true + list(np.squeeze(labels.cpu().numpy(), 1))
    precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
        ground_true, pre_labels, labels=range(classnum), average='micro')
    return precision, recall, f1
def read_dataset():
    #X=[]
    #y=[]
    #simavg, weiavg, ROC, stoK, stoD, rsi, MACD, WR, ado, CCI, label = features.features("yahoostock.csv")
    X, y = features.features("yahoostock.csv")
    """X.append(simavg)
    X.append(weiavg)
    X.append(ROC)
    X.append(stoK)
    X.append(stoD)
    X.append(rsi)
    X.append(MACD)
    X.append(WR)
    X.append(ado)
    X.append(CCI)
    y.append(label)
    #print(CCI)
    X = np.array(X)
    X = X.transpose()"""
    #print(X[:,9])
    X = X[27:5010]
    print(X.shape)
    X = pd.DataFrame(X)
    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(X)
    X = pd.DataFrame(np_scaled)
    X = X.values
    #y = np.array(y)
    #y = y.transpose()
    y = y[27:5010]
    y = pd.DataFrame(y)
    #print(y)
    # Encode the dependent variable
    encoder = LabelEncoder()
    encoder.fit(y.values.ravel())
    y = encoder.transform(y.values.ravel())
    Y = one_hot_encode(y)
    #print(X.shape)
    return (X, Y)
def main():
    # Create images & extract features (until user quits)
    doneExtracting = False
    while not doneExtracting:
        pageNames, targets, filenames = pproc.process(sys.argv[1])
        data = ft.features(pageNames)

        # Create and evaluate model (until user quits)
        doneTraining = False
        while not doneTraining:
            tr.train(data, targets, filenames)
            options = ["Try another model", "Extract new features", "Quit"]
            res = options[int(ui.prompt(options=options))]
            if res == "Quit":
                doneTraining = True
                doneExtracting = True
            elif res == "Extract new features":
                doneTraining = True
def read_dataset():
    X, y = features.features("yahoostock.csv")
    X = X[27:5010]
    X = pd.DataFrame(X)
    # normalize data
    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(X)
    X = pd.DataFrame(np_scaled)
    X = X.values
    y = y[27:5010]
    y = pd.DataFrame(y)
    # Encode the dependent variable
    encoder = LabelEncoder()
    encoder.fit(y.values.ravel())
    Y = encoder.transform(y.values.ravel())
    return (X, Y)
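A minimal usage sketch for the loader above, assuming yahoostock.csv and the features module are available exactly as the function expects:

X, Y = read_dataset()
print(X.shape, Y.shape)   # X is min-max scaled, Y holds integer-encoded labels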
def cluster2():
    f = features.features()
    nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(f)
    distances, indices = nbrs.kneighbors(f)
    print "Distances and indices with kneighbors algorithm"
    print distances
    print indices
    K_MEANS = cluster.KMeans(n_clusters=5)
    K_MEANS.fit(f)
    X = np.zeros(len(f))
    Y = np.zeros(len(f))
    i = 0
    for d in f:
        X[i] = d[0]
        Y[i] = d[1]
        i += 1
    CENTERS = K_MEANS.cluster_centers_
    pca = PCA(n_components=2)
    X_r = pca.fit(f).transform(f)
    X1 = []
    Y1 = []
    for i in range(len(X_r)):
        X1.append(X_r[i, 0])
        Y1.append(X_r[i, 1])
    K_MEANS.fit(X_r)
    NCENTERS = K_MEANS.cluster_centers_
    FIG = plt.figure()
    AX = FIG.add_subplot(111)
    SCATTER = AX.scatter(X1, Y1, c=K_MEANS.labels_, s=50)
    for i, j in NCENTERS:
        AX.scatter(i, j, s=50, c='red', marker='+')
    AX.set_xlabel('x')
    AX.set_ylabel('y')
    plt.colorbar(SCATTER)
    plt.show()
def classify(text, trained_model, features1, categories, tr_texts):
    """
    Classifies a text based on (not necessarily) a trained model, categories, features and the training texts.
    Args: String (a text), List of Tuples (String, String), List of Strings, List of Strings
    Returns: String (prints probabilities per category)
    """
    if trained_model == []:
        trained_model = train(tr_texts, categories, features1)
    score_cat = {}
    for c in categories:
        noemer = 1 * p_cat(c, tr_texts)   # 'noemer' = denominator (Dutch)
        teller = 1                        # 'teller' = numerator (Dutch)
        for f in features1:
            if features(f, text):
                noemer = noemer * trained_model[1][c][f]
                teller = teller * trained_model[0][f] + 0.0000001
        score_cat[c] = float(noemer) / teller
    result = "false"  # random.choice(categories)
    max_score = 0
    for c in categories:
        if score_cat[c] > max_score:
            result = c
            max_score = score_cat[c]
    return result
def read_dataset():
    X, y = features.features("yahoostock.csv")
    X = X[27:5010]
    print(X.shape)
    X = pd.DataFrame(X)
    # print(X)
    #X.to_csv('data.csv')
    #min_max_scaler = preprocessing.MinMaxScaler()
    #np_scaled = min_max_scaler.fit_transform(X)
    #X = pd.DataFrame(np_scaled)
    X = X.values
    y = y[27:5010]
    y = pd.DataFrame(y)
    print(y.shape)
    # Encode the dependent variable
    encoder = LabelEncoder()
    encoder.fit(y.values.ravel())
    Y = encoder.transform(y.values.ravel())
    return (X, Y)
def featureset(key=None, promo=None, **kwargs):
    """ Static factory method for creating or retrieving FeatureSet instances. """
    # Local reference to features
    allfeatures = features()

    # Check to see if the specified key indicates a promotion
    if promo not in allfeatures:
        promo = "default"
    promofeatures = allfeatures[promo] if promo != "default" else None

    # Check request and lookup existing feature set, or create new empty set
    key = key or "uuid:%s" % str(uuid.uuid4())
    fs = db.get(db.Key.from_path("FeatureSet", key)) or FeatureSet(key_name=key)
    changed = not fs.is_saved()

    # Fill in promo on featureset
    if not hasattr(fs, "promo") or fs.promo != promo:
        changed = True
        setattr(fs, "promo", promo)

    # Fill in any keyword arguments
    for k, v in kwargs.iteritems():
        if hasattr(fs, k) and getattr(fs, k) == v:
            continue
        changed = True
        setattr(fs, k, v)

    # Fill in FeatureSet instance from allfeatures
    for feature, groups in allfeatures["default"].iteritems():

        # Override groups if in a promo which contains that feature
        if promofeatures and feature in promofeatures:
            groups = promofeatures[feature]

        # If groups is really just one value, the only choice is to set it
        if type(groups) != dict:
            if hasattr(fs, feature) and getattr(fs, feature) == groups:
                continue
            changed = True
            setattr(fs, feature, groups)
            continue

        # If the FeatureSet already has this feature, make sure it's a legal value
        if hasattr(fs, feature):
            oldval = getattr(fs, feature)
            found = False
            for value, frequency in groups.iteritems():
                if oldval == value:
                    found = True
                    break
            if found:
                continue

        # Randomly pick from the feature's groups based on defined frequencies
        stops = []
        last = 0
        for value, frequency in groups.iteritems():
            last += frequency
            stops.append((last, value))
        r = random.uniform(0.0, last)
        for i in range(len(stops)):
            if r < stops[i][0]:
                break

        # Set the feature on the feature set
        changed = True
        setattr(fs, feature, stops[i][1])

    # Save the FeatureSet
    if changed:
        fs.put()

    return fs
def lookup(self, instance):
    ret = defaultdict(int)
    for feature in instance:
        if feature in self.vocab:
            ret[self.vocab[feature]] += 1
        else:
            ret[self.vocab["UNKNOWN"]] += 1
    return ret


if __name__ == "__main__":
    racism = DataSet("racism")
    racism_features = []
    for tweet in tqdm(racism.data):
        racism_features.append(features(preprocess(tweet)))

    sexism = DataSet("sexism")
    sexism_features = []
    for tweet in tqdm(sexism.data):
        sexism_features.append(features(preprocess(tweet)))

    neither = DataSet("neither")
    neither_features = []
    for tweet in tqdm(neither.data):
        neither_features.append(features(preprocess(tweet)))

    vocab = Vocab()
    vocab.add(racism_features)
    vocab.add(sexism_features)
    vocab.add(neither_features)
def run(out_dir, config_fname, data_paths_fname, stats_list_fname, split_fname=None,
        check_if_file_exists=False, verbose=True):
    data_paths = util.read_yaml(data_paths_fname)
    config = util.read_yaml(config_fname)

    stats_key = config['stats_key']
    outcome_stat_name = config['outcome_stat_name']
    cohort_stat_name = config.get('cohort_stat_name', None)
    lab_lower_bound = config.get('lab_lower_bound', None)
    lab_upper_bound = config.get('lab_upper_bound', None)
    gap_days = config.get('gap_days', None)
    training_window_days = config['training_window_days']
    buffer_window_days = config['buffer_window_days']
    outcome_window_days = config['outcome_window_days']
    time_period_days = config['time_period_days']
    time_scale_days = config['time_scale_days']
    use_just_labs = config['use_just_labs']
    feature_loincs_fname = config['feature_loincs_fname']
    add_age_sex = config['add_age_sex']
    calc_gfr = config['calc_gfr']
    regularizations = config.get('regularizations', [1])
    lin_n_cv_iters = config.get('lin_n_cv_iters', -1)
    n_cv_iters = config.get('n_cv_iters', -1)
    progression = config['progression']
    progression_lab_lower_bound = config.get('progression_lab_lower_bound', None)
    progression_lab_upper_bound = config.get('progression_lab_upper_bound', None)
    progression_gap_days = config.get('progression_gap_days', None)
    progression_stages = config.get('progression_stages', None)
    progression_init_stages = config.get('progression_init_stages', None)
    evaluate_nn = config.get('evaluate_nn', True)

    outcome_fname = out_dir + stats_key + '_' + outcome_stat_name + '.txt'
    if cohort_stat_name is None:
        cohort_fname = data_paths['demographics_fname']
    else:
        cohort_fname = out_dir + stats_key + '_' + cohort_stat_name + '.txt'
    gfr_loincs = util.read_list_files('data/gfr_loincs.txt')
    training_data_fname = out_dir + stats_key + '_training_data.txt'

    feature_loincs = util.read_list_files(feature_loincs_fname)
    if use_just_labs == False:
        feature_diseases = [[icd9] for icd9 in util.read_list_files('data/kidney_disease_mi_icd9s.txt')]
        feature_drugs = [util.read_list_files('data/drug_class_' + dc.lower().replace('-', '_').replace(',', '_').replace(' ', '_') + '_ndcs.txt')
                         for dc in util.read_list_files('data/kidney_disease_drug_classes.txt')]
    else:
        feature_diseases = []
        feature_drugs = []

    n_labs = len(feature_loincs)
    if add_age_sex:
        age_index = len(feature_loincs) + len(feature_diseases) + len(feature_drugs)
        gender_index = len(feature_loincs) + len(feature_diseases) + len(feature_drugs) + 1
    else:
        age_index = None
        gender_index = None

    features_fname = out_dir + stats_key + '_features.h5'
    features_split_fname = out_dir + stats_key + '_features_split.h5'
    predict_fname = out_dir + stats_key + '_prediction_results.yaml'
    if evaluate_nn:
        nn_predict_fname = out_dir + stats_key + '_nn_prediction_results.yaml'
    else:
        nn_predict_fname = None

    if verbose:
        print "Loading data"
    db = util.Database(data_paths_fname)
    db.load_people()
    db.load_db(['loinc', 'loinc_vals', 'cpt', 'icd9_proc', 'icd9', 'ndc'])
    stats = util.read_yaml(stats_list_fname)[stats_key]

    if verbose:
        print "Calculating patient stats"
    data = ps.patient_stats(db, stats, stats_key, out_dir, stat_indices=None, verbose=verbose,
                            check_if_file_exists=check_if_file_exists, save_files=True)

    if verbose:
        print "Building training data"
    outcome_data = btd.build_outcome_data(out_dir, outcome_fname)
    cohort_data = btd.setup(data_paths['demographics_fname'], outcome_fname, cohort_fname)
    # calc_gfr = True here because it's required to define the condition
    training_data = btd.build_training_data(db, cohort_data, gfr_loincs, lab_lower_bound, lab_upper_bound,
                                            training_window_days, buffer_window_days, outcome_window_days,
                                            time_period_days, time_scale_days, gap_days,
                                            calc_gfr=True, verbose=verbose,
                                            progression=progression,
                                            progression_lab_lower_bound=progression_lab_lower_bound,
                                            progression_lab_upper_bound=progression_lab_upper_bound,
                                            progression_gap_days=progression_gap_days,
                                            progression_init_stages=progression_init_stages,
                                            progression_stages=progression_stages)
    training_data.to_csv(training_data_fname, index=False, sep='\t')

    if verbose:
        print "Building features"
    features.features(db, training_data, feature_loincs, feature_diseases, feature_drugs, time_scale_days,
                      features_fname, calc_gfr, verbose, add_age_sex)
    if split_fname is None:
        split_fname = out_dir + stats_key + '_split.txt'
        features.train_validation_test_split(training_data['person'].unique(), split_fname, verbose=verbose)
    features.split(features_fname, features_split_fname, split_fname, verbose)

    if verbose:
        print "Training, validating and testing models"
    predict.predict(features_split_fname, lin_n_cv_iters, n_cv_iters, regularizations, n_labs,
                    age_index, gender_index, predict_fname, nn_predict_fname)
# Importing modules
import pandas as pd
import features as f

# Creating feature sets
f2 = f.features()
Dataframe_Easy = f2.create_dataframe()
Dataframe_Medium = f2.create_dataframe()
Dataframe_Tough = f2.create_dataframe()

# Easy
value1 = []
for i in range(0, 33):
    value1.append(1)
df1 = pd.DataFrame({'Class': value1})
Dataframe_Easy = Dataframe_Easy.join(df1)

# Medium
value2 = []
for i in range(0, 33):
    value2.append(2)
df2 = pd.DataFrame({'Class': value2})
Dataframe_Medium = Dataframe_Medium.join(df2)

# Hard
value3 = []
for i in range(0, 33):
    value3.append(3)
df3 = pd.DataFrame({'Class': value3})
Dataframe_Tough = Dataframe_Tough.join(df3)
def extractFeatures(img):
    img = ImageOps.fit(Image.fromarray(img), (32, 32))
    hMats, wFilters = filterBuilder.buildFilters()
    return features.features(np.asarray(img), hMats, wFilters)
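A small usage sketch, assuming an image readable by PIL (the path is a placeholder) and the filterBuilder/features modules used above:

import numpy as np
from PIL import Image

img = np.asarray(Image.open("sample.png"))  # placeholder path
feats = extractFeatures(img)
print(np.shape(feats))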
import pickle

from nltk import NaiveBayesClassifier

from features import features

f1 = open("male.txt")
f2 = open("female.txt")

trainer = NaiveBayesClassifier.train
# strip the trailing newline from each name read from the files
namelist = ([(name.strip(), 'male') for name in f1] +
            [(name.strip(), 'female') for name in f2])

train = namelist[:5000]
classifier = trainer([(features(n), g) for (n, g) in train])

with open('classifier.pickle', 'wb') as outfile:
    pickle.dump(classifier, outfile)
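A hedged usage sketch for the pickled model produced above; it assumes features() accepts a single name string, as in the training loop:

import pickle
from features import features

with open('classifier.pickle', 'rb') as infile:
    classifier = pickle.load(infile)

print(classifier.classify(features('Alex')))  # 'male' or 'female'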
test_loader = Data.DataLoader(
    # draw batch_size samples from the dataset at a time
    dataset=test_dataset,
    batch_size=batchsize,
    shuffle=False,
    num_workers=2,
)
logging.basicConfig(level=logging.INFO,
                    # filename=save_dir + '/log.txt',
                    filemode='w',
                    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

mymodel = model.model(seqlenth, featuresize=4, seqembedding=3, dropout=0.5)
featuremodel = features.features()
if UsingGPU:
    mymodel = mymodel.cuda()
    featuremodel = featuremodel.cuda()
estimation = np.zeros((testingImagNumber, n_class))
for i in range(K):
    # logger.info('*****{}***** fold test start.'.format(i))
    model_list = os.listdir(save_dir + '/Fold_{}'.format(i))
    model_list.sort()
    # print(save_dir+'/baseline_{}/'.format(i)+model_list[-1])
    temp_model = torch.load(save_dir + '/Fold_{}/'.format(i) + model_list[-1])
    mymodel.load_state_dict(temp_model['model_state_dict'])
    mymodel.eval()
    out = torch.FloatTensor()
    pre_labels = []
def __init__(self):
    self.obj = features()
    step = 1
    learning_rate = 0.001
    self.STEPS = 47
    n_input = 60    # data input
    n_hidden = 64   # hidden layer num of features
    n_classes = 32  # total classes output lstm

    self.X_in = tf.placeholder(tf.float32, [None, 2, self.STEPS, n_input])
    self.y = tf.placeholder(tf.float32, [None])
    self.z = tf.placeholder(tf.int32, [None, 2])

    with tf.variable_scope('fc') as scope:
        weights = {
            # (60 * 300)
            'in': tf.Variable(tf.random_normal([n_input, n_hidden]), ),
        }
        biases = {
            'in': tf.Variable(tf.constant(0.1, shape=[n_hidden, ])),
            'mini': tf.constant(0.00001, shape=[1, ])
        }
        scope.reuse_variables()
        X1 = self.X_in[:, 0]
        X2 = self.X_in[:, 1]
        seq1_len = self.z[:, 0]
        seq2_len = self.z[:, 1]
        X1 = tf.reshape(X1, [-1, n_input])
        X1 = tf.matmul(X1, weights['in']) + biases['in']
        X1 = tf.reshape(X1, [-1, self.STEPS, n_hidden])
        X2 = tf.reshape(X2, [-1, n_input])
        X2 = tf.matmul(X2, weights['in']) + biases['in']
        X2 = tf.reshape(X2, [-1, self.STEPS, n_hidden])
        X1 = tf.nn.relu(X1)
        X2 = tf.nn.relu(X2)

    size = tf.shape(seq1_len)[0]
    with tf.name_scope("layer1"):
        with tf.variable_scope("rnn_1"):
            lstm_cell = tf.contrib.rnn.BasicLSTMCell(32)
            init_state = lstm_cell.zero_state(size, dtype=tf.float32)
            outputs1, _1 = tf.nn.dynamic_rnn(lstm_cell, X1, sequence_length=seq1_len,
                                             dtype=tf.float32, initial_state=init_state)
            outputs2, _2 = tf.nn.dynamic_rnn(lstm_cell, X2, sequence_length=seq2_len,
                                             dtype=tf.float32, initial_state=init_state)
            outputs1 = tf.nn.relu(outputs1)
            outputs2 = tf.nn.relu(outputs2)

    with tf.name_scope("layer2"):
        with tf.variable_scope("rnn_2"):
            lstm_cell_b = tf.contrib.rnn.BasicLSTMCell(32)
            init_state_b = lstm_cell_b.zero_state(size, dtype=tf.float32)
            __, states1 = tf.nn.dynamic_rnn(lstm_cell_b, outputs1, sequence_length=seq1_len,
                                            dtype=tf.float32, initial_state=None)
            __, states2 = tf.nn.dynamic_rnn(lstm_cell_b, outputs2, sequence_length=seq2_len,
                                            dtype=tf.float32, initial_state=None)

    norm1 = tf.sqrt(tf.reduce_sum(tf.square(states1[1]), axis=1))
    norm2 = tf.sqrt(tf.reduce_sum(tf.square(states2[1]), axis=1))
    dot = tf.reduce_sum(tf.multiply(states1[1], states2[1]), axis=1)
    final = dot / tf.add(tf.multiply(norm1, norm2), biases['mini'])

    with tf.name_scope("prediction"):
        final = final * 0.5 + 0.5
        self.pred = tf.reshape(final, [-1])

    with tf.name_scope("cost"):
        tv = tf.trainable_variables()
        # l2_cost = 0.00001 * tf.reduce_sum([tf.nn.l2_loss(v) for v in tv])
        self.cost = -tf.reduce_mean(self.y * tf.log(self.pred + 0.00001)
                                    + (1 - self.y) * tf.log(1 - self.pred + 0.00001))

    self.train_op = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
    init_op = tf.global_variables_initializer()
    # merged_summary = tf.summary.merge_all()
    self.saver = tf.train.Saver(tf.global_variables())
    # config = tf.ConfigProto()
    # config.gpu_options.per_process_gpu_memory_fraction = 0.6
    self.sess = tf.Session()
    self.sess.run(init_op)
    ckpt = tf.train.get_checkpoint_state('/Users/ivanfzh/Desktop/graduation_proj/fzh/save_w2v/')
    self.saver.restore(self.sess, ckpt.model_checkpoint_path)
def buildbow(word_count_threshold, content):
    instance = features(word_count_threshold)
    word_counts, wordtoix = instance.extractwords(content)
    return word_counts, wordtoix
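A quick hedged example of buildbow; it assumes features(word_count_threshold) exposes extractwords()/bagofwords() exactly as used in the surrounding code, and the documents are placeholders:

docs = ["the quick brown fox", "the lazy dog jumps"]
word_counts, wordtoix = buildbow(word_count_threshold=1, content=docs)
feats = features(1).bagofwords(docs, word_counts, wordtoix)
print(len(word_counts), feats.shape)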
json.dump(word_counts, file(os.path.join(filepath, dictname), 'w'))
json.dump(wordtoix, file(os.path.join(filepath, dict2idx), 'w'))
N = len(word_counts)

# process the length of each class
for i in range(1, len(nums)):
    nums[i] = nums[i-1] + nums[i]

cid = 0  # class
output = np.zeros((nums[len(nums)-1], N+1))
for url in urllists:
    urls = geturls(url)
    print urls
    content = getdata(urls, depth)
    instance = features(word_count_threshold)
    feats = instance.bagofwords(content, word_counts, wordtoix)
    print feats.shape
    currlen = len(content)
    b = np.zeros((currlen, N+1))
    print b[:, :-1].shape
    b[:, 0:N] = feats
    b[:, -1] = cid
    output[nums[cid]:nums[cid+1], :] = b
    cid = cid + 1

np.savetxt(os.path.join(filepath, filename), output, delimiter=',')  # X is an array
#output = np.loadtxt(os.path.join(filepath,filename), delimiter=',', unpack=True)
#print output.shape
#output = output.T
sql = "SELECT `ID` FROM `{0}`.`{1}` WHERE ".format( parameter_info["database_source"], parameter_info["table_captured"]) where_info = "" types = str(parameter_info["type"]).split(',') for index in range(0, len(types)): where_info += "`TYPE`='{0}'".format(types[index]) if index is not len(types) - 1: where_info += " OR " sql += where_info + ";" print(sql) cursor_source.execute(sql) IDs = list() temp_result = cursor_source.fetchall() cursor_source.close() db_source.close() for item in temp_result: IDs.append(item[0]) print("Get ID : {0}".format(IDs[len(IDs) - 1])) # with ProcessPoolExecutor(int(parameter_info["threads"])) as process_executor: # for source_id in IDs: # process_executor.submit(features.features, parameter_info, int(source_id)) for source_id in IDs: features.features(parameter_info, int(source_id)) print("All finished.")
# -*- coding:utf-8 -*-
from sklearn import svm
from sklearn import linear_model
import numpy as np
import pickle
import jieba
import random
from features import features
from sklearn import metrics
import json

LENGTH = 60
DATA_PATH = 'data.txt'
obj = features()
STATISTICAL_LENGTH = 8


def split(sent):
    return list(jieba.cut(sent))


def load_w2v(path):
    f1 = open(path, 'r')
    y_label = []
    x_set = []
    line_num = 0
    for line in f1.readlines():
        li = line.strip().split('\t')
        if li[0] == '0':
            y_label.append(0)