def train(self, dataFile):
    '''Trains the Naive Bayes Sentiment Classifier.'''
    dr = DataReader(dataFile)
    label, data = dr.next()
    while label:
        try:
            if label not in self.word_counts:
                self.word_counts[label] = {}
            if label in self.docs:
                self.docs[label] += 1
            else:
                self.docs[label] = 1
            self.total_docs += 1
            for i in range(len(data)):
                if data[i] in self.word_counts[label]:
                    self.word_counts[label][data[i]] += 1
                else:
                    self.word_counts[label][data[i]] = 1
            label, data = dr.next()
        except StopIteration:
            # Calculate the total number of words / label
            for label, label_words in self.word_counts.items():
                self.word_sums[label] = 0
                for word, word_count in label_words.items():
                    self.word_sums[label] += word_count
            self.save(dataFile + ".pickle")
            return
def constructPredictionWithOutput(classifier, classifierIndex, xTest, testBatchIndex):
    print "Predicting with classifier {}".format(classifierIndex)
    yPred = classifier.predict_proba(xTest)
    print "Writing to csv..."
    outputFileName = "data\\ensembleTraining\\out" + str(classifierIndex) + ".csv"
    dataReader.writePredToCsv(yPred, testBatchIndex, outputFileName=outputFileName)
def constructTrainingData(trainDataSize):
    # training data
    trainData = dataReader.getTrainData(trainDataSize)
    trainData = trainData.append(dataReader.getSuffixDataFrame())
    # feature engineering
    trainData = regularFeatExtr.convertTargetFeatureToNumeric(trainData)
    xTrain, yTrain = regularFeatExtr.getRegularFeatures(trainData, True)
    return xTrain, yTrain
def train(self, dataFile):
    '''Trains the Naive Bayes Sentiment Classifier.'''
    dr = DataReader(dataFile)
    label, data = dr.next()
    while label:
        try:
            if label not in self.word_counts:
                self.word_counts[label] = {}
            if label in self.docs:
                self.docs[label] += 1
            else:
                self.docs[label] = 1
            self.total_docs += 1
            # Add counts for individual words
            for i in range(len(data)):
                if data[i] in self.word_counts[label]:
                    self.word_counts[label][data[i]] += 1
                else:
                    self.word_counts[label][data[i]] = 1
            # Add counts for bigrams to the same dictionary
            for i in range(len(data) / 2):  # implementing bigrams instead of unigrams
                j = 2 * i
                bigram = data[j] + " " + data[j + 1]
                if bigram in self.word_counts[label]:
                    self.word_counts[label][bigram] += 1
                    if data[j].isupper():
                        self.word_counts[label][bigram] += .5  # counts a word an extra half time if it is all caps
                    if data[j + 1].isupper():
                        self.word_counts[label][bigram] += .5  # counts a word an extra half time if it is all caps
                else:
                    self.word_counts[label][bigram] = 1
            label, data = dr.next()
        except StopIteration:
            # Calculate the total number of words / label
            for label, label_words in self.word_counts.items():
                self.word_sums[label] = 0
                for word, word_count in label_words.items():
                    self.word_sums[label] += word_count
            self.save(dataFile + ".pickle")
            return
def make_data_from_file(feature_type, input_folder, start, end):
    """
    Read data from original feature files directly
    feature_type should be a number: 0 - chi1, 1 - chi2, 2 - hbonds, 3 - rmsd
    """
    data = []
    # data = numpy.array([])
    feature_len = 0
    all_data = dr.preprocess(start, end, input_folder)
    for f in all_data[feature_type]:
        f = np.asarray(f)
        data.append((f, f))
        if feature_len == 0:
            feature_len = len(f)
    all_data = []  # release the memory
    X = np.asarray(data)
    if feature_type == 2:
        # special treatment for hbonds [0,3]
        X_std = X / 3.1
    else:
        X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
    data = [(np.reshape(x, (len(x), 1)), np.reshape(y, (len(y), 1))) for x, y in X_std]
    # pickle.dump(data, open("/output/" + features[feature_type] + "array", "wb"))
    return data, feature_len
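# Illustrative usage sketch for the loader above -- the folder name and frame range below are
# assumptions for demonstration, not values taken from the original project.
if __name__ == "__main__":
    # feature_type=1 selects the chi2 features; each returned sample is an (input, target)
    # pair holding the same normalized feature vector, reshaped to a column vector.
    chi2_data, n_features = make_data_from_file(1, "input_features/", 0, 100)
    print(len(chi2_data), "samples,", n_features, "features per sample")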
def getRegularFeatures(data, isTrainData):
    data = performRegularFeatureEngineering(data, isTrainData)
    # splitting data into X and Y
    if 'Category' in data.columns.values:
        yData = data.Category
        data = data.drop(['Category'], 1)
    else:
        yData = []
    xData = data.values
    dataReader.serializeObject(data.columns.values, "data\\misc\\columns.csv")
    print "Features used {}".format(data.columns.values)
    return xData, yData
def splitTestDataIntoChunks():
    testData = dataReader.getWholeTestData()
    miniDataFrames = np.array_split(testData, numberOfPartitions)
    for i in range(numberOfPartitions):
        outputFileName = 'data\\miniTestData\\miniDataFrame' + str(i) + '.csv'
        miniDataFrames[i].to_csv(outputFileName, index=False)
def calculateLearningCurve():
    classifier = classifierSelector.constructGradientBoostingClassifier()
    trainData = dataReader.getTrainData()
    # feature engineering
    trainData = featureExtractor.convertTargetFeatureToNumeric(trainData)
    xTrain, yTrain = featureExtractor.getRegularFeatures(trainData, True)
    trainSizes = np.linspace(100000, 500000, 5, dtype=int)
    plot_learning_curve(classifier, xTrain, yTrain, trainSizes)
def test(self, dataName, logFilename):
    ''' Tests against dataName and logs to logFilename. '''
    dr = DataReader(dataName)
    correct = 0
    total = 0
    found_counts = {}
    actual_counts = {}
    label, data = dr.next()
    log = open(logFilename, 'w')
    while (label, data):  # a non-empty tuple is always truthy; the loop exits via StopIteration
        try:
            if label in actual_counts:
                actual_counts[label] += 1
            else:
                actual_counts[label] = 1
            total += 1
            string = ""
            for i in data:
                string += i + " "
            bayes_label, bayes_prob = self.classify(string)
            # print "Result:" + bayes_label + " Correct Label: " + label
            # log.write("Result:" + bayes_label + " Correct Label: " + label + "\n")
            if bayes_label == label:
                if bayes_label in found_counts:
                    found_counts[bayes_label] += 1
                else:
                    found_counts[bayes_label] = 1
                correct += 1
            label, data = dr.next()
        except StopIteration:
            # log per-label accuracy, then return the overall accuracy
            for k, v in actual_counts.items():
                if k in found_counts:
                    log.write(k + " " + str(float(found_counts[k]) / actual_counts[k]) + "\n")
                else:
                    log.write(k + " 0\n")
            log.close()
            return float(correct) / total
def createEnsembleBasedODifferentTrainingSets():
    # constructing the limits
    margins = np.linspace(0, 878000, 5, dtype=int)
    marginTuples = []
    for i in range(len(margins) - 1):
        marginTuples.append((margins[i], margins[i + 1]))
    # training classifiers
    allClassifiers = Parallel(n_jobs=-1)(delayed(mainScript.trainClassifierOnTrainingData)(margins=marginTuple)
                                         for marginTuple in marginTuples)
    # Predicting on batch test data
    partitionNumber = utils.numberOfPartitions
    for batchIndex in range(partitionNumber):
        print "Predicting batch {}".format(batchIndex)
        miniTestData = dataReader.getSerializedMiniTestData(batchIndex)
        xTest, yTest = mainScript.constructTestData(miniTestData)
        for classifierIndex, currentClassifier in enumerate(allClassifiers):
            constructPredictionWithOutput(currentClassifier, classifierIndex, xTest, batchIndex)
    # post process
    print "Post processing everything..."
    outputFileNames = ["data\\ensembleTraining\\out" + str(index) + ".csv" for index in range(len(allClassifiers))]
    for outputFileName in outputFileNames:
        dataReader.postProcessCsv(outputFileName=outputFileName)
    # Merging everything together
    print "Merging all solutions...."
    fileRegex = "data\\ensembleTraining\\*.csv"
    createEnsembleBasedOnExitingPredictions(fileRegex=fileRegex)
def trainClassifierOnTrainingDataReturnAll(numberOfTrainingExamples=-1):
    trainData = dataReader.getTrainData(numberOfTrainingExamples)
    # feature engineering
    trainData = featureExtractor.convertTargetFeatureToNumeric(trainData)
    xTrain, yTrain = featureExtractor.getRegularFeatures(trainData, True)
    # classifier training
    classifier = classifierSelector.trainClassifier(xTrain, yTrain)
    return classifier, xTrain, yTrain
def calculateValidationCurve():
    classifier = classifierSelector.constructGradientBoostingClassifier()
    numberOfTrainData = 50000
    trainData = dataReader.getTrainData(numberOfTrainData)
    # feature engineering
    trainData = featureExtractor.convertTargetFeatureToNumeric(trainData)
    xTrain, yTrain = featureExtractor.getRegularFeatures(trainData, True)
    paramRange = [0.1, 0.13, 0.16]
    plot_validation_curve(classifier, xTrain, yTrain, "learning_rate", paramRange)
def load_image(self, img_num, load_step):
    dir_start = self.init_dirs[load_step]
    dir_num = str(dir_start + img_num)
    dir_num_dark = str(self.dark_dirs[load_step])
    im_dir = os.path.join(self.data_dir, dir_num, 'ff')
    im_file = os.listdir(im_dir)
    assert len(im_file) == 1
    im_path = os.path.join(im_dir, im_file[0])
    dark_dir = os.path.join(self.data_dir, dir_num_dark, 'ff')
    dark_file = os.listdir(dark_dir)
    assert len(dark_file) == 1
    dark_path = os.path.join(dark_dir, dark_file[0])
    dark_image = DataReader.ge2_reader_image(dark_path, 0)
    if len(dark_image.shape) > 1:
        dark_image = np.mean(dark_image, axis=0)
    ring_image = DataReader.ge2_reader_image(im_path, 0)
    img = ring_image - dark_image
    return img
def trainClassifierOnTrainingData(trainData=None, numberOfTrainingExamples=-1, margins=None):
    if trainData is None:
        trainData = dataReader.getTrainData(numberOfTrainingExamples, margins)
    # feature engineering
    trainData = regularFeatExtr.convertTargetFeatureToNumeric(trainData)
    xTrain, yTrain = regularFeatExtr.getRegularFeatures(trainData, True)
    # classifier training
    classifier = classifierSelector.trainClassifier(xTrain, yTrain)
    return classifier
def predictForSubmission():
    startTime = time.time()
    allAlgorithmStartTime = startTime
    numberOfTrainingExamples = -1
    # pass by keyword so -1 is not mistaken for the trainData argument
    classifier = trainClassifierOnTrainingData(numberOfTrainingExamples=numberOfTrainingExamples)
    print "Beginning to load test data..."
    partitionNumber = utils.numberOfPartitions
    for index in range(partitionNumber):
        miniTestData = dataReader.getSerializedMiniTestData(index)
        xTest, yTest = constructTestData(miniTestData)
        print "Predicting..."
        yPred = classifier.predict_proba(xTest)
        dataReader.writePredToCsv(yPred, index)
    print "Post processing..."
    dataReader.postProcessCsv()
    print("Total run time:{}".format(time.time() - allAlgorithmStartTime))
def plotFeatureImportance(classifier):
    featureNames = dataReader.deserializeObject("data\\misc\\columns.csv")
    feature_importance = classifier.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.subplot(1, 2, 2)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, featureNames[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()
def getDifferentTrainAndTestData(trainDataSize, testDataSize):
    data = dataReader.getWholeTrainingData()
    if trainDataSize + testDataSize > data.shape[0]:
        # request more rows than the DF has
        print "Getting different train & test data with possible duplicates"
        trainData = data.sample(trainDataSize)
        testData = data.sample(testDataSize)
    else:
        print "Getting totally different train & test data"
        indexes = np.arange(data.shape[0])  # 0 -> 873k
        random.shuffle(indexes)  # works in-place
        trainData = data.ix[indexes[0:trainDataSize]]
        testData = data.ix[indexes[trainDataSize + 1:trainDataSize + 1 + testDataSize]]
    return trainData, testData
def testParameterPerformance():
    startTime = time.time()
    allAlgorithmStartTime = startTime
    # define sizes
    trainDataSize = 10000
    testDataSize = 100000
    trainData, testData = utils.getDifferentTrainAndTestData(trainDataSize, testDataSize)
    # in order to assure that we have members from each class present
    testData = testData.append(dataReader.getSuffixDataFrame())
    classifier = trainClassifierOnTrainingData(trainData=trainData)
    xTest, yTest = constructTestData(testData)
    yPred = classifier.predict(xTest)
    validator.performValidation(yPred, yTest)
    print("Total run time:{} s".format((time.time() - allAlgorithmStartTime)))
while iteration * self.hparams.batch_size < self.hparams.training_size:
    train_cost, train_accuracy = self.sess.run([self.train_loss, self.accuracy])
    print("iterations: [%2d] time: %4.4f, loss: %.8f, accuracy: %.8f"
          % (iteration, time.time() - start_time, np.mean(train_cost), train_accuracy))
coord.request_stop()
coord.join(threads)


if __name__ == '__main__':
    dataset_name = "cnn"
    dataset_dir = "../data_2"
    dr = DataReader()
    hparams = tf.flags
    hparams.DEFINE_integer("training_size", 381000, "total number of training samples")  # 381000
    hparams.DEFINE_integer("number_of_epochs", 200, "Epoch to train [25]")
    hparams.DEFINE_integer("vocab_size", 10000, "The size of vocabulary [10000]")
    hparams.DEFINE_integer("batch_size", 32, "The size of batch images [32]")
    hparams.DEFINE_integer("depth", 1, "Depth [1]")
    hparams.DEFINE_integer("max_nsteps", 1000, "Max number of steps [1000]")
    hparams.DEFINE_integer("number_of_hidden_units", 512, "The size of hidden layers")
    hparams.DEFINE_float("learning_rate", 5e-5, "Learning rate [0.00005]")
    hparams.DEFINE_float("momentum", 0.9, "Momentum of RMSProp [0.9]")
    hparams.DEFINE_float("keep_prob", 0.7, "keep_prob [0.5]")
    metrics = {}
    metrics["muc"] = (mr, mp, mf)
    metrics["b3"] = (br, bp, bf)
    metrics["ceaf"] = (cr, cp, cf)
    return metrics


def print_performance(m):
    # unpack in the same (recall, precision, f1) order the tuples were stored in
    mr, mp, mf = m["muc"]
    print "MUC: recall: %f precision: %f f1: %f" % (mr, mp, mf)
    br, bp, bf = m["b3"]
    print "BCUBED: recall: %f precision: %f f1: %f" % (br, bp, bf)
    cr, cp, cf = m["ceaf"]
    print "CEAF: recall: %f precision: %f f1: %f" % (cr, cp, cf)


if __name__ == "__main__":
    #network_file = "./model/pretrain/network_model_pretrain.best"
    network_file = "./model/pretrain/network_model_pretrain.top.best"
    #network_file = "./model/model.pkl"
    print >> sys.stderr, "Read model from", network_file
    network_model = torch.load(network_file)
    #dev_docs = DataReader.DataGnerater("dev")
    dev_docs = DataReader.DataGnerater("test")
    best_thres = 0.4
    best_thres = evaluate(network_model, dev_docs, best_thres)
def _get_data_loader(self, data_conf):
    loader = DataReader(data_conf, self.logger, self.n_fold)
    return loader
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import nltk
import DataReader
import Wordsmith
import json

entries = nltk.corpus.cmudict.entries()
dictionary = dict(entries)
bucket = Wordsmith.bucket
wordlist = dictionary.keys()
wordlist.extend(DataReader.collocationEntries())


def seed():
    count = 0
    for w in wordlist:
        bucket.add(w)
        count += 1
        if count % 20000 == 0:
            print str((count / 20000) * 10) + " percent done loading"
    f = open('./nltk_data/bucketstore', 'a')
    f.write(json.dumps(bucket.buckets))
print >> sys.stderr, "Read model from ", best_network_file best_network_model = torch.load(best_network_file) manager = network.Network( nnargs["pair_feature_dimention"], nnargs["mention_feature_dimention"], nnargs["word_embedding_dimention"], nnargs["span_dimention"], 1000, nnargs["embedding_size"], nnargs["embedding_dimention"], embedding_matrix).cuda() net_copy(manager, best_network_model) reduced = "" if args.reduced == 1: reduced = "_reduced" #dev_docs = DataReader.DataGnerater("dev"+reduced) test_docs = DataReader.DataGnerater("test" + reduced) metric = performance(test_docs, worker, manager) print "Ave", metric["average"] #network_file = "./model/network_model_pretrain.top.best" #network_model = torch.load(network_file) #ana_network_file = "./model/network_model_pretrain.top.best" #ana_network_model = torch.load(ana_network_file) #reduced="" #if args.reduced == 1: # reduced="_reduced" #metric = performance(test_docs,network_model,ana_network_model)
def main():
    solar_data = []
    solar_data += DataReader.get_daily_totals_for_file('Data/2005.csv')
    solar_data += DataReader.get_daily_totals_for_file('Data/2006.csv')
    solar_data += DataReader.get_daily_totals_for_file('Data/2007.csv')
    solar_data += DataReader.get_daily_totals_for_file('Data/2008.csv')
    solar_data += DataReader.get_daily_totals_for_file('Data/2009.csv')
    print("Imported Data")
    print("Running Simulation:")
    num_of_sims = 0
    with open('Output/rs.csv', 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['Smart Agents Wealth', 'Smart Agents Trades',
                         'Smart Agents Wealth (all)', 'Smart Agents Trades (all)',
                         'Controlled Agents Wealth', 'Controlled Agents Trades',
                         'Controlled Agents Wealth (all)', 'Controlled Agents Trades (all)',
                         'All Agents Wealth', 'All Agents Trades',
                         'All Agents Wealth (all)', 'All Agents Trades (all)',
                         'Smart Agents Price Correlation', 'Controlled Agents Price Correlation',
                         'All Agents Price Correlation'])
        for i in range(num_of_sims):
            print("Simulation:" + str(i + 1))
            market = Market(100, 10)
            for i in range(len(solar_data)):
                weather = solar_data[i]
                market.update(float(weather) / 1000.0)
                price = market.price_history[-1]
                supply = market.asks[-1]
                demand = market.bids[-1]
            smart_agents = [agent for agent in market.agents if agent.use_brain == True]
            controlled_agents = market.agents[:10]
            all_agents = market.agents
            smart_wealth = sum([agent.wealth for agent in smart_agents if agent.wealth > 0]) / len(smart_agents)
            smart_no_trades = sum([agent.no_trades for agent in smart_agents if agent.wealth > 0]) / len(smart_agents)
            smart_wealth_a = sum([agent.wealth for agent in smart_agents]) / len(smart_agents)
            smart_no_trades_a = sum([agent.no_trades for agent in smart_agents]) / len(smart_agents)
            controlled_wealth = sum([agent.wealth for agent in controlled_agents if agent.wealth > 0]) / len(controlled_agents)
            controlled_no_trades = sum([agent.no_trades for agent in controlled_agents if agent.wealth > 0]) / len(controlled_agents)
            controlled_wealth_a = sum([agent.wealth for agent in controlled_agents]) / len(controlled_agents)
            controlled_no_trades_a = sum([agent.no_trades for agent in controlled_agents]) / len(controlled_agents)
            all_wealth = sum([agent.wealth for agent in all_agents if agent.wealth > 0]) / len(all_agents)
            all_no_trades = sum([agent.no_trades for agent in all_agents if agent.wealth > 0]) / len(all_agents)
            all_wealth_a = sum([agent.wealth for agent in all_agents]) / len(all_agents)
            all_no_trades_a = sum([agent.no_trades for agent in all_agents]) / len(all_agents)
            smart_price_history = [agent.price_history for agent in smart_agents]
            smart_price_history = [sum(col) / float(len(col)) for col in zip(*smart_price_history)]
            smart_prediction_accuracy = str(numpy.corrcoef(smart_price_history, market.price_history)[0][1])
            controlled_price_history = [agent.price_history for agent in controlled_agents]
            controlled_price_history = [sum(col) / float(len(col)) for col in zip(*controlled_price_history)]
            controlled_prediction_accuracy = str(numpy.corrcoef(controlled_price_history, market.price_history)[0][1])
            all_price_history = [agent.price_history for agent in all_agents]
            all_price_history = [sum(col) / float(len(col)) for col in zip(*all_price_history)]
            all_prediction_accuracy = str(numpy.corrcoef(all_price_history, market.price_history)[0][1])
            writer.writerow([smart_wealth, smart_no_trades, smart_wealth_a, smart_no_trades_a,
                             controlled_wealth, controlled_no_trades, controlled_wealth_a, controlled_no_trades_a,
                             all_wealth, all_no_trades, all_wealth_a, all_no_trades_a,
                             smart_prediction_accuracy, controlled_prediction_accuracy, all_prediction_accuracy])

    print("Creating Graphs:")
    market = Market(100, 10)
    for i in range(len(solar_data)):
        if int(i % (len(solar_data) / 100)) == 0:
            print(str(int(i / len(solar_data) * 100)) + "%")
        weather = solar_data[i]
        market.update(float(weather) / 1000.0)
        price = market.price_history[-1]
        supply = market.asks[-1]
        demand = market.bids[-1]

    solar_plot = pyplot.figure()
    a = solar_plot.add_subplot(111)
    a.plot(range(len(solar_data)), solar_data, label='Solar Radiance')
    a.legend()
    a.set_ylabel('kWh per m^2')
    a.set_xlabel('Day')
    solar_plot.savefig('Output/solar_radiance.png')

    aggregate_supply_demand_plot = pyplot.figure()
    b = aggregate_supply_demand_plot.add_subplot(111)
    b.plot(range(len(solar_data)), market.bids, label='Aggregate Demand')
    b.plot(range(len(solar_data)), market.asks, label='Aggregate Supply')
    b.legend()
    b.set_ylabel('Quantity')
    b.set_xlabel('Day')
    aggregate_supply_demand_plot.savefig('Output/supply_demand_history.png')

    price_history_plot = pyplot.figure()
    c = price_history_plot.add_subplot(111)
    c.plot(range(len(solar_data)), market.price_history, label='Average Trading Price')
    c.legend()
    c.set_ylabel('Price')
    c.set_xlabel('Day')
    c.set_ylim(12, 22)
    price_history_plot.savefig('Output/price_history.png')

    # Plot average price expectation
    smart_agents = [agent for agent in market.agents if agent.use_brain == True]
    smart_price_history = [agent.price_history for agent in smart_agents]
    smart_price_history = [sum(col) / float(len(col)) for col in zip(*smart_price_history)]
    smart_price_prediction = pyplot.figure()
    d = smart_price_prediction.add_subplot(111)
    d.plot(range(len(solar_data)), market.price_history, label='Average Trading Price')
    d.plot(range(len(solar_data)), smart_price_history, label='Smart Agent Predicted Price')
    d.legend()
    d.set_ylabel('Price')
    d.set_xlabel('Day')
    d.set_ylim(12, 22)
    smart_price_prediction.savefig('Output/smart_price_history.png')

    controlled_agents = market.agents[:10]
    controlled_price_history = [agent.price_history for agent in controlled_agents]
    controlled_price_history = [sum(col) / float(len(col)) for col in zip(*controlled_price_history)]
    controlled_price_prediction = pyplot.figure()
    e = controlled_price_prediction.add_subplot(111)
    e.axis('equal')
    e.scatter(market.price_history, controlled_price_history, label='Control Group Predicted Price', color='blue', alpha=0.5, s=10)
    e.scatter(market.price_history, smart_price_history, label='Machine Learning Predicted Price', color='green', alpha=0.5, s=10)
    e.legend()
    e.set_ylabel('Predicted Price')
    e.set_xlabel('Actual Price')
    e.set_ylim(12, 20)
    e.set_xlim(12, 20)
    controlled_price_prediction.savefig('Output/controlled_price_history.png')

    all_agents = market.agents
    all_price_history = [agent.price_history for agent in all_agents]
    all_price_history = [sum(col) / float(len(col)) for col in zip(*all_price_history)]
    all_price_prediction = pyplot.figure()
    f = all_price_prediction.add_subplot(111)
    f.plot(range(len(solar_data)), market.price_history, label='Average Trading Price')
    f.plot(range(len(solar_data)), all_price_history, label='All Group Predicted Price')
    f.legend()
    f.set_ylabel('Price')
    f.set_xlabel('Day')
    f.set_ylim(12, 22)
    all_price_prediction.savefig('Output/all_price_history.png')

    # Sample Supply Demand Curve
    for agent in market.agents:
        agent.day_begin(5.0, market)
    buyers = [agent.price for agent in [agent for agent in market.agents if agent.demand > 0]]
    sellers = [agent.price for agent in [agent for agent in market.agents if agent.supply > 0]]
    buyers.sort(reverse=True)
    sellers.sort()
    while len(sellers) < len(buyers):
        buyers.pop()
    while len(buyers) < len(sellers):
        sellers.pop()
    supply_demand = pyplot.figure()
    g = supply_demand.add_subplot(111)
    g.plot(range(len(sellers)), sellers, label='Supply')
    g.plot(range(len(buyers)), buyers, label='Demand')
    g.legend()
    g.set_ylabel('Price')
    g.set_xlabel('Quantity')
    g.set_ylim(12, 22)
    supply_demand.savefig('Output/supply_demand.png')
    print("Done")
def main():
    DIR = args.DIR
    embedding_file = args.embedding_dir

    best_network_file = "./model/pretrain/network_model_pretrain.best"
    print >> sys.stderr, "Read model from", best_network_file
    best_network_model = torch.load(best_network_file)

    embedding_matrix = numpy.load(embedding_file)
    "Building torch model"
    network_model = network.Network(pair_feature_dimention, mention_feature_dimention, word_embedding_dimention, span_dimention, 1000, embedding_size, embedding_dimention, embedding_matrix).cuda()
    print >> sys.stderr, "save model ..."
    #torch.save(network_model,network_file)
    net_copy(network_model, best_network_model)

    reduced = ""
    if args.reduced == 1:
        reduced = "_reduced"

    print >> sys.stderr, "prepare data for train ..."
    train_docs = DataReader.DataGnerater("train" + reduced)
    print >> sys.stderr, "prepare data for dev and test ..."
    dev_docs = DataReader.DataGnerater("dev" + reduced)
    test_docs = DataReader.DataGnerater("test" + reduced)

    l2_lambda = 1e-6
    lr = 0.0002
    dropout_rate = 0.5
    shuffle = True
    times = 0
    best_thres = 0.5

    model_save_dir = "./model/pretrain/"

    last_cost = 0.0
    all_best_results = {'thresh': 0.0, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

    for echo in range(100):
        start_time = timeit.default_timer()
        print "Pretrain Epoch:", echo
        #if echo == 100:
        #    lr = lr/2.0
        #if echo == 150:
        #    lr = lr/2.0
        #optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, network_model.parameters()), lr=lr, weight_decay=l2_lambda)
        #optimizer = optim.RMSprop(network_model.parameters(), lr=lr, weight_decay=l2_lambda)
        optimizer = optim.RMSprop(network_model.parameters(), lr=lr, eps=1e-5, weight_decay=l2_lambda)

        pair_cost_this_turn = 0.0
        ana_cost_this_turn = 0.0
        pair_nums = 0
        ana_nums = 0
        pos_num = 0
        neg_num = 0
        inside_time = 0.0

        for data in train_docs.train_generater(shuffle=shuffle, top=True):
            mention_word_index, mention_span, candi_word_index, candi_span, feature_pair, pair_antecedents, pair_anaphors,\
            target, positive, negative, anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target, top_x = data

            mention_index = autograd.Variable(torch.from_numpy(mention_word_index).type(torch.cuda.LongTensor))
            mention_span = autograd.Variable(torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))
            anaphoricity_index = autograd.Variable(torch.from_numpy(anaphoricity_word_indexs).type(torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(torch.from_numpy(anaphoricity_spans).type(torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(torch.from_numpy(anaphoricity_features).type(torch.cuda.FloatTensor))
            reindex = autograd.Variable(torch.from_numpy(top_x["score_index"]).type(torch.cuda.LongTensor))
            start_index = autograd.Variable(torch.from_numpy(top_x["starts"]).type(torch.cuda.LongTensor))
            end_index = autograd.Variable(torch.from_numpy(top_x["ends"]).type(torch.cuda.LongTensor))
            top_gold = autograd.Variable(torch.from_numpy(top_x["top_gold"]).type(torch.cuda.FloatTensor))

            anaphoricity_gold = anaphoricity_target.tolist()
            ana_lable = autograd.Variable(torch.cuda.FloatTensor([anaphoricity_gold]))

            optimizer.zero_grad()

            output, output_reindex = network_model.forward_top_pair(word_embedding_dimention, mention_index, mention_span, candi_index, candi_spans, pair_feature, anaphors, antecedents, reindex, start_index, end_index, dropout_rate)
            loss = F.binary_cross_entropy(output, top_gold, size_average=False) / train_docs.scale_factor_top

            ana_output, _ = network_model.forward_anaphoricity(word_embedding_dimention, anaphoricity_index, anaphoricity_span, anaphoricity_feature, dropout_rate)
            ana_loss = F.binary_cross_entropy(ana_output, ana_lable, size_average=False) / train_docs.anaphoricity_scale_factor_top

            loss_all = loss + ana_loss
            loss_all.backward()
            pair_cost_this_turn += loss.data[0]
            optimizer.step()

        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain", echo, "Pair total cost:", pair_cost_this_turn
        print >> sys.stderr, "PreTRAINING Use %.3f seconds" % (end_time - start_time)
        print >> sys.stderr, "Learning Rate", lr

        print >> sys.stderr, "save model ..."
        torch.save(network_model, model_save_dir + "network_model_pretrain.%d.top" % echo)

        #if cost_this_turn > last_cost:
        #    lr = lr*0.7

        gold = []
        predict = []
        ana_gold = []
        ana_predict = []

        for data in dev_docs.train_generater(shuffle=False, top=True):
            mention_word_index, mention_span, candi_word_index, candi_span, feature_pair, pair_antecedents, pair_anaphors,\
            target, positive, negative, anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target, top_x = data

            mention_index = autograd.Variable(torch.from_numpy(mention_word_index).type(torch.cuda.LongTensor))
            mention_span = autograd.Variable(torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))
            anaphoricity_index = autograd.Variable(torch.from_numpy(anaphoricity_word_indexs).type(torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(torch.from_numpy(anaphoricity_spans).type(torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(torch.from_numpy(anaphoricity_features).type(torch.cuda.FloatTensor))
            reindex = autograd.Variable(torch.from_numpy(top_x["score_index"]).type(torch.cuda.LongTensor))
            start_index = autograd.Variable(torch.from_numpy(top_x["starts"]).type(torch.cuda.LongTensor))
            end_index = autograd.Variable(torch.from_numpy(top_x["ends"]).type(torch.cuda.LongTensor))

            gold += top_x["top_gold"].tolist()
            ana_gold += anaphoricity_target.tolist()

            output, output_reindex = network_model.forward_top_pair(word_embedding_dimention, mention_index, mention_span, candi_index, candi_spans, pair_feature, anaphors, antecedents, reindex, start_index, end_index, 0.0)
            predict += output.data.cpu().numpy().tolist()

            ana_output, _ = network_model.forward_anaphoricity(word_embedding_dimention, anaphoricity_index, anaphoricity_span, anaphoricity_feature, 0.0)
            ana_predict += ana_output.data.cpu().numpy()[0].tolist()

        gold = numpy.array(gold, dtype=numpy.int32)
        predict = numpy.array(predict)

        best_results = {'thresh': 0.0, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
        thresh_list = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]
        for thresh in thresh_list:
            evaluation_results = get_metrics(gold, predict, thresh)
            if evaluation_results["f1"] >= best_results["f1"]:
                best_results = evaluation_results

        print "Pair accuracy: %f and Fscore: %f with thresh: %f"\
            % (best_results["accuracy"], best_results["f1"], best_results["thresh"])
        sys.stdout.flush()

        if best_results["f1"] > all_best_results["f1"]:
            all_best_results = best_results
            print >> sys.stderr, "New High Result, Save Model"
            torch.save(network_model, model_save_dir + "network_model_pretrain.top.best")

        ana_gold = numpy.array(ana_gold, dtype=numpy.int32)
        ana_predict = numpy.array(ana_predict)
        best_results = {'thresh': 0.0, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
        for thresh in thresh_list:
            evaluation_results = get_metrics(ana_gold, ana_predict, thresh)
            if evaluation_results["f1"] >= best_results["f1"]:
                best_results = evaluation_results

        print "Anaphoricity accuracy: %f and Fscore: %f with thresh: %f"\
            % (best_results["accuracy"], best_results["f1"], best_results["thresh"])
        sys.stdout.flush()

        if (echo + 1) % 10 == 0:
            best_network_model = torch.load(model_save_dir + "network_model_pretrain.top.best")
            print "DEV:"
            performance.performance(dev_docs, best_network_model)
            print "TEST:"
            performance.performance(test_docs, best_network_model)
def ShowResult(net, X, Y, title, wb1, wb2):
    # draw train data
    plt.plot(X[0, :], Y[0, :], '.', c='b')
    # create and draw visualized validation data
    TX = np.linspace(0, 1, 100).reshape(1, 100)
    dict_cache = net.ForwardCalculationBatch(TX, wb1, wb2)
    TY = dict_cache["Output"]
    plt.plot(TX, TY, 'x', c='r')
    plt.title(title)
    plt.show()
# end def


if __name__ == '__main__':
    dataReader = DataReader(x_data_name, y_data_name)
    dataReader.ReadData()
    dataReader.NormalizeX()
    dataReader.NormalizeY()

    n_input, n_hidden, n_output = 1, 4, 1
    eta, batch_size, max_epoch = 0.5, 10, 50000
    eps = 0.001

    params = CParameters(n_input, n_hidden, n_output, eta, max_epoch, batch_size, eps)

    # SGD, MiniBatch, FullBatch
    loss_history = CLossHistory()
    net = TwoLayerFittingNet()
    wb1, wb2 = net.train(dataReader, params, loss_history)
    temp = []
    for a in anchors[1]:
        temp.append(linalg.norm(X - a, axis=1, ord=1))
    temp = np.array(temp)
    mins2 = np.min(temp, axis=0)
    mins2 = mins2 / np.std(mins2)

    nz = [mins1 != 0, mins2 != 0]
    d = np.all(nz, axis=0)
    mins1[mins1 == 0] = np.mean(mins1[d])
    mins2[mins2 == 0] = np.mean(mins2[d])
    # end: argmin

    X1 = np.hstack((X, mins1.reshape((len(X), 1))))
    return np.hstack((X1, mins2.reshape((len(X), 1))))


X, y = DR.fourclass()
y_unique = np.unique(y)

random_state = 42
cv_outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
cv_acc_training, cv_acc_test, cv_training_time = [], [], []
XD_k = []
y_k = []

# 10-fold stratified cross validation
for o_train_index, o_test_index in cv_outer.split(X, y):
    X_train_k, y_train_k, X_test_k, y_test_k = X[o_train_index], y[o_train_index], X[o_test_index], y[o_test_index]
    sets_of_anchors = []
    sets_of_anchors.append(find_anchors_from_class_0(X_train_k, y_train_k))
xa = np.linspace(x_range[1], x_range[0], num=x_num, endpoint=True)
za = np.linspace(z_range[1], z_range[0], num=z_num, endpoint=True)
x2d, z2d = np.meshgrid(xa, za)
if specimen_name == 'al7075_mlf':
    xa = np.linspace(x_range[0], x_range[1], num=x_num, endpoint=True)
    za = np.linspace(z_range[0], z_range[1], num=z_num, endpoint=True)
    x2d, z2d = np.meshgrid(xa, za)
x1d, z1d = x2d.flatten(), z2d.flatten()

#%% read peak diameters if they have been fit, if not fit here
orient = 'h'
try:
    x, z, diams = [], [], []
    for i_step in range(sample.n_load_step):
        txt_data = DataReader.read_data_from_text(ring.out_dir + sample.step_names[i_step] + '_diams_' + orient + '.txt')
        x.append(txt_data[:, 0]), z.append(txt_data[:, 1]), diams.append(txt_data[:, 2])
except:
    l_centers, l_errs = np.zeros((sample.n_load_step, sample.n_data_pt)), np.zeros((sample.n_load_step, sample.n_data_pt))
    u_centers, u_errs = np.zeros((sample.n_load_step, sample.n_data_pt)), np.zeros((sample.n_load_step, sample.n_data_pt))
    diams = np.zeros((sample.n_load_step, sample.n_data_pt))
    for i_step in range(sample.n_load_step):
        l_centers[i_step, :], l_errs[i_step, :], u_centers[i_step, :], u_errs[i_step, :], diams[i_step, :] = \
            DataAnalysis.write_scan_diameters(sample, ring, x1d, z1d, i_step, orient)

#%%
# total variation filtering
fits, coords = [], []
for i_step in range(sample.n_load_step):
    s_fits, s_coords = [], []
def main():
    path = "/Users/u15672269/stat"
    data_path = "/Users/u15672269/Desktop/For_Kseniya/однородность.xls"
    # Report title (Russian): "Report on the quality indicators of the test items
    # for the Informatics course, 2018-2019 academic year, semester 1"
    title = "Отчет о показателях качества тестовых заданий по курсу Информатика 2018-2019 учебного года 1 семестра"
    KO_I = True
    KO_II = True
    correlation = True

    report = Document()
    report.add_heading(title, 0)

    if (KO_I or KO_II or correlation):
        dictionary = DataReader.read_dictionary_from_excel(data_path)
        data = DataReader.read_raw_data_from_excel(data_path, dictionary)
        data_KO = []
        keys = []
        # composition of the questions in the test
        test = {}
        for i in data:
            if i[2] != '':
                question = dictionary[i[0]][0]
                val = test.get(question[0])
                if val is None:
                    test[question[0]] = list()
                    test[question[0]].append(question[1])
                else:
                    if question[1] not in test[question[0]]:
                        test[question[0]].append(question[1])
                key = (question, i[1], i[2])
                if key not in keys:
                    count = sum(elem[0] == i[0] and elem[1] == key[1] and elem[2] == key[2] for elem in data)
                    data_KO.append([i[0], i[1], i[2], count, i[4], i[5]])
                    keys.append(key)
        print("ok")

    if KO_I:
        print("KO_I processing started")
        formulation_stat = Stat.get_question_formulation_stat(Stat.count_formulation_stat(data_KO, dictionary))
        formulation_homogeneity = {}
        for key, question_stat in formulation_stat.items():
            formulation_homogeneity[key] = Stat.test_formulation_homogeneity(question_stat)
        DataPrinter.create_report_KO_I(report, formulation_homogeneity, path)
        print("KO_I processing finished")

    if KO_II:
        print("KO_II processing started")
        distractor_frequency_stat = Stat.get_distractor_frequency_stat(data_KO, dictionary)
        distractor_homogeneity = Stat.test_distractor_homogeneity(distractor_frequency_stat, 0.05, 0, 100)
        DataPrinter.create_report_KO_II(report, distractor_frequency_stat, distractor_homogeneity, path)
        print("KO_II processing finished")

    if correlation:
        print("correlation processing started")
        correlation_stat = Stat.get_correlation_matrix(test, Stat.group_stat_by_student(data, dictionary))
        DataPrinter.create_report_correlation(report, correlation_stat, path)
        print("correlation processing finished")

    report.save(os.path.join(path, '{}.docx'.format(title)))
    return
def main():
    DIR = args.DIR
    embedding_file = args.embedding_dir

    embedding_matrix = numpy.load(embedding_file)
    "Building torch model"
    network_model = network.Network(nnargs["pair_feature_dimention"], nnargs["mention_feature_dimention"], nnargs["word_embedding_dimention"], nnargs["span_dimention"], 1000, nnargs["embedding_size"], nnargs["embedding_dimention"], embedding_matrix).cuda()

    reduced = ""
    if args.reduced == 1:
        reduced = "_reduced"

    print >> sys.stderr, "prepare data for train ..."
    train_docs = DataReader.DataGnerater("train" + reduced)
    print >> sys.stderr, "prepare data for dev and test ..."
    dev_docs = DataReader.DataGnerater("dev" + reduced)
    test_docs = DataReader.DataGnerater("test" + reduced)

    l2_lambda = 1e-6
    #lr = 0.00009
    lr = 0.0001
    dropout_rate = 0.5
    shuffle = True
    times = 0
    best_thres = 0.5

    model_save_dir = "./model/"

    last_cost = 0.0
    all_best_results = {'thresh': 0.0, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

    optimizer = optim.RMSprop(network_model.parameters(), lr=lr, eps=1e-5)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=75, gamma=0.5)

    for echo in range(100):
        start_time = timeit.default_timer()
        print "Pretrain Epoch:", echo
        scheduler.step()

        pair_cost_this_turn = 0.0
        ana_cost_this_turn = 0.0
        pair_nums = 0
        ana_nums = 0
        inside_time = 0.0

        for data in train_docs.train_generater(shuffle=shuffle):
            mention_word_index, mention_span, candi_word_index, candi_span, feature_pair, pair_antecedents, pair_anaphors,\
            target, positive, negative, anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target = data

            mention_index = autograd.Variable(torch.from_numpy(mention_word_index).type(torch.cuda.LongTensor))
            mention_span = autograd.Variable(torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))
            anaphoricity_index = autograd.Variable(torch.from_numpy(anaphoricity_word_indexs).type(torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(torch.from_numpy(anaphoricity_spans).type(torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(torch.from_numpy(anaphoricity_features).type(torch.cuda.FloatTensor))

            gold = target.tolist()
            anaphoricity_gold = anaphoricity_target.tolist()
            pair_nums += len(gold)
            ana_nums += len(anaphoricity_gold)

            lable = autograd.Variable(torch.cuda.FloatTensor([gold]))
            ana_lable = autograd.Variable(torch.cuda.FloatTensor([anaphoricity_gold]))

            output, _ = network_model.forward_all_pair(nnargs["word_embedding_dimention"], mention_index, mention_span, candi_index, candi_spans, pair_feature, anaphors, antecedents, dropout_rate)
            ana_output, _ = network_model.forward_anaphoricity(nnargs["word_embedding_dimention"], anaphoricity_index, anaphoricity_span, anaphoricity_feature, dropout_rate)

            optimizer.zero_grad()
            #loss = get_pair_loss(output,positive,negative,train_docs.scale_factor)
            loss = F.binary_cross_entropy(output, lable, size_average=False) / train_docs.scale_factor
            #ana_loss = F.binary_cross_entropy(ana_output,ana_lable,size_average=False)/train_docs.anaphoricity_scale_factor
            pair_cost_this_turn += loss.data[0] * train_docs.scale_factor

            loss_all = loss
            loss_all.backward()
            optimizer.step()

        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTRAINING Use %.3f seconds" % (end_time - start_time)
        print >> sys.stderr, "Learning Rate", lr
        #print >> sys.stderr,"save model ..."
        #torch.save(network_model, model_save_dir+"network_model_pretrain.%d"%echo)

        gold = []
        predict = []
        ana_gold = []
        ana_predict = []

        for data in dev_docs.train_generater(shuffle=False):
            mention_word_index, mention_span, candi_word_index, candi_span, feature_pair, pair_antecedents, pair_anaphors,\
            target, positive, negative, anaphoricity_word_indexs, anaphoricity_spans, anaphoricity_features, anaphoricity_target = data

            mention_index = autograd.Variable(torch.from_numpy(mention_word_index).type(torch.cuda.LongTensor))
            mention_span = autograd.Variable(torch.from_numpy(mention_span).type(torch.cuda.FloatTensor))
            candi_index = autograd.Variable(torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor))
            candi_spans = autograd.Variable(torch.from_numpy(candi_span).type(torch.cuda.FloatTensor))
            pair_feature = autograd.Variable(torch.from_numpy(feature_pair).type(torch.cuda.FloatTensor))
            anaphors = autograd.Variable(torch.from_numpy(pair_anaphors).type(torch.cuda.LongTensor))
            antecedents = autograd.Variable(torch.from_numpy(pair_antecedents).type(torch.cuda.LongTensor))
            anaphoricity_index = autograd.Variable(torch.from_numpy(anaphoricity_word_indexs).type(torch.cuda.LongTensor))
            anaphoricity_span = autograd.Variable(torch.from_numpy(anaphoricity_spans).type(torch.cuda.FloatTensor))
            anaphoricity_feature = autograd.Variable(torch.from_numpy(anaphoricity_features).type(torch.cuda.FloatTensor))

            gold += target.tolist()
            ana_gold += anaphoricity_target.tolist()

            output, _ = network_model.forward_all_pair(nnargs["word_embedding_dimention"], mention_index, mention_span, candi_index, candi_spans, pair_feature, anaphors, antecedents, 0.0)
            predict += output.data.cpu().numpy()[0].tolist()

            ana_output, _ = network_model.forward_anaphoricity(nnargs["word_embedding_dimention"], anaphoricity_index, anaphoricity_span, anaphoricity_feature, 0.0)
            ana_predict += ana_output.data.cpu().numpy()[0].tolist()

        gold = numpy.array(gold, dtype=numpy.int32)
        predict = numpy.array(predict)

        best_results = {'thresh': 0.0, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
        thresh_list = [0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]
        for thresh in thresh_list:
            evaluation_results = get_metrics(gold, predict, thresh)
            if evaluation_results["f1"] >= best_results["f1"]:
                best_results = evaluation_results

        print "Pair accuracy: %f and Fscore: %f with thresh: %f"\
            % (best_results["accuracy"], best_results["f1"], best_results["thresh"])
        sys.stdout.flush()

        if best_results["f1"] >= all_best_results["f1"]:
            all_best_results = best_results
            print >> sys.stderr, "New High Result, Save Model"
            torch.save(network_model, model_save_dir + "network_model_pretrain.best.pair")
        sys.stdout.flush()

    ## output best
    print "In sum, anaphoricity accuracy: %f and Fscore: %f with thresh: %f"\
        % (best_results["accuracy"], best_results["f1"], best_results["thresh"])
    sys.stdout.flush()
def setup():
    bucket.setBucket(DataReader.loadBucket())
    wordlist.extend(DataReader.collocationEntries())
    for w in wordlist:
        addStress(tokenize(w))
        if Y[0, i] == 1:
            plt.plot(X[0, i], X[1, i], '^', c='g')
        elif Y[0, i] == 2:
            plt.plot(X[0, i], X[1, i], 'x', c='r')
        elif Y[0, i] == 3:
            plt.plot(X[0, i], X[1, i], '.', c='b')
        # end if
    # end for
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.show()


if __name__ == '__main__':
    dataReader = DataReader(x_data_name, y_data_name)
    dataReader.ReadData()
    X = dataReader.NormalizeX()
    Y = dataReader.ToOneHot()

    n_input, n_output = dataReader.num_feature, dataReader.num_category
    n_hidden = 8
    eta, batch_size, max_epoch = 0.1, 10, 5000
    eps = 0.06

    params = CParameters(n_input, n_hidden, n_output, eta, max_epoch, batch_size, eps, LossFunctionName.CrossEntropy3)

    loss_history = CLossHistory()
    net = TwoLayerClassificationNet()
import DataReader as DR
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier

# Read training data
Ang_Songs = DR.readData("Data-Set/Angry/Train/", "angry")
Hap_Songs = DR.readData("Data-Set/Happy/Train/", "happy")
Sad_Songs = DR.readData("Data-Set/Sad/Train/", "sad")
#Rel_Songs = DR.readData("Data-Set/Relaxed/Train/", "relaxed")
SongsTrain = [Ang_Songs, Hap_Songs, Sad_Songs]

# Read testing data
AngT_Songs = DR.readData("Data-Set/Angry/Test/", "angry")
HapT_Songs = DR.readData("Data-Set/Happy/Test/", "happy")
SadT_Songs = DR.readData("Data-Set/Sad/Test/", "sad")
#RelT_Songs = DR.readData("Data-Set/Relaxed/Test/", "relaxed")
SongsTTrain = [AngT_Songs, HapT_Songs, SadT_Songs]

SongsWordsTrain = [[], []]
for i in range(3):
    for song in SongsTrain[i]:
        # print("\nsongs train[i]=")
        # print(SongsTrain[i])
        # print("\n\nnow song\n")
        # print(song)
        # print("\n\nPrinting s for train\n")
        s = song[4]  # lyrics text only
        # print(s)
def __init__(self, learning_rate, training_iteration, batch_size, hidden_layer_n, hidden_layer_n2):
    self.dataset = DataReader.data_set()  # Acquiring data from the DataReader class
    self.learning_rate = learning_rate  # Learning rate at which weights and biases get adjusted
    self.training_iteration = training_iteration  # Number of iterations the NN trains for
    self.batch_size = batch_size  # Size of the batch of the data that gets fed to the neural network
    self.display_step = 4  # Display every 4th iteration

    input_layer_n = self.dataset.input_dim  # Input layer size
    output_layer_n = self.dataset.output_dim  # Output layer size

    self.x = tf.placeholder("float", [None, input_layer_n], name="x")  # Placeholder for the input matrix
    self.y = tf.placeholder("float", [None, output_layer_n], name="y")  # Placeholder for the output matrix

    with tf.name_scope("weights") as scope:  # Creating weight matrices and populating them with random numbers
        W1 = tf.Variable(tf.random_normal([input_layer_n, hidden_layer_n], stddev=0.1))
        W2 = tf.Variable(tf.random_normal([hidden_layer_n, hidden_layer_n2], stddev=0.1))
        W3 = tf.Variable(tf.random_normal([hidden_layer_n2, output_layer_n], stddev=0.1))

    with tf.name_scope("biases") as scope:  # Creating bias matrices and populating them with random numbers
        b1 = tf.Variable(tf.random_normal([hidden_layer_n], stddev=0.1))
        b2 = tf.Variable(tf.random_normal([hidden_layer_n2], stddev=0.1))
        b3 = tf.Variable(tf.random_normal([output_layer_n], stddev=0.1))

    with tf.name_scope("model") as scope:  # Creating the three layers of the network
        layer_1 = tf.nn.sigmoid(tf.matmul(self.x, W1) + b1)  # sigmoid(W[0,0]*i[0] + W[0,1]*i[1] + ... + W[0,n]*i[n] + b)
        layer_2 = tf.nn.sigmoid(tf.matmul(layer_1, W2) + b2)  # sigmoid of the second hidden layer
        layer_3 = tf.nn.softmax(tf.matmul(layer_2, W3) + b3)  # softmax over the output layer
        self.model = layer_3

    with tf.name_scope("objective_function") as scope:  # Objective (loss) function: root of the summed squared error
        self.objective_function = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(self.model, self.y))))

    with tf.name_scope("train") as scope:  # Using Gradient Descent to minimize the loss with respect to weights and biases
        self.optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(self.objective_function)

    self.init = tf.global_variables_initializer()
    self.merged_summary_op = tf.summary.merge_all()
def ShowResult(net, X, Y, title, wb1, wb2):
    # draw train data
    plt.plot(X[0, :], Y[0, :], '.', c='b')
    # create and draw visualized validation data
    TX = np.linspace(0, 1, 100).reshape(1, 100)
    dict_cache = net.ForwardCalculationBatch(TX, wb1, wb2)
    TY = dict_cache["Output"]
    plt.plot(TX, TY, 'x', c='r')
    plt.title(title)
    plt.show()


if __name__ == '__main__':
    dataReader = DataReader(x_data_name, y_data_name)
    XData, YData = dataReader.ReadData()
    X = dataReader.NormalizeX(passthrough=True)
    Y = dataReader.NormalizeY()

    # To illustrate the point, we experiment with 2 hidden units and a batch size of 20
    n_input, n_hidden, n_output = 1, 4, 1
    eta, batch_size, max_epoch = 0.1, 1, 10000
    eps = 0.001

    params = CParameters(n_input, n_hidden, n_output, eta, max_epoch, batch_size, eps,
                         LossFunctionName.MSE, InitialMethod.Xavier)

    loss_history = CLossHistory(params)
import PIL
from PIL import Image
import numpy as np
import DataReader

w, h = 28, 28
m = DataReader.get_mapping()
data = DataReader.get_images(10, h, w)  # 112800 images in data set

for image in data:
    print("\nCharacter being shown: " + chr(m[image[0]]))
    img = Image.fromarray(np.array(image[1], dtype=np.uint8))
    img.show()
    input()
def convertTargetFeatureToNumeric(data):
    categoryDictionary = dataReader.getCategoryDictionaries()
    data = data.replace(categoryDictionary.keys(), range(len(categoryDictionary.keys())))
    return data
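# Minimal sketch of what the replace-based encoding above does, using made-up category names
# (the real mapping comes from dataReader.getCategoryDictionaries(), which is not shown here).
import pandas as pd

categories = {'WARRANTS': None, 'ASSAULT': None, 'LARCENY/THEFT': None}
frame = pd.DataFrame({'Category': ['ASSAULT', 'WARRANTS', 'LARCENY/THEFT']})
encoded = frame.replace(list(categories.keys()), list(range(len(categories))))
print(encoded.Category.tolist())  # each category name is mapped to its index in the dictionary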
# bert-serving-start -model_dir D:/model/multi_cased_L-12_H-768_A-12/ -max_seq_len 128 -pooling_strategy NONE -show_tokens_to_client -cased_tokenization
from bert_serving.client import BertClient

bc = BertClient(ip='localhost')

train_file = 'D:/data/cmrc_squad/cmrc2018_trial.json'
test_file = 'D:/data/cmrc_squad/cmrc2018_trial.json'

max_query_length = 64
max_seq_length = 128
doc_stride = 128
batch_size = 16
hidden_size = 768
num_epoch = 2
# 0.0001
init_lr = 3e-2

tf.logging.set_verbosity(tf.logging.INFO)

train_data = data_reader.read_squad_examples(train_file, True)
test_data = data_reader.read_squad_examples(test_file, True)
test_data = test_data[0:4]

train_data_collector = []
test_data_collector = []
data_reader.convert_examples_to_features(train_data,
                                         max_query_length=max_query_length,
                                         max_seq_length=max_seq_length,
                                         doc_stride=doc_stride,
                                         is_training=True,
                                         data_collector=train_data_collector,
                                         bert_client=bc)
data_reader.convert_examples_to_features(test_data,
                                         max_query_length=max_query_length,
                                         max_seq_length=max_seq_length,
                                         doc_stride=doc_stride,
""" Tester file: This file will run all of the important parts of the project. Team: Belinda Adam, Jacquelyn Haughey Machine Learning 2015 Final Project Phonological Learning of English Pronunciation """ import DataReader as dataReader import Network # read data from the nettalk data set f = "nettalk.data.txt" examples, words, prons = dataReader.readDataFile("random_100_train.txt") examples2, words2, prons2 = dataReader.readDataFile("random_100_test.txt") train = examples test = examples2 network = Network.Network(train, test, 1, 120, 27, 53, 0.5, 1) network.train_network() network.test_network()
def main(): DIR = args.DIR embedding_file = args.embedding_dir best_network_file = "./model/network_model_pretrain.best" print >> sys.stderr,"Read model from",best_network_file best_network_model = torch.load(best_network_file) embedding_matrix = numpy.load(embedding_file) "Building torch model" network_model = network.Network(nnargs["pair_feature_dimention"],nnargs["mention_feature_dimention"],nnargs["word_embedding_dimention"],nnargs["span_dimention"],1000,nnargs["embedding_size"],nnargs["embedding_dimention"],embedding_matrix).cuda() print >> sys.stderr,"save model ..." net_copy(network_model,best_network_model) reduced="" if args.reduced == 1: reduced="_reduced" print >> sys.stderr,"prepare data for train ..." train_docs = DataReader.DataGnerater("train"+reduced) print >> sys.stderr,"prepare data for dev and test ..." dev_docs = DataReader.DataGnerater("dev"+reduced) test_docs = DataReader.DataGnerater("test"+reduced) l2_lambda = 1e-6 lr = nnargs["lr"] dropout_rate = nnargs["dropout_rate"] epoch = nnargs["epoch"] model_save_dir = "./model/bp/" last_cost = 0.0 all_best_results = { 'thresh': 0.0, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0 } optimizer = optim.RMSprop(network_model.parameters(), lr=lr, eps=1e-5) scheduler = lr_scheduler.StepLR(optimizer, step_size=75, gamma=0.5) for echo in range(epoch): start_time = timeit.default_timer() print "Pretrain Epoch:",echo scheduler.step() pair_cost_this_turn = 0.0 ana_cost_this_turn = 0.0 pair_nums = 0 ana_nums = 0 for data in train_docs.train_generater(shuffle=True): mention_index = autograd.Variable(torch.from_numpy(data["mention_word_index"]).type(torch.cuda.LongTensor)) mention_span = autograd.Variable(torch.from_numpy(data["mention_span"]).type(torch.cuda.FloatTensor)) candi_index = autograd.Variable(torch.from_numpy(data["candi_word_index"]).type(torch.cuda.LongTensor)) candi_spans = autograd.Variable(torch.from_numpy(data["candi_span"]).type(torch.cuda.FloatTensor)) pair_feature = autograd.Variable(torch.from_numpy(data["pair_features"]).type(torch.cuda.FloatTensor)) anaphors = autograd.Variable(torch.from_numpy(data["pair_anaphors"]).type(torch.cuda.LongTensor)) antecedents = autograd.Variable(torch.from_numpy(data["pair_antecedents"]).type(torch.cuda.LongTensor)) anaphoricity_index = autograd.Variable(torch.from_numpy(data["mention_word_index"]).type(torch.cuda.LongTensor)) anaphoricity_span = autograd.Variable(torch.from_numpy(data["mention_span"]).type(torch.cuda.FloatTensor)) anaphoricity_feature = autograd.Variable(torch.from_numpy(data["anaphoricity_feature"]).type(torch.cuda.FloatTensor)) reindex = autograd.Variable(torch.from_numpy(data["top_score_index"]).type(torch.cuda.LongTensor)) start_index = autograd.Variable(torch.from_numpy(data["top_starts"]).type(torch.cuda.LongTensor)) end_index = autograd.Variable(torch.from_numpy(data["top_ends"]).type(torch.cuda.LongTensor)) top_gold = autograd.Variable(torch.from_numpy(data["top_gold"]).type(torch.cuda.FloatTensor)) anaphoricity_target = data["anaphoricity_target"] anaphoricity_gold = anaphoricity_target.tolist() ana_lable = autograd.Variable(torch.cuda.FloatTensor([anaphoricity_gold])) optimizer.zero_grad() output,output_reindex = network_model.forward_top_pair(nnargs["word_embedding_dimention"],mention_index,mention_span,candi_index,candi_spans,pair_feature,anaphors,antecedents,reindex,start_index,end_index,dropout_rate) loss = F.binary_cross_entropy(output,top_gold,size_average=False)/train_docs.scale_factor_top ana_output,_,_ = 
network_model.forward_anaphoricity(nnargs["word_embedding_dimention"], anaphoricity_index, anaphoricity_span, anaphoricity_feature, dropout_rate) ana_loss = F.binary_cross_entropy(ana_output,ana_lable,size_average=False)/train_docs.anaphoricity_scale_factor_top loss_all = loss + ana_loss loss_all.backward() pair_cost_this_turn += loss.data[0] optimizer.step() end_time = timeit.default_timer() print >> sys.stderr, "PreTrain",echo,"Pair total cost:",pair_cost_this_turn print >> sys.stderr, "PreTRAINING Use %.3f seconds"%(end_time-start_time) print >> sys.stderr, "Learning Rate",lr gold = [] predict = [] ana_gold = [] ana_predict = [] for data in dev_docs.train_generater(shuffle=False): mention_index = autograd.Variable(torch.from_numpy(data["mention_word_index"]).type(torch.cuda.LongTensor)) mention_span = autograd.Variable(torch.from_numpy(data["mention_span"]).type(torch.cuda.FloatTensor)) candi_index = autograd.Variable(torch.from_numpy(data["candi_word_index"]).type(torch.cuda.LongTensor)) candi_spans = autograd.Variable(torch.from_numpy(data["candi_span"]).type(torch.cuda.FloatTensor)) pair_feature = autograd.Variable(torch.from_numpy(data["pair_features"]).type(torch.cuda.FloatTensor)) anaphors = autograd.Variable(torch.from_numpy(data["pair_anaphors"]).type(torch.cuda.LongTensor)) antecedents = autograd.Variable(torch.from_numpy(data["pair_antecedents"]).type(torch.cuda.LongTensor)) anaphoricity_index = autograd.Variable(torch.from_numpy(data["mention_word_index"]).type(torch.cuda.LongTensor)) anaphoricity_span = autograd.Variable(torch.from_numpy(data["mention_span"]).type(torch.cuda.FloatTensor)) anaphoricity_feature = autograd.Variable(torch.from_numpy(data["anaphoricity_feature"]).type(torch.cuda.FloatTensor)) reindex = autograd.Variable(torch.from_numpy(data["top_score_index"]).type(torch.cuda.LongTensor)) start_index = autograd.Variable(torch.from_numpy(data["top_starts"]).type(torch.cuda.LongTensor)) end_index = autograd.Variable(torch.from_numpy(data["top_ends"]).type(torch.cuda.LongTensor)) top_gold = autograd.Variable(torch.from_numpy(data["top_gold"]).type(torch.cuda.FloatTensor)) anaphoricity_target = data["anaphoricity_target"] anaphoricity_gold = anaphoricity_target.tolist() ana_lable = autograd.Variable(torch.cuda.FloatTensor([anaphoricity_gold])) gold += data["top_gold"].tolist() ana_gold += anaphoricity_target.tolist() output,output_reindex = network_model.forward_top_pair(nnargs["word_embedding_dimention"],mention_index,mention_span,candi_index,candi_spans,pair_feature,anaphors,antecedents,reindex,start_index,end_index,0.0) predict += output.data.cpu().numpy().tolist() ana_output,_,_ = network_model.forward_anaphoricity(nnargs["word_embedding_dimention"], anaphoricity_index, anaphoricity_span, anaphoricity_feature, 0.0) ana_predict += ana_output.data.cpu().numpy()[0].tolist() gold = numpy.array(gold,dtype=numpy.int32) predict = numpy.array(predict) best_results = { 'thresh': 0.0, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0 } thresh_list = [0.3,0.35,0.4,0.45,0.5,0.55,0.6] for thresh in thresh_list: evaluation_results = get_metrics(gold, predict, thresh) if evaluation_results["f1"] >= best_results["f1"]: best_results = evaluation_results print "Pair accuracy: %f and Fscore: %f with thresh: %f"\ %(best_results["accuracy"],best_results["f1"],best_results["thresh"]) sys.stdout.flush() if best_results["f1"] >= all_best_results["f1"]: all_best_results = best_results print >> sys.stderr, "New High Result, Save Model" torch.save(network_model, 
model_save_dir+"network_model_pretrain.best.top") ana_gold = numpy.array(ana_gold,dtype=numpy.int32) ana_predict = numpy.array(ana_predict) best_results = { 'thresh': 0.0, 'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0 } for thresh in thresh_list: evaluation_results = get_metrics(ana_gold, ana_predict, thresh) if evaluation_results["f1"] >= best_results["f1"]: best_results = evaluation_results print "Anaphoricity accuracy: %f and Fscore: %f with thresh: %f"\ %(best_results["accuracy"],best_results["f1"],best_results["thresh"]) sys.stdout.flush() if (echo+1)%10 == 0: best_network_model = torch.load(model_save_dir+"network_model_pretrain.best.top") print "DEV:" performance.performance(dev_docs,best_network_model) print "TEST:" performance.performance(test_docs,best_network_model)
def performValidation(yPred, yTest): dictionary = dataReader.getCategoryDictionaries() print(metrics.classification_report(yTest, yPred, target_names=list(dictionary.keys())))
from DataReader import * from AGDSStructure import * from AGDSKNearest import * import numpy as np def classify(data_holder, model, X): predicted_label = model.find_similarity(np.array(X)) win_class = data_holder.get_real_label(predicted_label) print(win_class) if __name__ == '__main__': data_reader = DataReader("IrisData.xls") agds_structure = AGDSStructure(data_reader.data_frame, data_reader.label) k_nearest = AGDSKNearest(agds_structure, 3) classify(data_reader, k_nearest, [4.5, 3.0, 1.1, 0.1]) classify(data_reader, k_nearest, [7.0, 3.2, 4.7, 1.4]) classify(data_reader, k_nearest, [5.0, 2.0, 4.0, 1.0]) classify(data_reader, k_nearest, [5.7, 2.5, 4.8, 1.6])
def setup(self): prov_niscode_shapes = {} for record in dr.read_shp('/home/techpriest/Desktop/becode/SpaceEYE/ADM/Apn_AdPr.shp').to_records(): prov_niscode_shapes[record[3]] = record[-1] return prov_niscode_shapes
print("Solving it with Tabulation") t0 = time.time() value = tabulation(s1, s2) t1 = time.time() elif args.check: mem = memoization(s1, s2) tab = tabulation(s1, s2) print("Check:", (mem == tab)) print("Memoization =", mem) print("Tabulation =", tab) else: print("Introduzca un metodo, por favor") exit(-1) if args.showValue and not args.check: print("Value =", value) if args.timer: t = (t1 - t0) print("Time =", round(t, 3), "s") if __name__ == '__main__': args = args_creator() if args.directory != "": files = dr.readAllFiles(args.directory) withDirectory(files) elif args.file != "": withFile(args.file) else: print("Introduzca un fichero, por favor") exit(-1)
#TrendsScraper #YahooFinanceScraper from pandas.io.data import DataReader from datetime import datetime goog = DataReader("GOOG", "yahoo", datetime(2000,1,1), datetime(2012,1,1)) goog["Adj Close"]
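# pandas.io.data has since been removed from pandas itself; the same remote-data call
# now lives in the separate pandas-datareader package. A minimal equivalent, assuming
# that package is installed and the "yahoo" source is reachable:
from datetime import datetime
from pandas_datareader import data as web

goog = web.DataReader("GOOG", "yahoo", datetime(2000, 1, 1), datetime(2012, 1, 1))
print(goog["Adj Close"].head())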
logs = os.path.join(directory, 'logs') trainloss = os.path.join(logs, 'train_loss.txt') if os.path.isdir(logs) == False: os.makedirs(logs) # choose network, can be either DRN18 or DRN26 network = 'DRN26' # set parameters batch_size = 8 num_epochs = 100 use_weights = 1 num_classes = 5 image_dims = [500, 500, 3] data = DataReader(directory, batch_size, num_epochs, use_weights=1) train_data = data.train_batch(train_file) num_train_images = data.num_images test_data = data.test_batch(test_file) num_val_images = data.num_images # determine number of iterations based on number of images training_iterations = int(np.floor(num_train_images / batch_size)) validation_iterations = int(np.floor(num_val_images / batch_size)) handle = tf.placeholder(tf.string, shape=[]) # create iterator allowing us to switch between datasets iterator = tf.data.Iterator.from_string_handle(handle, train_data.output_types, train_data.output_shapes) next_element = iterator.get_next()
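# Sketch of how the feedable iterator defined above is usually driven inside a TF 1.x
# session: each dataset gets its own iterator, and the string handle fed into `handle`
# selects which one `next_element` pulls from. The names below reuse the variables from
# this snippet; the concrete training loop is not shown here.
train_iterator = train_data.make_one_shot_iterator()
test_iterator = test_data.make_initializable_iterator()

with tf.Session() as sess:
    train_handle = sess.run(train_iterator.string_handle())
    test_handle = sess.run(test_iterator.string_handle())
    # one training batch
    train_batch = sess.run(next_element, feed_dict={handle: train_handle})
    # switch to the validation/test data without rebuilding the graph
    sess.run(test_iterator.initializer)
    test_batch = sess.run(next_element, feed_dict={handle: test_handle})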
self.index = np.arange(self.maxlen) np.random.shuffle(self.index) self.cnt = 0 return self def __next__(self): if self.cnt == self.maxlen: raise StopIteration self.cnt += self.batch return self.data[self.index[self.cnt - self.batch: self.cnt], :], \ self.label[self.index[self.cnt - self.batch: self.cnt], :] def next(self): return self.__next__() if __name__ == "__main__": print tf.__version__ sys.path.append("../") import DataReader tdata = DataReader.ImageReader("../dataset/train-images-idx3-ubyte.gz").to_tensor() ldata = DataReader.LabelReader("../dataset/train-labels-idx1-ubyte.gz").to_tensor() print tdata.shape print ldata.shape tf_mlp = TFMLP(tdata, ldata) tf_mlp.train() ttest = DataReader.ImageReader("../dataset/t10k-images-idx3-ubyte.gz").to_tensor() ltest = DataReader.LabelReader("../dataset/t10k-labels-idx1-ubyte.gz").to_tensor() tf_mlp.test(ttest, ltest)
import DataReader as DR from sklearn import svm from nltk.tokenize import word_tokenize #from nltk import FreqDist from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer #import random import numpy as np import pandas as pd import re from sklearn.naive_bayes import MultinomialNB #Read training data Ang_Songs=DR.readData("Data-Set/Angry/Train/","angry") Hap_Songs=DR.readData("Data-Set/Happy/Train/","happy") Sad_Songs=DR.readData("Data-Set/Sad/Train/","sad") Rel_Songs=DR.readData("Data-Set/Relaxed/Train/","relaxed") SongsTrain=[Ang_Songs,Hap_Songs,Sad_Songs,Rel_Songs] # PROCESSING TRAINING DATA #tokenizing training data sw = list(stopwords.words("english")) lemmatizer=WordNetLemmatizer() def my_tokenizer(s): s = s.lower() # downcase tokens = word_tokenize(s) # split string into words (tokens) tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful tokens = [lemmatizer.lemmatize(t) for t in tokens] # put words into base form tokens = [t for t in tokens if t not in sw] # remove stopwords
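# One common way to use my_tokenizer downstream (the rest of this training script is
# not shown here): hand it to a CountVectorizer and fit the MultinomialNB imported
# above. The (lyrics, label) structure assumed for SongsTrain is only a guess for
# illustration; adapt it to whatever DR.readData actually returns.
from sklearn.feature_extraction.text import CountVectorizer

texts = [lyrics for mood in SongsTrain for lyrics, label in mood]
labels = [label for mood in SongsTrain for lyrics, label in mood]

vectorizer = CountVectorizer(tokenizer=my_tokenizer)
X_train = vectorizer.fit_transform(texts)
clf = MultinomialNB().fit(X_train, labels)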
def main(): DIR = args.DIR embedding_file = args.embedding_dir best_network_file = "./model/network_model_pretrain.best.top" print >> sys.stderr, "Read model from ", best_network_file best_network_model = torch.load(best_network_file) embedding_matrix = numpy.load(embedding_file) "Building torch model" worker = network.Network( nnargs["pair_feature_dimention"], nnargs["mention_feature_dimention"], nnargs["word_embedding_dimention"], nnargs["span_dimention"], 1000, nnargs["embedding_size"], nnargs["embedding_dimention"], embedding_matrix).cuda() net_copy(worker, best_network_model) best_network_file = "./model/network_model_pretrain.best.top" print >> sys.stderr, "Read model from ", best_network_file best_network_model = torch.load(best_network_file) manager = network.Network( nnargs["pair_feature_dimention"], nnargs["mention_feature_dimention"], nnargs["word_embedding_dimention"], nnargs["span_dimention"], 1000, nnargs["embedding_size"], nnargs["embedding_dimention"], embedding_matrix).cuda() net_copy(manager, best_network_model) reduced = "" if args.reduced == 1: reduced = "_reduced" print >> sys.stderr, "prepare data for train ..." train_docs_iter = DataReader.DataGnerater("train" + reduced) #train_docs_iter = DataReader.DataGnerater("dev"+reduced) print >> sys.stderr, "prepare data for dev and test ..." dev_docs_iter = DataReader.DataGnerater("dev" + reduced) test_docs_iter = DataReader.DataGnerater("test" + reduced) ''' print "Performance after pretraining..." print "DEV" metric = performance.performance(dev_docs_iter,worker,manager) print "Average:",metric["average"] print "TEST" metric = performance.performance(test_docs_iter,worker,manager) print "Average:",metric["average"] print "***" print sys.stdout.flush() ''' lr = nnargs["lr"] top_k = nnargs["top_k"] model_save_dir = "./model/reinforce/" utils.mkdir(model_save_dir) score_softmax = nn.Softmax() optimizer_manager = optim.RMSprop(manager.parameters(), lr=lr, eps=1e-6) optimizer_worker = optim.RMSprop(worker.parameters(), lr=lr, eps=1e-6) MAX_AVE = 2048 for echo in range(nnargs["epoch"]): start_time = timeit.default_timer() print "Pretrain Epoch:", echo reward_log = Logger(Tensorboard + args.tb + "/acl2018/%d/reward/" % echo, flush_secs=3) entropy_log_manager = Logger(Tensorboard + args.tb + "/acl2018/%d/entropy/worker" % echo, flush_secs=3) entropy_log_worker = Logger(Tensorboard + args.tb + "/acl2018/%d/entropy/manager" % echo, flush_secs=3) train_docs = utils.load_pickle(args.DOCUMENT + 'train_docs.pkl') #train_docs = utils.load_pickle(args.DOCUMENT + 'dev_docs.pkl') docs_by_id = {doc.did: doc for doc in train_docs} ave_reward = [] ave_manager_entropy = [] ave_worker_entropy = [] print >> sys.stderr, "Link docs ..." 
tmp_data = [] cluster_info = {0: [0]} cluster_list = [0] current_new_cluster = 1 predict_action_embedding = [] choose_action = [] mid = 1 step = 0 statistic = { "worker_hits": 0, "manager_hits": 0, "total": 0, "manager_predict_last": 0, "worker_predict_last": 0 } for data in train_docs_iter.rl_case_generater(shuffle=True): rl = data["rl"] scores_manager, representations_manager = get_score_representations( manager, data) scores_worker, representations_worker = get_score_representations( worker, data) for s, e in zip(rl["starts"], rl["ends"]): #action_embeddings = representations_manager[s:e] #probs = F.softmax(torch.squeeze(scores_manager[s:e])) action_embeddings = representations_worker[s:e] probs = F.softmax(torch.squeeze( scores_worker[s:e])).data.cpu().numpy() #m = Categorical(F.softmax(torch.squeeze(scores_worker[s:e]))[:-1]) #a = m.sample() #this_action = m.sample() #index = this_action.data.cpu().numpy()[0] index = utils.choose_action(probs) if index == (e - s - 1): should_cluster = current_new_cluster cluster_info[should_cluster] = [] current_new_cluster += 1 else: should_cluster = cluster_list[index] choose_action.append(index) cluster_info[should_cluster].append(mid) cluster_list.append(should_cluster) mid += 1 cluster_indexs = torch.cuda.LongTensor( cluster_info[should_cluster]) action_embedding_predict_ave = torch.mean( action_embeddings[cluster_indexs], 0, keepdim=True) action_embedding_predict_max, max_index = torch.max( action_embeddings[cluster_indexs], dim=0, keepdim=True) action_embedding_predict = torch.cat( (action_embedding_predict_ave, action_embedding_predict_max), 1) predict_action_embedding.append(action_embedding_predict) tmp_data.append(data) if rl["end"] == True: inside_index = 0 manager_path = [] worker_path = [] doc = docs_by_id[rl["did"]] for data in tmp_data: rl = data["rl"] pair_target = data["pair_target"] anaphoricity_target = 1 - data["anaphoricity_target"] target = numpy.concatenate( (pair_target, anaphoricity_target))[rl["reindex"]] scores_worker, representations_worker = get_score_representations( worker, data) for s, e in zip(rl["starts"], rl["ends"]): action_embeddings = representations_worker[s:e] probs = F.softmax( torch.squeeze(scores_worker[s:e]) ).data.cpu().numpy( ) #print probs.data.cpu().numpy() -> [ 3.51381488e-04 9.99648571e-01] action_embedding_predicted = predict_action_embedding[ inside_index] combine_embedding = torch.cat( (action_embeddings, action_embeddings), 1) similarities = torch.sum( torch.abs(combine_embedding - action_embedding_predicted), 1) similarities = similarities.data.cpu().numpy() action_probabilities = [] action_list = [] similarity_candidates = heapq.nlargest( top_k, -similarities) for similarity in similarity_candidates: action_index = numpy.argwhere( similarities == -similarity)[0][0] action_probabilities.append(probs[action_index]) action_list.append(action_index) manager_action = choose_action[inside_index] if not manager_action in action_list: action_list.append(manager_action) action_probabilities.append(probs[manager_action]) sample_action = utils.sample_action( numpy.array(action_probabilities)) worker_action = action_list[sample_action] this_target = target[s:e] if this_target[worker_action] == 1: statistic["worker_hits"] += 1 if this_target[manager_action] == 1: statistic["manager_hits"] += 1 if worker_action == (e - s - 1): statistic["worker_predict_last"] += 1 if manager_action == (e - s - 1): statistic["manager_predict_last"] += 1 statistic["total"] += 1 inside_index += 1 #link = manager_action link = 
worker_action m1, m2 = rl['ids'][s + link] doc.link(m1, m2) manager_path.append(manager_action) worker_path.append(worker_action) reward = doc.get_f1() for data in tmp_data: for s, e in zip(rl["starts"], rl["ends"]): ids = rl['ids'][s:e] ana = ids[0, 1] old_ant = doc.ana_to_ant[ana] doc.unlink(ana) costs = rl['costs'][s:e] for ant_ind in range(e - s): costs[ant_ind] = doc.link(ids[ant_ind, 0], ana, hypothetical=True, beta=1) doc.link(old_ant, ana) #costs = autograd.Variable(torch.from_numpy(costs).type(torch.cuda.FloatTensor)) inside_index = 0 worker_entropy = 0.0 for data in tmp_data: new_step = step # worker scores_worker, representations_worker = get_score_representations( worker, data, dropout=nnargs["dropout_rate"]) optimizer_worker.zero_grad worker_loss = None for s, e in zip(rl["starts"], rl["ends"]): costs = rl['costs'][s:e] costs = autograd.Variable( torch.from_numpy(costs).type( torch.cuda.FloatTensor)) action = worker_path[inside_index] score = F.softmax(torch.squeeze(scores_worker[s:e])) if not score.size() == costs.size(): continue baseline = torch.sum(costs * score) this_cost = torch.log( score[action]) * -1.0 * (reward - baseline) if worker_loss is None: worker_loss = this_cost else: worker_loss += this_cost worker_entropy += torch.sum( score * torch.log(score + 1e-7) ).data.cpu().numpy()[ 0] #+ 0.001*torch.sum(score*torch.log(score+1e-7)) inside_index += 1 worker_loss.backward() torch.nn.utils.clip_grad_norm(worker.parameters(), nnargs["clip"]) optimizer_worker.step() ave_worker_entropy.append(worker_entropy) if len(ave_worker_entropy) >= MAX_AVE: ave_worker_entropy = ave_worker_entropy[1:] entropy_log_worker.log_value( 'entropy', float(sum(ave_worker_entropy)) / float(len(ave_worker_entropy)), new_step) new_step += 1 inside_index = 0 manager_entropy = 0.0 for data in tmp_data: new_step = step rl = data["rl"] ave_reward.append(reward) if len(ave_reward) >= MAX_AVE: ave_reward = ave_reward[1:] reward_log.log_value( 'reward', float(sum(ave_reward)) / float(len(ave_reward)), new_step) scores_manager, representations_manager = get_score_representations( manager, data, dropout=nnargs["dropout_rate"]) #optimizer_manager.zero_grad #manager_loss = None for s, e in zip(rl["starts"], rl["ends"]): #costs = rl['costs'][s:e] #costs = autograd.Variable(torch.from_numpy(costs).type(torch.cuda.FloatTensor)) score = F.softmax(torch.squeeze(scores_manager[s:e])) action = manager_path[inside_index] if not score.size() == costs.size(): continue #baseline = torch.sum(costs*score) #this_cost = torch.log(score[action])*-1.0*(reward-baseline)# + 0.001*torch.sum(score*torch.log(score+1e-7)) #if manager_loss is None: # manager_loss = this_cost #else: # manager_loss += this_cost manager_entropy += torch.sum( score * torch.log(score + 1e-7)).data.cpu().numpy()[0] inside_index += 1 #manager_loss.backward() #torch.nn.utils.clip_grad_norm(manager.parameters(), nnargs["clip"]) #optimizer_manager.step() ave_manager_entropy.append(manager_entropy) if len(ave_manager_entropy) >= MAX_AVE: ave_manager_entropy = ave_manager_entropy[1:] entropy_log_manager.log_value( 'entropy', float(sum(ave_manager_entropy)) / float(len(ave_manager_entropy)), new_step) new_step += 1 step = new_step tmp_data = [] cluster_info = {0: [0]} cluster_list = [0] current_new_cluster = 1 mid = 1 predict_action_embedding = [] choose_action = [] end_time = timeit.default_timer() print >> sys.stderr, "TRAINING Use %.3f seconds" % (end_time - start_time) print >> sys.stderr, "save model ..." 
#print "Top k",top_k print "Worker Hits", statistic[ "worker_hits"], "Manager Hits", statistic[ "manager_hits"], "Total", statistic["total"] print "Worker predict last", statistic[ "worker_predict_last"], "Manager predict last", statistic[ "manager_predict_last"] #torch.save(network_model, model_save_dir+"network_model_rl_worker.%d"%echo) #torch.save(ana_network, model_save_dir+"network_model_rl_manager.%d"%echo) print "DEV" metric = performance.performance(dev_docs_iter, worker, manager) print "Average:", metric["average"] #print "DEV manager" #metric = performance_manager.performance(dev_docs_iter,worker,manager) #print "Average:",metric["average"] print "TEST" metric = performance.performance(test_docs_iter, worker, manager) print "Average:", metric["average"] print sys.stdout.flush()
def write_2thetas(sample, ring, num_vecs, dgamma, x1d, y1d, step_num, fwhm0=10, amp0=500, plot_flag=False): # read in dark image dark_path = sample.data_dir+str(sample.dark_dirs[step_num])+'\\ff\\' dark_file = os.listdir(dark_path) assert len(dark_file) == 1 dark_image = DataReader.ge2_reader(dark_path+dark_file[0]) if len(dark_image.shape) > 1: dark_image = np.mean(dark_image, axis=0) # initialize storage arrays vec_gamma = np.linspace(-np.pi+(dgamma/2), np.pi-(dgamma/2), num=num_vecs) two_theta = np.zeros((sample.n_data_pt, num_vecs)) peak_amps = np.zeros((sample.n_data_pt, num_vecs)) peak_errs = np.zeros((sample.n_data_pt, num_vecs)) # loop through each grid point on sample for i_data_pt in range(sample.n_data_pt): # read image dir_num = sample.init_dirs[step_num] + i_data_pt path = sample.data_dir+str(dir_num)+'\\ff\\' file = os.listdir(path) assert len(file) == 1 print('reading image ' + str(dir_num), 'x = '+str(x1d[i_data_pt]), 'y = '+str(y1d[i_data_pt])) image = DataReader.ge2_reader(path+file[0])[0] # only using first image because of shutter timing error image -= dark_image # subtract dark image # generate coordinates of each pixel and calculate radius and vector angle x, y = np.meshgrid(np.arange(image.shape[1], dtype=float), np.arange(image.shape[0], dtype=float)) x -= sample.true_center[1] y -= sample.true_center[0] radius = np.sqrt( x**2 + y**2 ) # covert x,y coordinates into r,omega coordinates gamma = np.arctan2(y, x) # covert x,y coordinates into r,omega coordinates # loop through each diffraction vector for i_vec in range(num_vecs): # grab slice of detector pixels that are within domega of desired omega img_slice = image[np.abs(gamma-vec_gamma[i_vec]) < dgamma] r_slice = radius[np.abs(gamma-vec_gamma[i_vec]) < dgamma] # grab section of slice that is within dr of ring radius img_slice = img_slice[np.abs(r_slice-ring.radius) < ring.dr] r_slice = r_slice[np.abs(r_slice-ring.radius) < ring.dr] # sort selected pixels values by radial coordinate sorted_indices = np.argsort(r_slice) sorted_r = r_slice[sorted_indices] sorted_peak = img_slice[sorted_indices] # fit peak to sorted selected pixel values ctr_ind, lo_ind, hi_ind = PeakFitting.get_peak_fit_indices(sorted_peak) peak_bg_rm, _ = PeakFitting.RemoveBackground(sorted_r, sorted_peak, sorted_r[lo_ind], sorted_r[hi_ind]) peak_fit, p_opt, err = PeakFitting.fitPeak(sorted_r, peak_bg_rm, sorted_r[ctr_ind], fwhm0, amp0) # calculate 2 theta opp = p_opt[0] adj = sample.detector_dist two_theta[i_data_pt, i_vec] = np.arctan(opp/adj) # store peak amplitude and relative error peak_amps[i_data_pt, i_vec] = p_opt[3] peak_errs[i_data_pt, i_vec] = err if plot_flag: plt.close('all') fig = plt.figure() ax = fig.add_subplot(111) ax.plot(sorted_r, sorted_peak, 'ok') ax.plot(sorted_r, peak_bg_rm, 'or') ax.plot(sorted_r, peak_fit, '-r') ax.text(0.01, 0.92, 'ctr = '+str(opp), transform=ax.transAxes, color='k', fontsize=14) if err < 0.5: ax.text(0.01, 0.85, 'err = '+str(err), transform=ax.transAxes, color='k', fontsize=14) else: ax.text(0.01, 0.85, 'err = '+str(err), transform=ax.transAxes, color='r', fontsize=14) plt.savefig(ring.peak_dir+str(i_data_pt)+'_'+str(vec_gamma[i_vec])+'.png') plt.close('all') if plot_flag: plt.close('all') plt.imshow(image, vmin=0, vmax=200) plt.savefig(ring.peak_dir+str(i_data_pt)+'_image.png') plt.close('all') # write data to a text file out_path = ring.peak_dir+sample.step_names[step_num]+'_peakfit_results.txt' out_file = open(out_path, 'w') for i_data_pt in range(sample.n_data_pt): 
out_file.write('%24.16f'%x1d[i_data_pt] + '\\t') out_file.write('%24.16f'%y1d[i_data_pt] + '\\t') for i_vec in range(num_vecs): out_file.write('%24.16f'%vec_gamma[i_vec] + '\\t') out_file.write('%24.16f'%two_theta[i_data_pt, i_vec] + '\\t') out_file.write('%24.16f'%peak_amps[i_data_pt, i_vec] + '\\t') out_file.write('%24.16f'%peak_errs[i_data_pt, i_vec] + '\\t') out_file.write('\n') out_file.close()
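# A small hedged sketch of reading the peak-fit results file written above back into a
# numpy array, one row per grid point. It tolerates either a real tab or a literal
# "\t" marker between the '%24.16f' fields; the project's own reader is not shown here.
import numpy as np

def read_peakfit_results(out_path):
    rows = []
    with open(out_path) as f:
        for line in f:
            cleaned = line.replace('\\t', ' ')  # turn literal \t markers into spaces
            fields = cleaned.split()
            if fields:
                rows.append([float(tok) for tok in fields])
    return np.array(rows)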
za = np.linspace(z_range[1], z_range[0], num=z_num, endpoint=True) x2d, z2d = np.meshgrid(xa, za) if specimen_name == 'al7075_mlf': xa = np.linspace(x_range[0], x_range[1], num=x_num, endpoint=True) za = np.linspace(z_range[0], z_range[1], num=z_num, endpoint=True) x2d, z2d = np.meshgrid(xa, za) x1d, z1d = x2d.flatten(), z2d.flatten() #%% read peak diameters if they have been fit, if not fit here orient = 'h' try: x, z, diams = [], [], [] for i_step in range(sample.n_load_step): txt_data = DataReader.read_data_from_text(ring.out_dir + sample.step_names[i_step] + '_diams_' + orient + '.txt') x.append(txt_data[:, 0]), z.append(txt_data[:, 1]), diams.append(txt_data[:, 2]) except: l_centers, l_errs = np.zeros( (sample.n_load_step, sample.n_data_pt)), np.zeros( (sample.n_load_step, sample.n_data_pt)) u_centers, u_errs = np.zeros( (sample.n_load_step, sample.n_data_pt)), np.zeros( (sample.n_load_step, sample.n_data_pt)) diams = np.zeros((sample.n_load_step, sample.n_data_pt)) for i_step in range(sample.n_load_step): l_centers[i_step, :], l_errs[i_step, :], u_centers[i_step, :], u_errs[
#Nina Renken import numpy as np import matplotlib.pyplot as plt import DataReader as reader import datetime as dt import pandas as pd data_weather = reader.get_weather_data() data_weather_avg = data_weather.groupby('Zeitstempel').mean() data_covid = reader.get_covid_data() data_covid_nds = data_covid[data_covid.Bundesland.eq('Niedersachsen')] data_covid_avg = data_covid_nds.groupby('Meldedatum').sum() data_covid_avg = data_covid_avg.reset_index() # Prepare the weather data df_list = [] for row in data_weather_avg.itertuples(): datum = row.Index zeitstempel = dt.datetime.strptime(str(datum), '%Y%m%d').strftime('%Y/%m/%d') df_list.append([zeitstempel, row.Wert]) df_weather = pd.DataFrame(df_list, columns=['Datum', 'Temperatur']) df_weather.plot(x='Datum', y='Temperatur', rot=90) data_covid_avg.plot(x='Meldedatum', y='AnzahlFall', rot=90) data = pd.merge(data_covid_avg, df_weather, left_on='Meldedatum', right_on='Datum')
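# A small follow-up one might add at this point (illustrative only): with the case
# counts and temperatures merged on the date columns, the relationship can be
# inspected directly via the correlation and a scatter plot.
print(data[['Temperatur', 'AnzahlFall']].corr())
data.plot.scatter(x='Temperatur', y='AnzahlFall')
plt.show()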
def main(): DIR = args.DIR embedding_file = args.embedding_dir network_file = "./model/model.pkl" if os.path.isfile(network_file): print >> sys.stderr,"Read model from ./model/model.pkl" network_model = torch.load(network_file) else: embedding_matrix = numpy.load(embedding_file) "Building torch model" network_model = network.Network(pair_feature_dimention,mention_feature_dimention,word_embedding_dimention,span_dimention,1000,embedding_size,embedding_dimention,embedding_matrix).cuda() print >> sys.stderr,"save model ..." torch.save(network_model,network_file) reduced="" if args.reduced == 1: reduced="_reduced" train_docs = DataReader.DataGnerater("train"+reduced) dev_docs = DataReader.DataGnerater("dev"+reduced) test_docs = DataReader.DataGnerater("test"+reduced) l2_lambda = 1e-5 lr = 0.002 dropout_rate = 0.5 shuffle = True times = 0 best_thres = 0.5 model_save_dir = "./model/pretrain/" last_cost = 0.0 for echo in range(30): start_time = timeit.default_timer() print "Pretrain Epoch:",echo optimizer = optim.RMSprop(network_model.parameters(), lr=lr, weight_decay=l2_lambda) cost_this_turn = 0.0 pos_num = 0 neg_num = 0 inside_time = 0.0 loss = None for data,doc_end in train_docs.generater(shuffle): ana_word_index,ana_span,ana_feature,candi_word_index,candi_span,pair_feature_array,target,mention_ids = data if len(pair_feature_array) >= 500: continue if len(target) == 0: continue mention_index = autograd.Variable(torch.from_numpy(ana_word_index).type(torch.cuda.LongTensor)) mention_span = autograd.Variable(torch.from_numpy(ana_span).type(torch.cuda.FloatTensor)) mention_feature = autograd.Variable(torch.from_numpy(ana_feature).type(torch.cuda.FloatTensor)) candi_index = autograd.Variable(torch.from_numpy(candi_word_index).type(torch.cuda.LongTensor)) candi_spans = autograd.Variable(torch.from_numpy(candi_span).type(torch.cuda.FloatTensor)) pair_feature = autograd.Variable(torch.from_numpy(pair_feature_array).type(torch.cuda.FloatTensor)) gold = [0] + target.tolist() if sum(target) == 0: neg_num += 1 gold[0] = 1 else: pos_num += 1 inside_time_start = timeit.default_timer() lable = autograd.Variable(torch.cuda.FloatTensor([gold])) output,scores = network_model.forward(word_embedding_dimention,mention_index,mention_span,mention_feature,mention_index,mention_span,candi_index,candi_spans,pair_feature,dropout_rate) optimizer.zero_grad() loss = F.binary_cross_entropy(output,lable) loss.backward() optimizer.step() inside_time += (timeit.default_timer()-inside_time_start) cost_this_turn += loss.data[0] end_time = timeit.default_timer() print >> sys.stderr, "PreTrain",echo,"Total cost:",cost_this_turn print >> sys.stderr, "PreTRAINING Use %.3f seconds"%(end_time-start_time) print >> sys.stderr, "Inside Use %.3f seconds"%(inside_time) print >> sys.stderr, "Neg:Pos",neg_num,pos_num print >> sys.stderr, "Learning Rate",lr if cost_this_turn > last_cost: lr = lr*0.7 last_cost = cost_this_turn print >> sys.stderr,"save model ..." best_thres = Evaluate.evaluate(network_model,dev_docs,best_thres)
import tensorflow as tf from CellSeg_CNN import * import numpy as np import DataReader #Reading the images data_reader = DataReader.DataReader() input_reader = data_reader.input_reader training_images = data_reader.training_images if (input_reader.use_data_rotation): rotated_images = data_reader.pi_half_rotated_images number_of_training_images = np.size(training_images, axis=0) image_height = np.size(training_images, axis=1) image_width = np.size(training_images, axis=2) test_images = data_reader.test_images number_of_test_images = np.size(test_images, axis=0) #Reading the ground truth classes [training_classes, training_defined_samples] = data_reader.training_classes if (input_reader.use_data_rotation): [rotated_classes, rotated_defined_mask] = data_reader.pi_half_rotated_classes_and_masks [test_classes, test_defined_samples] = data_reader.test_classes #Reading parameters learning_rate = input_reader.learning_rate regularisation_param = tf.constant(input_reader.regularisation_parameter) n_epochs = input_reader.number_of_epochs tensorboard_file_location = input_reader.tensorboard_location input_patch_width = input_reader.input_patch_width
def print_dict_for_line_chart(d): # sort dict sorted_keys = sorted(d) print "--- period ---" for key in sorted_keys: print "'%s'," % key print "--- amounts ---" for key in sorted_keys: amount_for_key = d[key] / MILLION print "'%s'," % amount_for_key if __name__ == "__main__": fundings = DataReader.read_funding_data(funding_data) startups_per_state = {} total_funding_per_state = {} states_funding_per_company = {} funding_by_year = {} funding_by_month = {} get_fundings_per_year() get_fundings_per_month() get_total_funding_per_state() get_fundings_per_startup_per_state() print_dict_to_geochart(startups_per_state) print_dict_to_geochart(total_funding_per_state) print_dict_for_line_chart(funding_by_year) print_dict_for_line_chart(funding_by_month)
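# For symmetry with print_dict_for_line_chart above, a plausible sketch of the
# print_dict_to_geochart helper called in __main__ (its real implementation is not
# shown): emit "['state', value]," rows that can be pasted into a Google GeoChart
# DataTable.
def print_dict_to_geochart(d):
    print("--- geochart rows ---")
    for key in sorted(d):
        print("['%s', %s]," % (key, d[key]))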