def plotDaily(
    filename = None,
    US = True,
    places = util.TEST_STATES,
    cases = True,
    day = util.TEST_DATE,
    dark_mode = True
):
    """Save a bar chart of the daily count of each place on a single day.

    The table returned by ``util.loadData`` is summed per place, the columns
    from ``util.START_DATE`` onward are differenced along the column axis to
    turn cumulative totals into daily counts, and the value in column ``day``
    is plotted for every entry of ``places``.

    Args:
        filename: Output image path. When falsy, defaults to
            ``"<label>_<day with '/' replaced by '-'>.png"``.
        US: Group by ``'Province_State'`` when true, else ``'Country/Region'``.
        places: Names to look up in the grouping column.
        cases: Forwarded to ``util.loadData``; also selects the chart label
            (``'Cases'`` or ``'Deaths'``).
        day: Column label of the day to plot.
        dark_mode: Apply matplotlib's ``'dark_background'`` style (this is a
            global style change and stays in effect after the call).

    Raises:
        ValueError: If a place has no row in the grouped data.
    """
    column = 'Province_State' if US else 'Country/Region'
    df = util.loadData(US=US, cases=cases).groupby(column).sum().reset_index()

    if dark_mode:
        plt.style.use('dark_background')
    colors = plt.cm.Reds(np.linspace(0.35, 0.65, len(places)))

    values = []
    for place in places:
        cumulative_data = df[df[column] == place]
        if cumulative_data.empty:
            raise ValueError(f'{place!r} not found in column {column!r}')
        start_column = cumulative_data.columns.get_loc(util.START_DATE)
        # convert total counts to daily counts
        counts = cumulative_data.iloc[:, start_column:].diff(axis=1)
        # counts[day] is a one-row Series; extract the scalar explicitly
        # because int(Series) is deprecated in recent pandas releases.
        values.append(int(counts[day].iloc[0]))

    plt.bar(places, values, color=colors)
    label = 'Cases' if cases else 'Deaths'
    plt.title(f'{label}, {day}')
    plt.ylabel(f'{label}')
    filename = filename if filename else f"{label}_{day.replace('/', '-')}.png"
    plt.savefig(filename)
    plt.close()
def __init__(self):
    """Initialise score trackers and load the held-out evaluation split.

    The records from ``data/train_pre.json`` are loaded and the last 20 %
    (everything from index ``int(len(data) * 0.8)``) is kept as test data:
    ``test_x`` holds each record's text and ``test_y`` the
    ``(mention, offset)`` pairs of its mentions whose ``kb_id`` is not
    ``'NIL'``.
    """
    self.F1 = []
    self.best = 0.
    self.best_ment = 0.

    data = loadData(path + '/' + "data/train_pre.json")
    held_out = data[int(len(data) * 0.8):]

    self.test_x = [record['text'] for record in held_out]

    gold = []
    for record in held_out:
        linked = [(m['mention'], m['offset'])
                  for m in record['mention_data']
                  if m['kb_id'] != 'NIL']
        gold.append(linked)
    self.test_y = gold
def trainBatch(net, criterion, optimizer):
    """Run one optimisation step on the next training batch.

    Args:
        net: Accepted for interface compatibility.
            NOTE(review): the body operates on the module-level ``crnn``,
            not on ``net`` -- confirm they are always the same object.
        criterion: Loss callable invoked as
            ``criterion(preds, text, preds_size, length)``.
        optimizer: Optimizer whose ``step()`` is called after backprop.

    Returns:
        The criterion value divided by the batch size.

    Relies on module-level ``train_iter``, ``image``, ``text``, ``length``,
    ``converter`` and ``crnn``.
    """
    # The builtin next() works with both the Python 2 (.next) and the
    # Python 3 (__next__) iterator protocols.
    cpu_images, cpu_texts = next(train_iter)
    batch_size = cpu_images.size(0)

    # Hand the batch and its encoded labels to the shared module-level buffers.
    util.loadData(image, cpu_images)
    t, l = converter.encode(cpu_texts)
    util.loadData(text, t)
    util.loadData(length, l)

    preds = crnn(image)
    # One entry per sample, each equal to preds.size(0).
    preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
    cost = criterion(preds, text, preds_size, length) / batch_size

    crnn.zero_grad()
    cost.backward()
    optimizer.step()
    return cost
def plotTimeSeries(
    filename = None,
    US = True,
    places = util.TEST_STATES,
    cases = True,
    num_days = 7,
    end_date = None,
    dark_mode = True
):
    """Save a line chart of daily counts over a window of days.

    Cumulative totals from ``util.loadData`` are summed per place and
    differenced along the column axis to obtain daily counts. One line is
    drawn per entry of ``places`` over ``num_days`` consecutive date columns;
    the window ends ``offset`` columns before the last one, where ``offset``
    is ``getOffset(df, end_date)`` when ``end_date`` is given and 0 otherwise.

    Args:
        filename: Output image path. When falsy, defaults to
            ``"<label>_last_<num_days>.png"``.
        US: Group by ``'Province_State'`` when true, else ``'Country/Region'``.
        places: Names to look up in the grouping column.
        cases: Forwarded to ``util.loadData``; also selects the label
            (``'Cases'`` or ``'Deaths'``).
        num_days: Number of date columns in the plotted window.
        end_date: Optional value passed to ``getOffset`` to shift the window.
        dark_mode: Apply matplotlib's ``'dark_background'`` style (a global
            style change that stays in effect after the call).

    Raises:
        ValueError: If a place has no row in the grouped data.
    """
    column = 'Province_State' if US else 'Country/Region'
    df = util.loadData(US=US, cases=cases).groupby(column).sum().reset_index()

    if dark_mode:
        plt.style.use('dark_background')
    colors = plt.cm.Oranges(np.linspace(0.35, 0.65, len(places)))

    offset = getOffset(df, end_date) if end_date else 0
    x_values = None
    for index, place in enumerate(places):
        cumulative_data = df[df[column] == place]
        if cumulative_data.empty:
            raise ValueError(f'{place!r} not found in column {column!r}')
        start_column = cumulative_data.columns.get_loc(util.START_DATE)
        # convert total counts to daily counts
        counts = cumulative_data.iloc[:, start_column:].diff(axis=1)
        x_values = list(counts.columns[-(num_days + offset):len(counts.columns) - offset])
        # counts[col] is a one-row Series; extract the scalar explicitly
        # because int(Series) is deprecated in recent pandas releases.
        y_values = [int(counts[col].iloc[0]) for col in x_values]
        plt.plot(x_values, y_values, label=place, color=colors[index], linewidth=2)

    label = 'Cases' if cases else 'Deaths'
    plt.title(f'Daily {label}, Last {num_days} Days')
    # control the number of date tick marks
    skip = max(num_days // 5, 1)
    # x_values stays None when `places` is empty; there is nothing to tick then.
    if x_values is not None:
        plt.xticks(x_values[::skip])
    plt.xlabel('Date')
    plt.ylabel(f'{label}')
    plt.legend()
    filename = filename if filename else f'{label}_last_{num_days}.png'
    plt.savefig(filename)
    plt.close()
def plotCaseMap(
    filename = None,
    US = True,
    day = util.TEST_DATE,
    dark_mode = True
):
    """Write a choropleth image of the daily case count for one day.

    The table from ``util.loadData`` is summed per state (US) or per country,
    differenced along the column axis, and the column ``day`` of that
    difference is stored as ``'Cases'`` and used as the map colour.

    Args:
        filename: Output image path; defaults to ``'usa_chart.png'`` or
            ``'global_chart.png'`` when ``None``.
        US: Draw US states when true, otherwise countries on a world map.
        day: Column label of the day to map.
        dark_mode: Use the ``'plotly_dark'`` template when true.
    """
    raw = util.loadData(US=US)
    dates = list(raw.columns)
    group_field = 'Province_State' if US else 'Country/Region'
    df = raw.groupby(group_field)[dates].agg('sum')
    df['Cases'] = df.diff(axis=1)[day]

    # Column holding the location identifier handed to plotly.
    region_field = 'State' if US else 'Country'
    if US:
        df['State'] = [util.STATE_TO_ABBREV.get(name, None) for name in list(df.index)]
    else:
        df['Country'] = df.index

    # Scopes plotly accepts for the non-US map; only the first one is used.
    global_scopes = ['world', 'europe', 'africa', 'asia', 'south america', 'north america']
    if US:
        location_mode, scope = 'USA-states', 'usa'
    else:
        location_mode, scope = 'country names', global_scopes[0]

    fig = px.choropleth(
        df,
        locations = region_field,
        locationmode = location_mode,
        scope = scope,
        color = 'Cases',
        hover_name = region_field,
        # projection = 'miller',
        color_continuous_scale = 'Peach',
        template = 'plotly_dark' if dark_mode else None,
        title = f"{'US' if US else 'Global'} Daily Cases, {day}",
        width = 1000,
        # height = 500,
        range_color = [0,3000]
    )

    if filename is None:
        filename = 'usa_chart.png' if US else 'global_chart.png'
    fig.update_layout(margin={'l': 0, 'r': 0, 't': 70, 'b': 20}, title={'font': {'size': 20}, 'x':0.5})
    fig.write_image(filename, engine='kaleido')
def runTextModelEval(textModelName = None, PATH = '../model/doc2vec/'):
    '''
    Given a list of existing text-model names, load them and print the
    baseline results one by one.
    Baseline evaluation: see baseline_classification.py

    Models that fail to load (an exception, or a ``None`` result) are
    reported with 'Failed on <name>' and skipped.

    @param:
        textModelName, a list of TextModel names (defaults to an empty list)
        PATH, the path to the TextModel folder, default ../model/doc2vec/
    @return:
        null
    '''
    if textModelName is None:
        textModelName = []

    [all_data, train_size, test_size, train_x, train_y, test_x] = util.loadData()
    # The result is unused here; the call is kept in case it has side
    # effects on all_data (cannot be verified from this function).
    util.data_preprocess(all_data)
    svd = TruncatedSVD(n_components=GENE_INPUT_DIM, random_state=12)

    for textModel in textModelName:
        try:
            model = wel.loadTextModel(PATH + textModel)
        except Exception:
            # Best-effort loading, but let KeyboardInterrupt/SystemExit through.
            print('Failed on ' + textModel)
            continue
        if model is None:
            print('Failed on ' + textModel)
            continue

        text_train_arrays, text_test_arrays = wel.getTextVec(model, train_size, test_size, 200)
        truncated_one_hot_gene = wel.getGeneVec(all_data, svd)
        truncated_one_hot_variation = wel.getVariationVec(all_data, svd)

        # Training features: gene, variation and text vectors side by side.
        train_set = np.hstack((truncated_one_hot_gene[:train_size], truncated_one_hot_variation[:train_size], text_train_arrays))

        encoded_y = np.array(pd.get_dummies(train_y))
        X = np.array(train_set)
        y = np.array(bc.getLabels(encoded_y))

        print('Results for TextModel: ' + textModel)
        bc.baseline(X, y)
def train(self):
    """Train for up to 20001 steps, evaluating every 100 steps.

    Each step feeds a training batch of 64 with ``_prob`` 0.75. Every 100th
    step a test batch of 100 is evaluated with ``_prob`` 1, the merged
    summary is written, and training stops early once accuracy reaches 1.0.
    The session is saved afterwards with the final step as ``global_step``.
    """
    # The loaded values are not used below; the call and unpacking are kept.
    train_x, train_y, test_x, test_y, trainIndexs, testIndexs = loadData()

    EPOCHS = 20000
    for epoch in range(EPOCHS + 1):
        batch_image, batch_text = getTrainBatch(64)
        train_feed = {
            self._x: batch_image,
            self._y: batch_text,
            self._prob: 0.75
        }
        _, loss = self.sess.run([self.train_step, self.cross_entropy],
                                feed_dict=train_feed)
        print('epoch', epoch, '/loss', loss)

        if epoch % 100 != 0:
            continue

        test_batch_image, test_batch_text = getTestBatch(100)
        eval_feed = {
            self._x: test_batch_image,
            self._y: test_batch_text,
            self._prob: 1
        }
        accu = self.accuracy.eval(feed_dict=eval_feed)
        s = self.merged_summary.eval(feed_dict=eval_feed)
        self.writer.add_summary(s, epoch)
        print('epoch', epoch, '/accuracy', accu)
        if accu >= 1.0:
            break
    else:
        # No early stop: the step counter ends one past the last step run.
        epoch = EPOCHS + 1

    self.saver.save(self.sess, model_path + '/', global_step=epoch)
class OurTokenizer(Tokenizer):
    """Character-level tokenizer: one output token per input character."""

    def _tokenize(self, text):
        """Map every character of *text* to a vocabulary token.

        Characters present in the token dictionary are kept as they are,
        whitespace becomes '[unused1]' and anything else becomes '[UNK]'.
        """
        tokens = []
        for ch in text:
            if ch in self._token_dict:
                tokens.append(ch)
                continue
            # Whitespace is represented by the untrained [unused1] token;
            # every remaining character falls back to [UNK].
            tokens.append('[unused1]' if self._is_space(ch) else '[UNK]')
        return tokens


tokenizer = OurTokenizer(token_dict)
char_size = 512  # 768
data = loadData(path + '/' + 'data/train_pre.json')
# data=[ for line in data]
# First 80 % of the records for training, the rest for validation.
train_data = data[:int(len(data) * 0.8)]
valid_data = data[int(len(data) * 0.8):]
dataByAlias, dataBySubjectId = loadDataBase(path + '/' + 'data/kb_data')


def seq_padding(X, padding=0):
    """Right-pad every sequence in *X* with *padding* to the longest length.

    Sequences that already have the maximum length are passed through
    unchanged; the rows are then stacked with ``np.array``.
    """
    longest = max([len(seq) for seq in X])
    rows = []
    for seq in X:
        missing = longest - len(seq)
        if missing > 0:
            rows.append(np.concatenate([seq, [padding] * missing]))
        else:
            rows.append(seq)
    return np.array(rows)
def extractLocalFeature(self): success_list, failure_list = util.getSubjectFileList( self.record_root_path, [self.subject], self.task) # Divide it into training and test set # ------------------------------------------------------------- # ------------------------------------------------------------- # loading and time-sync d = util.loadData(success_list) force_array = None for idx in xrange(len(d['timesList'])): if force_array is None: force_array = d['ftForceList'][idx] else: force_array = np.hstack([force_array, d['ftForceList'][idx]]) from sklearn.decomposition import PCA pca = PCA(n_components=1) res = pca.fit_transform(force_array.T) # ------------------------------------------------------------- # loading and time-sync d = util.loadData(failure_list) # extract local features r = 0.25 for idx in xrange(len(d['timesList'])): timeList = d['timesList'][idx] audioAzimuth = d['audioAzimuthList'][idx] audioPower = d['audioPowerList'][idx] kinEEPos = d['kinEEPosList'][idx] kinEEQuat = d['kinEEQuatList'][idx] kinEEPos = d['kinEEPosList'][idx] kinEEQuat = d['kinEEQuatList'][idx] ftForce = d['ftForceList'][idx] kinTargetPos = d['kinTargetPosList'][idx] kinTargetQuat = d['kinTargetQuatList'][idx] # Unimoda feature - Audio -------------------------------------------- unimodal_audioPower = [] for time_idx in xrange(len(timeList)): ang_max, ang_min = self.getAngularSpatialRF( kinEEPos[:, time_idx], r) if audioAzimuth[time_idx] > ang_min and audioAzimuth[ time_idx] < ang_max: unimodal_audioPower.append(audioPower[time_idx]) else: unimodal_audioPower.append( power_min) # or append white noise? 
## power_max = np.amax(d['audioPowerList']) ## power_min = np.amin(d['audioPowerList']) ## self.audio_disp(timeList, audioAzimuth, audioPower, audioPowerLocal, \ ## power_min=power_min, power_max=power_max) # Unimodal feature - Kinematics -------------------------------------- unimodal_kinVel = [] # Unimodal feature - Force ------------------------------------------- # ftForceLocal = np.linalg.norm(ftForce, axis=0) #* np.sign(ftForce[2]) unimodal_ftForce = pca.transform(ftForce.T).T ## self.ft_disp(timeList, ftForce, ftForceLocal) # Crossmodal feature - relative dist, angle -------------------------- crossmodal_relativeDist = np.linalg.norm(kinTargetPos - kinEEPos, axis=0) crossmodal_relativeAng = [] for time_idx in xrange(len(timeList)): startQuat = kinEEQuat[:, time_idx] endQuat = kinTargetQuat[:, time_idx] diff_ang = qt.quat_angle(startQuat, endQuat) crossmodal_relativeAng.append(abs(diff_ang))
@File : test.py
@Time : 2019/5/27 8:32
@Author : Blue Keroro
"""
import sys
import os
# Make the parent directory of this file importable (for `util`).
curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)
from util import loadDataBase, loadData
import json

if __name__ == '__main__':
    # Find records whose mentions overlap: the summed mention lengths then
    # exceed the number of distinct text positions the mentions cover.
    cnt = 0
    data = loadData('data/train.json')
    delete = list()
    for sub in data:
        # One flag per character of the text; 1 = covered by some mention.
        # NOTE(review): this name shadows the builtin `map`.
        map = [0 for i in sub['text']]
        length = 0
        for ment in sub['mention_data']:
            length += len(ment['mention'])
            for index in range(int(ment['offset']),
                               int(ment['offset']) + len(ment['mention'])):
                map[index] = 1
        # Subtract the covered positions; a non-zero remainder means overlap.
        for i in map:
            if i == 1:
                length -= 1
        if length != 0:
            print(sub)
            delete.append(sub)
            cnt += 1
def val(net, test_dataset, criterion, max_iter=100):
    """Evaluate on ``test_dataset`` and print edit-distance statistics.

    Every batch of the dataset is run through the module-level ``crnn``;
    predictions are decoded with ``converter`` and compared with the ground
    truth by edit distance on the encoded sequences. Sample predictions of
    the last batch and the normalised accuracy are printed.

    Args:
        net: Model switched to eval mode.
            NOTE(review): the forward pass and the ``requires_grad`` reset
            use the module-level ``crnn`` instead -- confirm both refer to
            the same object.
        test_dataset: Dataset wrapped in a non-shuffling ``DataLoader``.
        criterion: Loss callable invoked as
            ``criterion(preds, text, preds_size, length)``.
        max_iter: Ignored; it is overwritten with ``len(data_loader)`` so the
            whole dataset is always evaluated (kept for compatibility).

    Returns:
        None. Results are printed only.
    """
    print('Start val')

    for p in crnn.parameters():
        p.requires_grad = False

    # layer_dict = net.state_dict()
    # print(layer_dict['cnn.conv1.weight'])

    net.eval()
    data_loader = torch.utils.data.DataLoader(test_dataset,
                                              shuffle=False,
                                              batch_size=opt.batchSize,
                                              num_workers=int(opt.workers),
                                              collate_fn=dataset.alignCollate(
                                                  imgH=32,
                                                  imgW=100,
                                                  keep_ratio=True))
    val_iter = iter(data_loader)

    n = 0
    n_correct = 0  # despite the name: accumulated edit distance
    n_text = 0     # accumulated encoded target length
    loss_avg = util.averager()

    max_iter = len(data_loader)
    for _batch in range(max_iter):
        # The builtin next() works with both the Python 2 and Python 3
        # iterator protocols.
        cpu_images, cpu_texts = next(val_iter)
        batch_size = cpu_images.size(0)
        util.loadData(image, cpu_images)
        t, l = converter.encode(cpu_texts)
        util.loadData(text, t)
        util.loadData(length, l)

        preds = crnn(image)
        preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
        cost = criterion(preds, text, preds_size, length) / batch_size
        loss_avg.add(cost)

        _, preds = preds.max(2)
        #preds = preds.squeeze(2)
        preds = preds.transpose(1, 0).contiguous().view(-1)
        sim_preds = converter.decode(preds.data, preds_size.data, raw=False)
        for pred, target in zip(sim_preds, cpu_texts):
            if not isinstance(target, unicode):
                target = target.decode('utf-8')
            pred_encode, _ = converter.encode(pred)
            target_encode, _ = converter.encode(target)
            n_correct += editdistance.eval(pred_encode, target_encode)
            n_text += len(target_encode)
            n += 1

    # Show raw vs. simplified predictions for the last evaluated batch.
    raw_preds = converter.decode(preds.data, preds_size.data,
                                 raw=True)[:opt.n_test_disp]
    for raw_pred, sim_pred, gt in zip(raw_preds, sim_preds, cpu_texts):
        gt = gt.lower()
        print('%-20s => %-20s, gt: %-20s' % (raw_pred, sim_pred, gt))

    len_edit = n_correct / float(n)
    len_text = n_text / float(n)
    # 1 - (mean edit distance / mean target length)
    norm = 1 - len_edit / len_text
    print('editd dist: %f, norm acc: %f' % (n_correct, norm))
def loadEvents(self, path):
    """Load events from *path* and build the combined track table.

    ``self.events`` receives the result of ``util.loadData``;
    ``self.all_tracks`` is the concatenation of those events with a fresh
    index.
    """
    events = util.loadData(filename=path)
    self.events = events
    # ignore_index=True renumbers the rows of the combined table.
    self.all_tracks = pandas.concat(self.events, ignore_index=True)
@author: zz
"""
# Precision notes for this experiment
# (presumably digits before/after the point -- TODO confirm):
#Input: XX.X
#Random Weights: X.X
#Activation: X.X
#Linear Weights: X.X (trained in real value)
# error rate: 7.9 % [0.1 MNIST lite]
from util import loadData, normalizeData, around
from FPGA_RF_CIW_ELM import FPGA_RF_CIW_ELM
import numpy as np
# Value passed as N_bits to around() below.
prec = 3
train_data, train_label, test_data, test_label = loadData(0.1)
feature_dim = train_data.shape[1]
label_dim = train_label.shape[1]
train_data = normalizeData(train_data)#/5000#int(feature_dim*1)
test_data = normalizeData(test_data)#/5000#int(feature_dim*1)
# Every label entry equal to 1 is replaced by 250 in both sets.
train_label[train_label==1] = 250
test_label[test_label==1] = 250
# around() presumably rounds to the given precision -- confirm in util.
train_data = around(train_data, N_bits=prec)
test_data = around(test_data, N_bits=prec)
fpga_elm = FPGA_RF_CIW_ELM(28, 28, feature_dim*10, label_dim, 'lite', 'rf-ciw', train_data, train_label, \
                           H=0.25, randomPrec=3, actPrec=3, linearPrec=3, callPrec=3, fixedTrain = False, fixedTest=True)
print "Training data max dim:", np.max(train_data)
print "Training data min dim:", np.min(train_data)
#szs = [] #szs.append(sz) preds_size = Variable(torch.IntTensor([preds.size(0)])) tmp = nm.split('.')[0] tmp2 = tmp + '.txt' lex_path = lex_dir + tmp2 txt = open(lex_path).read() wds = txt.splitlines() len_lexicon = len(wds) lex_pred = [] lexicon = [] for wd in wds: lexicon.append(wd) t, l = converter.encode(wd) util.loadData(text, t) util.loadData(length, l) cost = criterion(preds, text, preds_size, length) tmp4 = cost.data[0] lex_pred.append(tmp4) idx = lex_pred.index(min(lex_pred)) finnal_pred = lexicon[idx] print('%-20s => %-20s' % (finnal_pred, lexicon[0])) _, preds = preds.max(2) preds = preds.transpose(1, 0).contiguous().view(-1) preds_size = Variable(torch.IntTensor([preds.size(0)])) raw_pred = converter.decode(preds.data, preds_size.data, raw=True) sim_pred = converter.decode(preds.data, preds_size.data, raw=False)
def extractLocalFeature(self): success_list, failure_list = util.getSubjectFileList(self.record_root_path, [self.subject], self.task) # Divide it into training and test set # ------------------------------------------------------------- # ------------------------------------------------------------- # loading and time-sync d = util.loadData(success_list) force_array = None for idx in xrange(len(d['timesList'])): if force_array is None: force_array = d['ftForceList'][idx] else: force_array = np.hstack([force_array, d['ftForceList'][idx] ]) from sklearn.decomposition import PCA pca = PCA(n_components=1) res = pca.fit_transform( force_array.T ) # ------------------------------------------------------------- # loading and time-sync d = util.loadData(failure_list) # extract local features r = 0.25 for idx in xrange(len(d['timesList'])): timeList = d['timesList'][idx] audioAzimuth = d['audioAzimuthList'][idx] audioPower = d['audioPowerList'][idx] kinEEPos = d['kinEEPosList'][idx] kinEEQuat = d['kinEEQuatList'][idx] kinEEPos = d['kinEEPosList'][idx] kinEEQuat = d['kinEEQuatList'][idx] ftForce = d['ftForceList'][idx] kinTargetPos = d['kinTargetPosList'][idx] kinTargetQuat = d['kinTargetQuatList'][idx] # Unimoda feature - Audio -------------------------------------------- unimodal_audioPower = [] for time_idx in xrange(len(timeList)): ang_max, ang_min = self.getAngularSpatialRF(kinEEPos[:,time_idx], r) if audioAzimuth[time_idx] > ang_min and audioAzimuth[time_idx] < ang_max: unimodal_audioPower.append(audioPower[time_idx]) else: unimodal_audioPower.append(power_min) # or append white noise? 
## power_max = np.amax(d['audioPowerList']) ## power_min = np.amin(d['audioPowerList']) ## self.audio_disp(timeList, audioAzimuth, audioPower, audioPowerLocal, \ ## power_min=power_min, power_max=power_max) # Unimodal feature - Kinematics -------------------------------------- unimodal_kinVel = [] # Unimodal feature - Force ------------------------------------------- # ftForceLocal = np.linalg.norm(ftForce, axis=0) #* np.sign(ftForce[2]) unimodal_ftForce = pca.transform(ftForce.T).T ## self.ft_disp(timeList, ftForce, ftForceLocal) # Crossmodal feature - relative dist, angle -------------------------- crossmodal_relativeDist = np.linalg.norm(kinTargetPos - kinEEPos, axis=0) crossmodal_relativeAng = [] for time_idx in xrange(len(timeList)): startQuat = kinEEQuat[:,time_idx] endQuat = kinTargetQuat[:,time_idx] diff_ang = qt.quat_angle(startQuat, endQuat) crossmodal_relativeAng.append( abs(diff_ang) )
import layer
from speakerReg import speakerReg

# Locations of the training / test TFRecords.
TrainDataPath = '../../vcc2016/TFrecords/Time/Train/'
TestDataPath = '../../vcc2016/TFrecords/Time/Test/'

dataSize = 513
latentSize = 64
speakerN = 10   # width of the `label` placeholder and Regarch['speaker_dim']
N = 500         # channel count of the gated CNN; width of `latent`
L = 80          # kernel width of the gated CNN; passed to loadData
tstep = 100     # passed to loadData; `source` rows hold tstep*L values
hidNum = 1000
lamb = 0

# Time the data loading.
tS = time.time()
trainData, Label = loadData(TrainDataPath, L, tstep)
# testData = loadData(TestDataPath)
tE = time.time()
print("loading data time: %f" % (tE-tS))

# Architecture dictionaries handed to layer.gatedCNN and speakerReg.
CGNNarch = {'channel' : N, 'kernel': [1, L], 'stride': [1,1]}
Regarch = {'channel' : [16, 32], 'kernel': [[1, 512], [1, 3]], 'stride': [[1,250], [1,2]], 'speaker_dim': speakerN}

source = tf.placeholder(tf.float32, shape = [None, tstep*L])
label = tf.placeholder(tf.float32, shape = [None, speakerN])
latent = tf.placeholder(tf.float32, shape = [None, N])

RegNet_en = speakerReg(Regarch, 'RegNet_en')
RegNet_de = speakerReg(Regarch, 'RegNet_de')

# View each flat input row as a (tstep, L, 1) block for the convolution.
x = tf.reshape(source, [-1, tstep, L, 1])
GCNN_en1 = layer.gatedCNN(x, CGNNarch, 'GCNN_en1')
parser.add_argument('-train',type=str,help="-train dataset.csv path")
parser.add_argument('-run',type=str,help="-run dataset.csv path")
parser.add_argument('-model',type=str,help='-model model\'s path')
parser.add_argument('-iterations',type=int,help='-iteration number of epoches')
parser.add_argument('-finetune',type=str,help='-finetune base-model path')
args = parser.parse_args()
print(args)
#Assembling Net:
buildNet()
#data loading:
# -run takes precedence over -train when both are given.
# NOTE(review): if neither is given file_name is None and open() fails.
file_name = args.run if args.run is not None else args.train
print("Loading data...",end="")
# NOTE(review): this handle is not closed in the code visible here.
d = open(file_name,'r')
data,labels = util.loadData(d)
data = util.reduceMatRows(data)
labels,m1,m2 =util.reduceVector(labels,getVal=True)
print("{} chunk loaded!\n".format(len(labels)),end="")
if args.run is not None:
    #Loading weights
    w_name = args.model
    net.load_weights(w_name)
    epochs = "run"
    print("Starting main loop...")
    hip = 0
    reals,preds = [],[]
    # Iterate over the last 40 rows of data.
    for i in range(len(data)-40,len(data)):
        x = np.array(data[i]).reshape(1,12)
if (i + 1) % 20 == 0: pred = np.concatenate(train_pred, axis=0) label = np.concatenate(train_label, axis=0) train_mae, train_rmse, train_mape, b = util.metric(pred, label) print("[epoch %d][%d/%d] loss: %.4f mae: %.4f rmse: %.4f " % (epoch, i + 1, len(train_loader), loss.item(), train_mae, train_rmse)) train_mae, train_rmse, train_mape, b = util.metric(train_pred, train_label) return train_rmse, sum(epoch_loss) if __name__ == '__main__': train_, val_, test_, test_time, A, mean, std, index = util.loadData(args) print(args) #train_loader train_loader = DataLoader( dataset=train_, batch_size=args.batch_size, ) val_loader = DataLoader( dataset=val_, batch_size=args.batch_size, ) test_loader = DataLoader( dataset=test_,
def train(
    model,
    optimizer : optim,
    word_vocab: Vocabulary,
    char_vocab: Vocabulary,
    tag_vocab : Vocabulary,
    args,
):
    """Train ``model`` for ``args.epoch_size`` epochs, evaluating on dev data.

    Each epoch visits the training sentences in a fresh random order in
    minibatches of ``args.batch_size``. After every epoch precision, recall
    and F1 are computed on the dev set with ``calcAccuracy``; the state dict
    is saved to ``args.model_path`` (when set) whenever F1 improves, and the
    best F1 so far is kept in ``model.f1``.

    Args:
        model: Callable returning the loss for a minibatch; must expose
            ``_device``, ``clipping`` and ``lr``.
        optimizer: Optimizer stepped once per minibatch.
        word_vocab, char_vocab, tag_vocab: Vocabularies passed on to
            ``loadData``, ``getMinibatch``, the model and ``calcAccuracy``.
        args: Namespace providing ``epoch_size``, ``batch_size``,
            ``train_path``, ``dev_path``, ``delimiter`` and ``model_path``.
    """
    epoch_size = args.epoch_size
    batch_size = args.batch_size
    train_words, train_tags = loadData(args.train_path, word_vocab, tag_vocab, args.delimiter)
    dev_words, dev_tags = loadData(args.dev_path, word_vocab, tag_vocab, args.delimiter)
    train_size = len(train_words)
    device = torch.device(model._device)
    print(model)

    # -1 so that the first dev F1 counts as an improvement.
    model.f1 = -1
    p_all = []
    r_all = []
    f1_all = []
    for epoch in range(1, epoch_size + 1):
        # New random visiting order for every epoch.
        indexes = np.random.permutation(train_size)
        epoch_loss = 0.
        model.train()
        for batch_i in range(0, train_size, batch_size):
            idx = indexes[batch_i:batch_i + batch_size]
            # prepare minibatch
            batch_words, batch_word_lens, batch_word_mask, batch_chars, batch_char_lens, batch_tags = getMinibatch(
                [train_words[i] for i in idx],
                [train_tags[i] for i in idx],
                word_vocab,
                char_vocab,
                tag_vocab,
                device,
            )
            """
            for i, words in enumerate(batch_words.tolist()[:3]):
                print(" ".join(word_vocab.toTokens(words)[:batch_word_lens[i]]))
            cs = [[] for _ in range(3)]
            for j in range(len(batch_chars)):
                for i in range(3):
                    cs[i] += ["".join(char_vocab.toTokens(batch_chars[j][i].tolist())[:batch_char_lens[j][i]])]
            for i in range(3):
                print(" ".join(cs[i]))
            """
            f_start = time.time()
            # forward
            loss = model(
                batch_words,
                batch_chars,
                batch_tags,
                tag_vocab,
                batch_word_mask,
                batch_word_lens,
                batch_char_lens,
            )
            f_end = time.time()
            b_start = time.time()
            # backward and update parameters
            optimizer.zero_grad()
            loss.backward()
            # Clip the gradient norm only when a positive threshold is set.
            if model.clipping is not None and model.clipping > 0:
                nn.utils.clip_grad_norm_(model.parameters(), model.clipping)
            optimizer.step()
            b_end = time.time()
            epoch_loss += loss.tolist()
            print("epoch: {:>3d}, batch: {:>4d}, loss: {:10.4f}, forward: {: 2.2f}, backward: {: 2.2f}".format(
                epoch,
                batch_i // batch_size + 1,
                loss.tolist(),
                f_end - f_start,
                b_end - b_start,
            ))
        print("finished epoch: {}, epoch loss: {}\n".format(
            epoch,
            epoch_loss,
        ))
        model.eval()
        # calc accuracy on dev
        p, r, f1 = calcAccuracy(
            model,
            word_vocab,
            char_vocab,
            tag_vocab,
            dev_words,
            dev_tags,
        )
        p_all += [p]
        r_all += [r]
        f1_all += [f1]
        # if the optimizer is SGD,
        # then scheduling the initial learning rate by
        # lr = initial_lr / ( 1.0 + 0.05 * epoch_number )
        if "SGD" in optimizer.__str__():
            lr = model.lr / (1.0 + 0.05 * epoch)
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr
            print("set the learning rate of {}\n".format(lr))
        # save the model parameters
        if model.f1 < f1 and args.model_path is not None:
            torch.save(
                model.state_dict(),
                args.model_path,
            )
        model.f1 = max(model.f1, f1)
        print("best f1-score: {}".format(model.f1))
    """
def main(dataFile):
    """Fit and score an RBF SVM on shrunken depth images.

    Loads colour/depth features, labels and person ids from ``dataFile``,
    shuffles them with a fixed seed, builds several feature variants, splits
    each with ``util.leaveOnePersonOut`` and -- in the code that is currently
    active -- selects RBF parameters for the shrunken-depth variant, fits an
    ``SVC`` and prints its accuracy. The experiments on the other variants
    are commented out.
    """
    color, depth, labels, people = util.loadData(dataFile)
    # Draw the first depth row as a 128x128 image.
    plt.imshow(np.reshape(depth[0, :], (128, 128)))
    n, colorFeatures = color.shape
    _, depthFeatures = depth.shape
    # Fixed random_state keeps the shuffle reproducible.
    color, depth, labels, people = shuffle(color, depth, labels, people, random_state=0)
    XColorAndDepth = np.concatenate((color, depth), axis=1)
    XDepth = depth
    XColor = color
    XShrunkDepth = util.resizeImages(128, 0.2, depth)
    print XShrunkDepth.shape
    y = labels

    # One split per feature variant.
    XTrainColor, XTestColor, yTrainColor, yTestColor, peopleTrainColor, peopleTestColor = util.leaveOnePersonOut(
        3, XColor, y, people)
    XTrainDepth, XTestDepth, yTrainDepth, yTestDepth, peopleTrainDepth, peopleTestDepth = util.leaveOnePersonOut(
        3, XDepth, y, people)
    XTrainColorAndDepth, XTestColorAndDepth, yTrainColorAndDepth, yTestColorAndDepth, peopleTrainColorAndDepth, peopleTestColorAndDepth = util.leaveOnePersonOut(
        3, XColorAndDepth, y, people)
    XTrainShrunkDepth, XTestShrunkDepth, yTrainShrunkDepth, yTestShrunkDepth, peopleTrainShrunkDepth, peopleTestShrunkDepth = util.leaveOnePersonOut(
        3, XShrunkDepth, y, people)

    # print "Selecting linear parameters for just color"
    # c = selectParamLinear(XTrainColor, yTrainColor, peopleTrainColor)
    # clf = SVC(kernel='linear', C=c)
    # clf.fit(XTrainColor, yTrainColor)
    # yPred = clf.predict(XTestColor)
    # score = metrics.accuracy_score(yTestColor, yPred)
    # print "Selected C = " + str(c) + ", accuracy = " + str(score)

    # print "Selecting linear parameters for just depth"
    # c = selectParamLinear(XTrainDepth, yTrainDepth, peopleTrainDepth)
    # clf = SVC(kernel='linear', C=c)
    # clf.fit(XTrainDepth, yTrainDepth)
    # yPred = clf.predict(XTestDepth)
    # score = metrics.accuracy_score(yTestDepth, yPred)
    # print "Selected C = " + str(c) + ", accuracy = " + str(score)

    # print "Selecting linear parameters for color and depth"
    # c = selectParamLinear(XTrainColorAndDepth, yTrainColorAndDepth, peopleTrainColorAndDepth)
    # clf = SVC(kernel='linear', C=c)
    # clf.fit(XTrainColorAndDepth, yTrainColorAndDepth)
    # yPred = clf.predict(XTestColorAndDepth)
    # score = metrics.accuracy_score(yTestColorAndDepth, yPred)
    # print "Selected C = " + str(c) + ", accuracy = " + str(score)

    # print "Selecting rbf parameters for just color"
    # gamma, c = selectParamRBF(XTrainColor, yTrainColor, peopleTrainColor)
    # clf = SVC(kernel='rbf', C=c, gamma=gamma)
    # clf.fit(XTrainColor, yTrainColor)
    # yPred = clf.predict(XTestColor)
    # score = metrics.accuracy_score(yTestColor, yPred)
    # print "Selected C = " + str(c) + ", gamma = " + str(gamma) + ", accuracy = " + str(score)

    # print "Selecting rbf parameters for just depth"
    # gamma, c = selectParamRBF(XTrainDepth, yTrainDepth, peopleTrainDepth)
    # clf = SVC(kernel='rbf', C=c, gamma=gamma)
    # clf.fit(XTrainDepth, yTrainDepth)
    # yPred = clf.predict(XTestDepth)
    # score = metrics.accuracy_score(yTestDepth, yPred)
    # print "Selected C = " + str(c) + ", gamma = " + str(gamma) + ", accuracy = " + str(score)

    # Active experiment: RBF SVM on the shrunken depth images.
    print "Selecting rbf parameters for just shrunken depth"
    gamma, c = selectParamRBF(XTrainShrunkDepth, yTrainShrunkDepth,
                              peopleTrainShrunkDepth)
    clf = SVC(kernel='rbf', C=c, gamma=gamma)
    clf.fit(XTrainShrunkDepth, yTrainShrunkDepth)
    yPred = clf.predict(XTestShrunkDepth)
    score = metrics.accuracy_score(yTestShrunkDepth, yPred)
    print "Selected C = " + str(c) + ", gamma = " + str(
        gamma) + ", accuracy = " + str(score)
else: print(" /**testation info**/") print("----avarage test loss:", self.test_loss) print("PW:") print("----avarage accuracy:", self.test_accuracy_pw) # print("----avarage f1-Score of N:", self.test_f1_pw[0]) print("----avarage f1-Score of B:", self.test_f1_pw[1]) print("PPH:") print("----avarage accuracy :", self.test_accuracy_pph) # print("----avarage f1-Score of N:", self.test_f1_pph[0]) print("----avarage f1-Score of B:", self.test_f1_pph[1]) # print("IPH:") # print("----avarage accuracy:", self.test_accuracy_iph) # print("----avarage f1-Score of N:", self.test_f1_1_iph) # print("----avarage f1-Score of B:", self.test_f1_2_iph) # train && test if __name__ == "__main__": # 读数据 print("Loading Data...") X_train, y_train, len_train, pos_train, length_train, position_train, \ X_valid, y_valid, len_valid, pos_valid, length_valid, position_valid, \ X_test, y_test, len_test, pos_test, length_test, position_test=util.loadData() # print("Run Model...\n\n\n") model = BiLSTM() model.fit(X_train, y_train, len_train, pos_train, length_train, position_train, X_valid, y_valid, len_valid, pos_valid, length_valid, position_valid, X_test, y_test, len_test, pos_test, length_test, position_test, "test", False)
def SGD(self, X, Y, startLearningRate, miniBatchFraction, epoch, keepProb):
    """
    Train the model with mini-batch gradient descent.

    Parameters
    ----------
    X : np.array
        Independent variables (features).
    Y : np.array
        Dependent variables (labels).
    startLearningRate : float
        Initial learning rate; it is multiplied by 0.96 every 1000
        optimizer steps (staircase exponential decay).
    miniBatchFraction : float
        Fraction of the rows of X used per mini-batch. The batch size is
        ``int(X.shape[0] * miniBatchFraction)`` and each epoch runs
        ``ceil(1 / miniBatchFraction)`` batches.
    epoch : int
        Number of passes of the outer training loop.
    keepProb : float
        Value fed to ``self.keepProb`` during training (1.0 is fed when the
        summary is evaluated).

    Returns
    -------
    self
    """
    summary = tf.summary.merge_all()
    # Counter advanced by the optimizer; it drives the learning-rate decay.
    trainStep = tf.Variable(0)
    learningRate = tf.train.exponential_decay(startLearningRate,
                                              trainStep,
                                              1000,
                                              0.96,
                                              staircase=True)
    method = tf.train.GradientDescentOptimizer(learningRate)
    optimizer = method.minimize(self.loss, global_step=trainStep)
    batchSize = int(X.shape[0] * miniBatchFraction)
    batchNum = int(np.ceil(1 / miniBatchFraction))
    sess = tf.Session()
    # Kept on the instance so predict_proba can reuse the session.
    self.sess = sess
    init = tf.global_variables_initializer()
    sess.run(init)
    summary_writer = tf.summary.FileWriter(self.logPath,
                                           graph=tf.get_default_graph())
    step = 0
    while (step < epoch):
        for i in range(batchNum):
            batchX = X[i * batchSize:(i + 1) * batchSize]
            batchY = Y[i * batchSize:(i + 1) * batchSize]
            sess.run(
                [optimizer],
                feed_dict={
                    self.input: batchX,
                    self.label: batchY,
                    self.keepProb: keepProb
                })
        step += 1
        # Evaluate the model and write the logs to file.
        self.evaluation(step)
        summary_str = sess.run(summary,
                               feed_dict={
                                   self.input: X,
                                   self.label: Y,
                                   self.keepProb: 1.0
                               })
        summary_writer.add_summary(summary_str, step)
        summary_writer.flush()
    return self


def fit(self,
        startLearningRate=0.1,
        miniBatchFraction=0.01,
        epoch=200,
        keepProb=0.7):
    """
    Train the model.

    Creates the input, label and keep-probability placeholders from the
    training set, builds the network and the loss, then runs ``SGD`` with
    the given hyper-parameters (see ``SGD`` for their meaning).
    """
    X = self.trainSet["X"]
    Y = self.trainSet["Y"]
    self.input = tf.placeholder(tf.float32,
                                shape=[None, X.shape[1]],
                                name="X")
    self.label = tf.placeholder(tf.int64,
                                shape=[None, self.size[-1]],
                                name="Y")
    self.keepProb = tf.placeholder(tf.float32)
    self.defineANN()
    self.defineLoss()
    self.SGD(X, Y, startLearningRate, miniBatchFraction, epoch, keepProb)


def predict_proba(self, X):
    """
    Use the neural network to predict on unseen data.

    Parameters
    ----------
    X : np.array
        Inputs fed to ``self.input``.

    Returns
    -------
    The softmax of ``self.out`` evaluated for X with the keep probability
    set to 1.0. Note that a new softmax op is added to the graph on every
    call.
    """
    sess = self.sess
    pred = tf.nn.softmax(logits=self.out, name="pred")
    prob = sess.run(pred, feed_dict={
        self.input: X,
        self.keepProb: 1.0
    })
    return prob


if __name__ == "__main__":
    data = loadData()
    # Hold out 30 % of data[0]/data[1] for validation (fixed seed).
    trainData, validationData, trainLabel, validationLabel = train_test_split(
        data[0], data[1], test_size=0.3, random_state=1001)
    trainSet = {"X": trainData, "Y": trainLabel}
    validationSet = {"X": validationData, "Y": validationLabel}
    testSet = {"X": data[2], "Y": data[3]}
    # Storage paths differ between Windows and Linux.
    if os.name == "nt":
        ann = ANN([30, 20, 10], "logs\\mnist", trainSet, validationSet,
                  testSet)
    else:
        ann = ANN([30, 20, 10], "logs/mnist", trainSet, validationSet,
                  testSet)
    ann.fit()
from util import loadData
from biterm import Biterm
import numpy as np
import preprocess, time, pickle

# Script: fit a Biterm topic model on the processed tweets, print the
# elapsed fitting time and the topics, then pickle the fitted model.

#file_name = '../Data/testdata.manualSUBSET.2009.06.14.csv'
#file_name = '../Data/training.1600000.processed.noemoticon.csv'
file_name = '../Data/train-PROCESSED-FINAL.csv'
#file_name = '../Data/train-PROCESSED.csv'

tweets = loadData(file_name)
tweets = preprocess.splitWords(tweets)

number_of_topics = 100
a = 100.0/number_of_topics
b = 0.001
max_iter = 200
mdl = 16  # also used in the name of the output pickle

bt = Biterm(a,b,number_of_topics,max_iter,mdl)
start = time.time()
bt.fit(tweets)
end = time.time()
# Single-argument print() behaves the same on Python 2 and Python 3.
print(end - start)
bt.showTopics(10)
[phi, theta] = bt.getParams()

file_name = "model" + str(mdl) + ".pkl"
# Close the file deterministically so the pickle is fully flushed to disk.
with open(file_name, "wb") as model_file:
    pickle.dump(bt, model_file)
data = [] with open(path, 'r', encoding='utf-8') as file: for line in file: line = line.strip() line = eval(line) line = [str(i) for i in line] data.append(line) return data def loadDevData(path): data = loadData(path) ret = [] for sub in data: ret.append(list(sub['text'])) return ret if __name__ == '__main__': labels = ['O', 'B-ment', 'I-ment'] data = loadData('data/train_pre.json') with open('data/train_text.txt', 'w', encoding='utf-8') as file: for line in tqdm(data): line = reduce(line, labels) file.write('\n'.join(line)) file.write('\n\n') # train_x, train_y = get_train_data('data/train_text.txt') # print(train_x) # print(train_y) # print(predict_reduce(train_x[0], train_y[0],labels))
from util import loadData
from biterm import Biterm
import numpy as np
import preprocess, time, pickle, sys
import unicodecsv as csv

# Script: preprocess the aggregated tweets and write a copy of the CSV in
# which the last field of every kept row is the space-joined processed
# tweet. Rows whose index is missing from the preprocessing result are
# dropped.

#file_name = '../Data/testdata.manualSUBSET.2009.06.14.csv'
# file_name = '../Data/training.1600000.processed.noemoticon.csv'
file_name = '../Data/train-AGGREGATED.csv'
out_file_name = '../Data/train-PROCESSED-FINAL.csv'

tweets = loadData(file_name)
# An optional first command-line argument limits the number of tweets.
if len(sys.argv) > 1:
    tweets = tweets[:int(sys.argv[1])]
# tweets = [tweets[i] for i in np.random.permutation(len(tweets))]

print('Starting preprocessing')
tweets_dict = preprocess.preprocess(tweets)
max_idx = max(tweets_dict)

with open(file_name, 'rb') as csvfile, open(out_file_name, 'wb') as outfile:
    reader = csv.reader(csvfile)
    writer = csv.writer(outfile)
    for i, row in enumerate(reader):
        # Nothing beyond the largest processed index can be kept.
        if i > max_idx:
            break
        if i not in tweets_dict:
            continue
        row[-1] = ' '.join(tweets_dict[i])
        writer.writerow(row)
def loadDevData(path):
    """Return the 'text' of every record loaded from *path* as a list of characters."""
    return [list(record['text']) for record in loadData(path)]
shape=[None, Y.shape[1]], name="Y") self.keepProb = tf.placeholder(tf.float32) self.defineCNN() self.defineLoss() self.SGD(X, Y, startLearningRate, miniBatchFraction, epoch, keepProb) def predict_proba(self, X): """ 使用神经网络对未知数据进行预测 """ sess = self.sess pred = tf.nn.softmax(logits=self.out, name="pred") prob = sess.run(pred, feed_dict={self.input: X, self.keepProb: 1.0}) return prob if __name__ == "__main__": data = loadData() trainData, validationData, trainLabel, validationLabel = train_test_split( data[0], data[1], test_size=0.3, random_state=1001) trainSet = {"X": trainData, "Y": trainLabel} validationSet = {"X": validationData, "Y": validationLabel} testSet = {"X": data[2], "Y": data[3]} # Windows下的存储路径与Linux并不相同 if os.name == "nt": ann = CNN("logs\\mnist_cnn", trainSet, validationSet, testSet) else: ann = CNN("logs/mnist_cnn", trainSet, validationSet, testSet) ann.fit()
def select_train(train_name): if train_name in ('1NN', '3NN', '5NN'): k = int(train_name[0]) return train_knn, k elif train_name == 'SVM': return train_svm, 0 # 0表示KinKNN无效 elif train_name == 'J48': return train_tree, 0 # 根据最优特征子集校验计算准确率准确率和维度缩减率 def check(trainX, trainY, predictX, predictY, optimal_feature_subset, feature, trainSelect, KinKNN): feature_list = numtofea(optimal_feature_subset, feature) data_sample = read_data_fea(feature_list, trainX) data_predict = read_data_fea(feature_list, predictX) accuracy = trainSelect(data_sample, trainY, data_predict, predictY, KinKNN) return accuracy if __name__ == '__main__': trainX, trainY, predictX, predictY, loop_condition, initialization_parameters = util.loadData('heart', 1, 2) num_fea_original = mat(trainX).shape[1] # 特征长度 feature = [] # 特征集合索引,特征集合的角标 trainName = 'J48' trainSelect = select_train(trainName) for i in range(num_fea_original): feature.append(i) accuracy = check(trainX, trainY, predictX, predictY, [1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0], feature, trainSelect, 1) print trainName + '验证准确率:', accuracy