Example #1
    def load_embeddings(self, embeddings_name):
        # load pre-trained embeddings (requires os and gensim;
        # populate_embeddings is sketched below)
        embeddings_dir = os.path.dirname(os.path.realpath(__file__)) +\
                         "/../embeddings/"
        pretrained = gensim.models.Word2Vec.load(embeddings_dir +
                                                 embeddings_name)
        print "emb shape", pretrained[pretrained.index2word[0]].shape
        # print pretrained[0].shape
        # assign the pretrained vectors and fill in the gaps
        emb = populate_embeddings(self.args.emb_dimension,
                                  len(self.word_to_index_map),
                                  self.word_to_index_map, pretrained)
        self.model.load_weights(emb=emb)
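
The populate_embeddings helper is not shown on this page. Below is a minimal sketch of what it presumably does, given its call sites here: build a (vocab_size, dimension) matrix, copy vectors from the pretrained gensim model where the word is known, and randomly initialise the gaps. The signature is taken from the calls above; the initialisation scheme, and the assumption that the pretrained vectors have the requested dimension, are guesses.

import numpy as np


def populate_embeddings(dimension, vocab_size, word_to_index_map, pretrained):
    # hypothetical sketch: random init, then overwrite rows for known words;
    # assumes the pretrained vectors have the requested dimension
    emb = 0.2 * np.random.uniform(-1.0, 1.0, (vocab_size, dimension))
    for word, index in word_to_index_map.items():
        if word in pretrained:  # old gensim Word2Vec supports membership tests
            emb[index] = pretrained[word]
    return emb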
Example #2
import os
import time
import random
from collections import defaultdict
from copy import deepcopy

import cPickle
import numpy as np
import gensim
import theano.tensor as T

# NB: the project-specific helpers used below (switchboard_data,
# load_increco_data_from_file, load_data_from_timings_file,
# load_data_from_array, populate_embeddings, Elman, LSTM, FirstOrderHMM,
# save_predictions_and_quick_eval) are assumed to be imported from the
# surrounding package.


def run_experiment(args):
    #turn the parsed args (an argparse Namespace) into a plain dict
    s = dict(args._get_kwargs())
    print s

    if s['acoustic']:
        s['acoustic'] = 350  #dimension of acoustic features vector (essentially 7 * 50)

    pre_load_training = True  # TODO: the training data is large, so pre-loading it is only feasible on a big machine (e.g. Bender); set to True there

    print "loading data and tag sets"  #NB Don't get the tag sets here directly anymore
    _, _, _, train_dict = switchboard_data(train_data=s['train_data'],
                                           tags=s['tags'])

    #get the relevant dictionaries (inverting the index maps)
    idx2label = dict((v, k) for k, v in train_dict['labels2idx'].iteritems())
    #first half (28) the same as the test
    idx2word = dict((v, k) for k, v in train_dict['words2idx'].iteritems())
    if train_dict.get('pos2idx') is not None:
        idx2pos = dict((v, k) for k, v in train_dict['pos2idx'].iteritems())

    range_dir = "../data/disfluency_detection/swda_divisions_disfluency_detection/"
    range_files = [
        range_dir + "SWDisfTrainWithAudio_ranges.text",
        #range_dir + "SWDisfHeldout_ranges.text", #for testing locally
        range_dir + "SWDisfHeldout_ranges.text",
        range_dir + "SWDisfTest_ranges.text",
        range_dir + "SWDisfHeldoutASR_ranges.text",
        range_dir + "SWDisfTestASR_ranges.text"
    ]

    #get the filenames for the train/heldout/test division;
    #we will shuffle the train dialogues, keep heldout and test non-scrambled
    data = defaultdict(list)
    divisions = ["heldout", "test", "heldout_asr", "test_asr"]
    if not s['use_saved_model']:
        divisions = ["train", "heldout", "test"]

    # NB: with a saved model, divisions and range_files are misaligned by this
    # zip, but file_ranges is never used on that path, so it is harmless
    for key, rangefile in zip(divisions, range_files):
        file_ranges = [line.strip("\n") for line in open(rangefile)]
        if s['use_saved_model']:
            if "train" in key: continue
            if "asr" in key:
                corpus = key[:key.find("_")]
                corpus = corpus[0].upper() + corpus[1:]
                increco_file = "../data/asr_results/SWDisf{}_pos_increco.text".format(
                    corpus)
                data[key] = load_increco_data_from_file(
                    increco_file, train_dict['words2idx'],
                    train_dict['pos2idx'])

            else:
                #corpus = corpus[0].upper() + corpus[1:]
                timings_file = "../data/disfluency_detection/switchboard/swbd_{}_partial_timings_data.csv".format(
                    key)
                data[key] = load_data_from_timings_file(
                    timings_file, train_dict['words2idx'],
                    train_dict['pos2idx'])
                #print filter(lambda x : x[0] == "4649A",[ data[key][i] for i in range(0, len(data[key]))])[0]
            continue
        #if not using saved model load from pickles
        for f in file_ranges:
            for part in ["A", "B"]:
                dialogue_speaker_data = "../data/audio/" + f + part + ".npy"
                #if we're pre-loading the data, which we do anyway for
                #test/heldout, and also for training on a big machine
                if key != "train" or pre_load_training:
                    print "loading", dialogue_speaker_data
                    dialogue_speaker_data = np.load(dialogue_speaker_data)
                    dialogue_speaker_data = load_data_from_array(
                        dialogue_speaker_data,
                        n_acoust=s['acoustic'],
                        cs=s['window'],
                        tags=s["tags"],
                        # NB: full_idx2label is assumed to be defined at
                        # module level; it is not shown in this excerpt
                        full_idx_to_label_dict=full_idx2label,
                        idx_to_label=idx2label)
                #tuple of name + data (or file location if not pre-loaded)
                data[key].append((f + part, dialogue_speaker_data))

    vocsize = len(train_dict['words2idx'])
    nclasses = len(train_dict['labels2idx'])
    #nsentences = len(train_lex)
    possize = None
    if train_dict.get('pos2idx') is not None:
        possize = len(idx2pos)
    #nwords = len(list(itertools.chain(*train_y)))

    print str(nclasses) + " training classes"
    print str(vocsize) + " words in vocab"
    if possize is not None:
        print str(possize) + " pos tags in vocab"
    #print str(nsentences) + " training sequences"
    na = 0
    if s['acoustic']:
        print "with acoustic data..."
        na = s['acoustic']

    print "instantiating model " + s['model'] + "..."
    np.random.seed(s['seed'])
    rnn = None
    random.seed(s['seed'])

    if s['model'] == 'elman':
        rnn = Elman(ne=vocsize,
                    de=s['emb_dimension'],
                    nh=s['nhidden'],
                    na=na,
                    n_out=nclasses,
                    cs=s['window'],
                    npos=possize,
                    update_embeddings=s['update_embeddings'])
    elif s['model'] == 'lstm':
        rnn = LSTM(ne=vocsize,
                   de=s['emb_dimension'],
                   n_lstm=s['nhidden'],
                   na=na,
                   n_out=nclasses,
                   cs=s['window'],
                   npos=possize,
                   lr=s['lr'],
                   single_output=True,
                   output_activation=T.nnet.softmax,
                   cost_function='nll')

    if s['decoder_file']:
        print "instantiating hmm decoder..."
        #add the interregnum tag (not predicted by the rnn, derived from context)
        hmm_dict = deepcopy(train_dict['labels2idx'])
        intereg_ind = len(hmm_dict)
        hmm_dict["<i/><cc>"] = intereg_ind  #add the interregnum tag
        print "loading timing model"
        with open('../decoder/LogReg_balanced_timing_classifier.pkl',
                  'rb') as fid:
            timing_model = cPickle.load(fid)
        with open('../decoder/LogReg_balanced_timing_scaler.pkl', 'rb') as fid:
            timing_model_scaler = cPickle.load(fid)
        hmm = FirstOrderHMM(hmm_dict,
                            rnn=rnn,
                            markov_model_file=s['tags'],
                            timing_model=timing_model,
                            timing_model_scaler=timing_model_scaler)
    else:
        hmm = None

    if s['embeddings']:
        print "loading embeddings..."
        pretrained = gensim.models.Word2Vec.load(
            "../embeddings/" + s['embeddings'])  # load pre-trained embeddings
        print pretrained[pretrained.index2word[0]].shape
        #print pretrained[0].shape
        emb = populate_embeddings(s['emb_dimension'], vocsize,
                                  train_dict['words2idx'],
                                  pretrained)  #assign and fill in the gaps
        rnn.load_weights(emb)

    #make folder for the whole experiment
    folder = os.path.basename(__file__).split('.')[0].replace(
        "run_experiment", "") + s['exp_id']
    #folder  = "/home/dsg-labuser/Desktop/rnn_experiments/"+ s['exp_id']
    if os.path.exists(folder):
        if not s['use_saved_model']:
            answer = raw_input(
                'Overwrite contents of folder for experiment {}? [y][n] '
                .format(s['exp_id']))
            if answer != "y": return
    else:
        os.mkdir(folder)

    #make results files
    if not s['use_saved_model']:
        resultsFile = open("results/learning_curves/" + s['exp_id'] + ".csv",
                           "w")
        resultsFile.write(
            "epoch,heldout_loss,heldout_class_loss,val_tags_f,heldout_f_rmtto,heldout_frm,heldout_tto1,heldout_tto2,test_loss,test_class_loss,test_tags_f,test_f_rmtto,test_frm,test_tto1,test_tto2,train_loss\n"
        )
        summariesFile = open("results/tag_accuracies/" + s['exp_id'] + ".text",
                             "w")

    #set current learning rate as the initial learning rate
    s['clr'] = s['lr']
    s['best_epoch'] = 0
    best_f1 = -np.inf  #so the first epoch always counts as an improvement

    print "training..."
    start = 1
    end = s['nepochs']
    if s['use_saved_model']:
        start = s['use_saved_model']  #start from the epoch of the stored model
        end = start
    for e in range(start, end + 1):

        s['current_epoch'] = e
        tic = time.time()

        epochfolder = folder + "/epoch_" + str(e)  #folder to store this epoch
        if not os.path.exists(epochfolder) and not s['use_saved_model']:
            os.mkdir(epochfolder)

        if s['use_saved_model']:
            print "loading stored model and weights- not actually training from " + epochfolder
            rnn.load_weights_from_folder(epochfolder)
            #if s['model'] == "lstm":
            #    rnn.load_weights_from_folder(epochfolder)
            #else:
            #    emb, Wx, Wh, W, bh, b, h0  = rnn.load(epochfolder)
            #    rnn.load_weights(emb, Wx, Wh, W, bh, b, h0)
        else:
            if s['pos']:
                train_loss = rnn.fit(data['train'],
                                     s['clr'],
                                     acoustic=args.acoustic,
                                     load_data=not pre_load_training)
            else:
                #train_loss would be unbound when writing results, so fail fast
                raise NotImplementedError(
                    "training without POS tags is not supported")

        if s['verbose']:  # output this epoch's training time
            print '[learning] epoch %i >> completed in %.2f (sec) <<\r' % (
                e, time.time() - tic),

        print "saving predictions and evaluating tags..."

        results = {}
        for corpus in ['heldout', 'test']:  #nb during training, just heldout and test
            #if not corpus == 'heldout': continue
            print corpus
            predictions_file = epochfolder + '/predictions_{}.csv'.format(
                corpus)
            incremental_eval = False
            if s['use_saved_model']:
                predictions_file = epochfolder + '/predictions_inc_{}.csv'.format(
                    corpus)
                incremental_eval = True
            corpus_results = save_predictions_and_quick_eval(
                predictions_filename=predictions_file,
                groundtruth_filename=s['{}_file'.format(
                    corpus.replace("_asr", ""))],
                model=rnn,
                hmm=hmm,
                dialogues=data[corpus],
                idx_to_label_dict=idx2label,
                idx_to_word_dict=idx2word,
                incremental_eval=incremental_eval,
                s=s,
                increco_style="_asr" in corpus or incremental_eval)
            for key, val in corpus_results.items():
                results[corpus + '_' + key] = val
        if s['use_saved_model']:
            #when using a saved model we assume a single eval on that epoch,
            #so evaluate it fully incrementally and do the error analysis
            return {}

        #NB for now not bothering with heldout/test loss
        #results['heldout_loss'] = 100.0
        #results['test_loss'] = 100.0

        print "saving epoch folder and writing results to file"
        rnn.save(epochfolder)  #Epoch file dump

        coi = 'rmtto'  #class of interest

        resultsFile.write(",".join(str(x) for x in [
            e,
            results['heldout_loss'],
            results['heldout_class_loss'],
            results['heldout_f1_tags'],
            results['heldout_f1_' + coi],
            results['heldout_f1_rm'],
            results['heldout_f1_tto1'],
            results['heldout_f1_tto2'],
            results['test_loss'],
            results['test_class_loss'],
            results['test_f1_tags'],
            results['test_f1_' + coi],
            results['test_f1_rm'],
            results['test_f1_tto1'],
            results['test_f1_tto2'],
            train_loss]) + "\n")
        resultsFile.flush()
        summariesFile.write(
            str(e) + "\n" + results['heldout_tag_summary'] + "\n" +
            results['test_tag_summary'] + "\n%%%%%%%%%%\n")
        summariesFile.flush()

        #check to see if it beats the current best on the class of interest
        #stopping criterion
        if results['heldout_f1_' + coi] > best_f1:
            rnn.save(folder)
            best_f1 = results['heldout_f1_' + coi]
            if s['verbose']:
                print 'NEW BEST raw labels at epoch', e, 'best valid', best_f1
                print 'NEW BEST: epoch', e, \
                    'heldout F1', results['heldout_f1_' + coi], \
                    'best test F1', results['test_f1_' + coi], ' ' * 20
            s['vf1'] = results['heldout_f1_' + coi]
            s['tf1'] = results['test_f1_' + coi]
            s['best_epoch'] = e
            #moves this to the main folder, will have the HMM generated predictions
            #subprocess.call(['mv', epochfolder + '/test.txt', folder + '/best_test.txt'])
            #subprocess.call(['mv', epochfolder + '/valid.txt', folder + '/best_valid.txt'])

        # stopping criterion: stop if no improvement in 10 epochs
        if s['current_epoch'] - s['best_epoch'] >= 10:
            print "stopping, no improvement in 10 epochs"
            break
        #decay
        if s['decay'] and abs(s['best_epoch'] - s['current_epoch']) >= 2:
            s['clr'] *= 0.85  #steady decay when not improving for 2 epochs; arguably another hyperparameter
            print "learning rate decayed, now ", s['clr']
        if s['clr'] < 1e-5:
            print "stopping, below learning rate threshold"
            break
        if s['verbose']:  # output this epoch's testing time
            print '[learning] epoch %i >> testing in %.2f (sec) <<\r' % (
                e, time.time() - tic),

    print 'BEST RESULT: epoch', s['best_epoch'], 'valid F1', s['vf1'], \
        'best test F1', s['tf1'], 'with the model', folder
    resultsFile.close()
    summariesFile.close()
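
Since run_experiment reads its settings from an argparse-style Namespace (via args._get_kwargs()), a minimal, hypothetical driver might look like the following. The flag names are inferred from the s[...] lookups above; the defaults are placeholders, not the authors' settings.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # flag names inferred from the keys read out of s in run_experiment;
    # defaults here are illustrative only
    parser.add_argument('--exp_id', default='exp_001')
    parser.add_argument('--model', default='lstm', choices=['elman', 'lstm'])
    parser.add_argument('--train_data', default=None)
    parser.add_argument('--tags', default=None)
    parser.add_argument('--emb_dimension', type=int, default=50)
    parser.add_argument('--nhidden', type=int, default=50)
    parser.add_argument('--window', type=int, default=3)
    parser.add_argument('--lr', type=float, default=0.005)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--nepochs', type=int, default=50)
    parser.add_argument('--acoustic', type=int, default=0)
    parser.add_argument('--pos', action='store_true')
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--decay', action='store_true')
    parser.add_argument('--update_embeddings', action='store_true')
    parser.add_argument('--embeddings', default=None)
    parser.add_argument('--decoder_file', default=None)
    parser.add_argument('--use_saved_model', type=int, default=0)
    parser.add_argument('--heldout_file', default=None)
    parser.add_argument('--test_file', default=None)
    run_experiment(parser.parse_args())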