def run():
    xs, ys = data_exploration.get_set('train', data_exploration.load_wiki_vector)
    xs_val, ys_val = data_exploration.get_set(
        'dev', data_exploration.load_wiki_vector)

    EMBEDDING_DIM = 300
    HIDDEN_DIM = 20
    model_lstm = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, 3)  # type: LSTMTagger
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model_lstm.parameters(), lr=0.1)

    # Sanity check: run one sentence through the untrained model.
    with torch.no_grad():
        inputs = torch.tensor(xs[0], dtype=torch.float)
        tag_scores = model_lstm(inputs)
        print(tag_scores)

    train_evaluator = sampler_evaluator(xs, ys, 50)
    val_evaluator = sampler_evaluator(xs_val, ys_val, 50)

    print("len train: ", len(xs))
    s = 0
    losses = []
    for epoch in range(3):
        for e, (sentence, tag) in enumerate(zip(xs, ys)):
            model_lstm.zero_grad()
            tag_scores = model_lstm(torch.tensor(sentence, dtype=torch.float))
            loss = loss_function(tag_scores,
                                 torch.tensor([tag], dtype=torch.long))
            s += loss.item()
            if (e + 1) % 1000 == 0:
                # Report the running average loss over the last 1000 sentences.
                print('\r' + ' ' * 30 +
                      '\rloss ({}/{}): {:05f}'.format(e + 1, len(xs), s / 1000),
                      end='')
                losses.append(s / 1000)
                s = 0
                # if (e + 1) % 50000 == 1000:
                #     tr_ev = train_evaluator(model_lstm)
                #     val_ev = val_evaluator(model_lstm)
                #     print(', train={:05f}, val={:05f}'.format(tr_ev, val_ev))
            loss.backward()
            optimizer.step()
        # Checkpoint the model and the loss curve after every epoch.
        torch.save(model_lstm.state_dict(), dr.data('model_{}.pt'.format(epoch)))
        with open(dr.data('losses.pickle'), 'wb') as f:
            pickle.dump(losses, f)
    print('\n')
    predict(model_lstm)
    print(eval_model(xs_val, ys_val)(model_lstm))
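# The LSTMTagger class is defined elsewhere in the project (not shown here).
# The following is a hypothetical minimal sketch of the interface that run()
# assumes: a model built as LSTMTagger(embedding_dim, hidden_dim, n_classes)
# that takes a (seq_len, embedding_dim) float tensor for one sentence and
# returns a (1, n_classes) tensor of log-probabilities compatible with
# nn.NLLLoss. The actual implementation may differ.
import torch
import torch.nn as nn
import torch.nn.functional as F


class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, tagset_size):
        super().__init__()
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        # nn.LSTM expects (seq_len, batch, input_size); the batch size is 1 here.
        lstm_out, _ = self.lstm(sentence.view(len(sentence), 1, -1))
        # Classify from the last hidden state: one label per sentence.
        return F.log_softmax(self.hidden2tag(lstm_out[-1]), dim=1)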
def main():
    with open(dr.data('losses.pickle'), 'rb') as f:
        losses = pickle.load(f)
    plt.plot(losses)
    plt.xlabel("Number of instances (in thousands)")
    plt.ylabel("Loss")
    plt.title("Loss evaluated on the training dataset")
    plt.savefig(dr.data('losses.png'))
    plt.show()
def load_wiki_vector():
    global wiki_model
    if wiki_model is None:
        pickle_file = dr.data('wiki-news-300d-1M.pickle')
        if not os.path.exists(pickle_file):
            # First run: parse the .vec text file and cache it as a pickle.
            fname = dr.data('wiki-news-300d-1M.vec')
            res = {}
            with open(fname, 'r') as f:
                for l in f:
                    ls = l.split()
                    if len(ls) < 3:
                        # Skip the "<count> <dim>" header line of the .vec file.
                        continue
                    res[ls[0].lower()] = [float(x) for x in ls[1:]]
            with open(pickle_file, 'wb') as f:
                pickle.dump(res, f)
            wiki_model = res
        else:
            with open(pickle_file, 'rb') as f:
                wiki_model = pickle.load(f)
    return wiki_model
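# Usage sketch (hypothetical helper, not part of the original code): the dict
# returned by load_wiki_vector() maps lowercased tokens to 300-dimensional
# lists of floats, so a tokenized sentence can be embedded with a zero-vector
# fallback for out-of-vocabulary words.
def embed_tokens(tokens, embeddings, dim=300):
    zero = [0.0] * dim
    return [embeddings.get(token.lower(), zero) for token in tokens]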
def predict(model):
    par_id, xs = data_exploration.get_test(data_exploration.load_wiki_vector)
    with open(dr.data('result.csv'), 'w') as f:
        f.write('pairID,gold_label\n')
        for pid, x in zip(par_id, xs):
            # Pick the highest-scoring class and map its index back to a label.
            res = data_exploration.d_mun_ot[torch.argmax(
                model(torch.tensor(x, dtype=torch.float))).item()]
            f.write('{},{}\n'.format(pid, res))
    print("Done!")
def get_test(emb_f):
    sentences = []
    with open(dr.data("snli_1.0_test_filtered.jsonl"), 'r') as f:
        for l in f:
            d = json.loads(l)
            sentences.append({
                'pairID': d['pairID'],
                'sentence2': d['sentence2'].lower()
            })
    df = pd.DataFrame(sentences)
    xs = tokenize_sentences(df, emb_f())
    return df.pairID, xs
def read_data(_set='train'):
    filename_csv = dr.data('data_{}.csv'.format(_set))
    if os.path.exists(filename_csv):
        return pd.read_csv(filename_csv)
    else:
        sentences = []
        with open(dr.data('snli_1.0_{}_filtered.jsonl'.format(_set)), 'r') as f:
            for l in f:
                d = json.loads(l)
                sentences.append({
                    'pairID': d['pairID'],
                    'sentence2': d['sentence2'].lower()
                })
        df_sentence = pd.DataFrame(sentences).set_index('pairID')
        df_label = pd.read_csv(
            dr.data('snli_1.0_{}_gold_labels.csv'.format(_set))).set_index(
                'pairID')
        df = df_label.join(df_sentence)
        # Drop rows with a missing sentence2 (NaN != NaN).
        df = df[df.sentence2 == df.sentence2]
        df.to_csv(filename_csv)
        return df
import matplotlib.pyplot as plt
import data_reader
from model import LeNet
import os
from scipy import ndimage
from skimage.transform import resize
from skimage.data import imread
from skimage import color

training_file = 'data/train.p'
validation_file = 'data/valid.p'
testing_file = 'data/test.p'

is_debug = True

data = data_reader.data(training_file, validation_file, testing_file)
data.print_data_info()

# According to the dataset info there are 43 classes of signs.
# Use a dictionary to map each class id to its label.
label_dict = {}
with open('signnames.csv', 'r') as f:
    lines = f.readlines()
    for line in lines[1:]:
        tmp = line.strip('\n')
        tmp = tmp.split(',')
        label_dict[tmp[0]] = tmp[1]
def load_google_vectors():
    return gensim.models.KeyedVectors.load_word2vec_format(
        dr.data('GoogleNews-vectors-negative300.bin'), binary=True)
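# Usage sketch (hypothetical helper, not part of the original code): the object
# returned by load_google_vectors() is a gensim KeyedVectors instance, which
# supports membership tests and indexing by word.
def google_vector(word, vectors=None):
    vectors = vectors if vectors is not None else load_google_vectors()
    # Return None for out-of-vocabulary words instead of raising KeyError.
    return vectors[word] if word in vectors else None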