Example #1
def run():
    xs, ys = data_exploration.get_set('train',
                                      data_exploration.load_wiki_vector)
    xs_val, ys_val = data_exploration.get_set(
        'dev', data_exploration.load_wiki_vector)

    EMBEDDING_DIM = 300
    HIDDEN_DIM = 20

    model_lstm = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, 3)  # type: LSTMTagger
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model_lstm.parameters(), lr=0.1)

    with torch.no_grad():
        inputs = torch.tensor(xs[0], dtype=torch.float)
        tag_scores = model_lstm(inputs)
        print(tag_scores)

    train_evaluator = sampler_evaluator(xs, ys, 50)
    val_evaluator = sampler_evaluator(xs_val, ys_val, 50)

    print("len train: ", len(xs))
    s = 0
    losses = []
    for epoch in range(3):
        for e, (sentence, tag) in enumerate(zip(xs, ys)):
            model_lstm.zero_grad()
            tag_scores = model_lstm(torch.tensor(sentence, dtype=torch.float))
            loss = loss_function(tag_scores,
                                 torch.tensor([tag], dtype=torch.long))
            s += loss.item()
            if (e + 1) % 1000 == 0:
                print(
                    '\r' + ' ' * 30 +
                    '\rloss ({}/{}): {:05f}'.format(e + 1, len(xs), s / 1000),
                    end='')
                losses.append(s / 1000)
                s = 0

            # if (e + 1) % 50000 == 1000:
            #     tr_ev = train_evaluator(model_lstm)
            #     val_ev = val_evaluator(model_lstm)
            #     print(', train={:05f}, val={:05f}'.format(tr_ev, val_ev))
            loss.backward()
            optimizer.step()

        torch.save(model_lstm.state_dict(),
                   dr.data('model_{}.pt'.format(epoch)))
        with open(dr.data('losses.pickle'), 'wb') as f:
            pickle.dump(losses, f)

        print('\n')
        predict(model_lstm)
        print(eval_model(xs_val, ys_val)(model_lstm))
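Example #1 relies on an LSTMTagger class that is not shown. Below is a minimal sketch of a compatible model, assuming it consumes a (seq_len, 300) float tensor of pre-computed embeddings and returns a (1, 3) tensor of log-probabilities, which is what nn.NLLLoss and the torch.argmax call in Example #4 expect; the class name matches the call above, but the internals are an assumption, not the original definition.

import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTMTagger(nn.Module):
    # Assumed interface: input is a (seq_len, embedding_dim) float tensor,
    # output is a (1, num_classes) tensor of log-probabilities.
    def __init__(self, embedding_dim, hidden_dim, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, num_classes)

    def forward(self, sentence):
        # nn.LSTM expects (seq_len, batch, input_size)
        lstm_out, _ = self.lstm(sentence.view(len(sentence), 1, -1))
        # classify the whole sentence from the last hidden state
        tag_space = self.hidden2tag(lstm_out[-1])
        return F.log_softmax(tag_space, dim=1)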
Example #2
def main():
    with open(dr.data('losses.pickle'), 'rb') as f:
        losses = pickle.load(f)

    plt.plot(losses)
    plt.xlabel("Cantidad de instancias (en miles)")
    plt.ylabel("Función de costo")
    plt.title(
        "Evaluación de la función de costo sobre el dataset de entrenamiento")
    plt.savefig(dr.data('losses.png'))
    plt.show()
Example #3
def load_wiki_vector():
    global wiki_model
    if wiki_model is None:
        pickle_file = dr.data('wiki-news-300d-1M.pickle')

        if not os.path.exists(pickle_file):
            fname = dr.data('wiki-news-300d-1M.vec')
            res = {}
            with open(fname, 'r') as f:
                next(f)  # skip the .vec header line ("<count> <dim>")
                for l in f:
                    ls = l.split()
                    res[ls[0].lower()] = [float(x) for x in ls[1:]]
            with open(pickle_file, 'wb') as f:
                pickle.dump(res, f)
            wiki_model = res
        else:
            with open(pickle_file, 'rb') as f:
                wiki_model = pickle.load(f)
    return wiki_model
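The loader returns a plain dict from lowercased token to a 300-float list, so word lookups need a manual fallback for out-of-vocabulary tokens. A short usage sketch (the zero-vector fallback is an assumption, not something the original code specifies):

wiki = load_wiki_vector()
# out-of-vocabulary words fall back to a zero vector here (an assumption)
vector = wiki.get('hello', [0.0] * 300)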
Example #4
def predict(model):
    par_id, xs = data_exploration.get_test(data_exploration.load_wiki_vector)

    with open(dr.data('result.csv'), 'w') as f:
        f.write('pairID,gold_label\n')
        for pid, x in zip(par_id, xs):
            res = data_exploration.d_mun_ot[torch.argmax(
                model(torch.tensor(x, dtype=torch.float))).item()]
            f.write('{},{}\n'.format(pid, res))

    print("Done!")
Example #5
def get_test(emb_f):
    sentences = []
    with open(dr.data("snli_1.0_test_filtered.jsonl"), 'r') as f:
        for l in f:
            d = json.loads(l)
            sentences.append({
                'pairID': d['pairID'],
                'sentence2': d['sentence2'].lower()
            })
    df = pd.DataFrame(sentences)
    xs = tokenize_sentences(df, emb_f())
    return df.pairID, xs
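get_test() delegates to tokenize_sentences, which is not shown either. A minimal sketch under the assumption that it whitespace-tokenizes each sentence2 and keeps the embeddings of in-vocabulary tokens, matching the (seq_len, 300) tensors built in Example #1:

def tokenize_sentences(df, emb):
    # hypothetical reconstruction: one list of embedding vectors per
    # sentence, skipping out-of-vocabulary tokens
    xs = []
    for sentence in df.sentence2:
        xs.append([emb[w] for w in sentence.split() if w in emb])
    return xs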
Example #6
def read_data(_set='train'):
    filename_csv = dr.data('data_{}.csv'.format(_set))
    if os.path.exists(filename_csv):
        return pd.read_csv(filename_csv)
    else:
        sentences = []
        with open(dr.data('snli_1.0_{}_filtered.jsonl'.format(_set)),
                  'r') as f:
            for l in f:
                d = json.loads(l)
                sentences.append({
                    'pairID': d['pairID'],
                    'sentence2': d['sentence2'].lower()
                })
        df_sentence = pd.DataFrame(sentences).set_index('pairID')
        df_label = pd.read_csv(
            dr.data('snli_1.0_{}_gold_labels.csv'.format(_set))).set_index(
                'pairID')
        df = df_label.join(df_sentence)
        df = df[df.sentence2 == df.sentence2]  # NaN != NaN: drops rows with a missing sentence2
        df.to_csv(filename_csv)
        return df
Example #7
import matplotlib.pyplot as plt
import data_reader
from model import LeNet
import os
from scipy import ndimage
from skimage.transform import resize
from skimage.io import imread
from skimage import color

training_file = 'data/train.p'
validation_file = 'data/valid.p'
testing_file = 'data/test.p'

is_debug = True

data = data_reader.data(training_file, validation_file, testing_file)
data.print_data_info()

# according to the printed info, there are 43 classes of signs

# use a dictionary to map each class ID to its label
label_dict = {}
with open('signnames.csv', 'r') as f:
    lines = f.readlines()
    for line in lines[1:]:
        tmp = line.strip('\n')
        tmp = tmp.split(',')
        label_dict[tmp[0]] = tmp[1]
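Because the CSV fields are read in as strings, label_dict is keyed by the string form of the class ID. A small usage note (assuming the standard signnames.csv layout of ClassId,SignName):

# lookups use the string form of the class index
print(label_dict['14'])  # e.g. 'Stop' in the standard signnames.csv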

Example #8
def load_google_vectors():
    return gensim.models.KeyedVectors.load_word2vec_format(
        dr.data('GoogleNews-vectors-negative300.bin'), binary=True)
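Unlike the pickled dict in Example #3, this returns a gensim KeyedVectors object, so lookups go through its API rather than dict methods. A short usage sketch:

vectors = load_google_vectors()
if 'hello' in vectors:          # KeyedVectors supports membership tests
    v = vectors['hello']        # a 300-dimensional numpy array
    print(vectors.most_similar('hello', topn=5))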