Beispiel #1
0
def worker(parser, rank, pos_sents, neg_sents, name, return_dict):
    """Score one shard of sentence pairs and publish the outcome under ``rank``.

    Every (positive, negative) pair is scored with ``parser.perplexity``; the
    pair counts as correct when the positive sentence has the strictly lower
    perplexity.

    Arguments:
        parser: object exposing ``perplexity(sentence)`` and ``grammar.w2i``.
        rank: key under which the result is stored in ``return_dict``; the
            worker with rank 0 is the only one that shows a progress bar.
        pos_sents: positive sentences of this shard.
        neg_sents: negative sentences, paired position-wise with ``pos_sents``.
        name: label written into the first column of every result row.
        return_dict: mapping that receives ``(rows, num_correct)`` at ``rank``.
    """
    pairs = list(zip(pos_sents, neg_sents))
    # A single progress bar is enough; the other workers iterate silently.
    iterator = tqdm(pairs) if rank == 0 else pairs

    rows = []
    hits = 0
    for index, (positive, negative) in enumerate(iterator):
        pp_positive = parser.perplexity(positive)
        pp_negative = parser.perplexity(negative)
        is_correct = pp_positive < pp_negative
        hits += is_correct

        # see which words are unked during prediction
        shown_positive = process_sentence(positive, parser.grammar.w2i)
        shown_negative = process_sentence(negative, parser.grammar.w2i)

        rows.append((
            name,
            str(index),
            str(round(pp_positive, 2)),
            str(round(pp_negative, 2)),
            str(int(is_correct)),
            ' '.join(shown_positive),
            ' '.join(shown_negative),
        ))

    return_dict[rank] = (rows, hits)
Beispiel #2
0
def main():
    """Train an ``EncoderDecoder`` on the fr_en data and evaluate it each epoch.

    Steps:
      1. load the fr_en splits with ``get_fr_en_data``;
      2. build the token vocabulary from the train, dev and test sentences;
      3. convert the train+dev sentences with ``process_sentence`` and build a
         label array zero-padded to ``MAX_TOKEN_SIZE``;
      4. run 10 epochs of mini-batch training, printing the mean loss per
         batch and calling ``predict`` on the test split after every epoch.
    """
    # load the data
    # NOTE: get_en_es_data(0, 0) / get_es_en_data(0, 0) return the same tuple
    # layout and can be swapped in here to train on the other datasets.
    fr_en_train_dev, fr_en_train, fr_en_dev, fr_en_test, mappings, u3, c3 = get_fr_en_data(0, 0)

    # convert the data to token to idx
    all_tokens = np.array(list(fr_en_train[0]) + list(fr_en_dev[0]) + list(fr_en_test[0]))
    token_to_idx = create_token_to_idx(all_tokens)

    # Tokens are converted here, while the metadata arrives already processed.
    # The metadata originally had a shape of (num_of_exercises, MAX_TOKEN_SIZE)
    # but is now (num_of_exercises, MAX_TOKEN_SIZE, num_of_features).
    train_sentence_idx = process_sentence(fr_en_train_dev[0], token_to_idx)
    train_metadata = fr_en_train_dev[1]

    instance_id_to_dict = fr_en_train_dev[3]
    # convert true_labels to idx, right-padding every exercise with zeros
    labels_array = np.zeros((len(fr_en_train_dev[0]), MAX_TOKEN_SIZE))
    for row in range(len(labels_array)):
        instance_ids = fr_en_train_dev[2][row]
        labels_array[row] = np.array(
            [instance_id_to_dict[i_id] for i_id in instance_ids]
            + [0] * (MAX_TOKEN_SIZE - len(instance_ids)))

    model = EncoderDecoder(len(token_to_idx), mappings, 100, 100, 4, 100, 100)
    for epoch in range(10):
        print("Epoch ", epoch + 1)
        total_loss = 0
        num_batches = 0
        for start in tqdm(range(0, len(train_sentence_idx), BATCH_SIZE)):
            stop = start + BATCH_SIZE
            x_batch = train_sentence_idx[start:stop]
            y_batch = labels_array[start:stop]
            x_metadata_batch = train_metadata[start:stop]

            mask = create_padding_mask(x_batch)
            with tf.GradientTape() as tape:
                logits = model.call(x_batch, x_metadata_batch, mask, training=True)
                loss = model.loss_function(logits, y_batch, mask)
            total_loss += loss.numpy()
            num_batches += 1
            gradients = tape.gradient(loss, model.trainable_variables)
            model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # Divide by the number of batches actually processed.  The previous
        # expression `total_loss/i+1` parsed as `(total_loss / i) + 1` and used
        # the last sample offset instead of the batch count.
        print("Avg batch loss", total_loss / max(num_batches, 1))

        print("====Test====")
        predict(model, fr_en_test, token_to_idx)
Beispiel #3
0
def predict(model, data, token_to_idx):
    """Generate instance-wise predictions and report metrics on them.

    For every instance (token) the value taken is the softmax output at class
    index 1, which the original author describes as the probability of
    getting the word incorrect.

    Arguments:
        model {tf.model} -- The tensorflow model trained on the duolingo train data
        data {tuple} -- 4-tuple of (raw sentences, per-exercise metadata,
            per-exercise instance ids, mapping from instance id to label)
        token_to_idx {dict} -- the mapping from token to idx

    Returns:
        tuple -- ``(flattened_instance_ids, actual, preds)``: three parallel
        lists holding, per instance, its id, its true label as ``int`` and the
        predicted probability.  ``compute_metrics(actual, preds)`` is called
        before returning.
    """
    raw_sent, raw_users, all_instance_ids, labels_dict = data
    sent_idx = process_sentence(raw_sent, token_to_idx)

    flattened_instance_ids = []
    actual = []
    preds = []

    for start in tqdm(range(0, len(sent_idx), BATCH_SIZE)):
        stop = start + BATCH_SIZE
        x_batch = sent_idx[start:stop]
        x_user_batch = raw_users[start:stop]
        instance_ids_list = all_instance_ids[start:stop]

        # The mask is built per batch only; a mask over the whole data set was
        # previously computed up front and then thrown away.
        mask = create_padding_mask(x_batch)
        logits = model.call(x_batch, x_user_batch, mask, training=False)
        probs = tf.nn.softmax(logits)
        # Convert once per batch instead of once per exercise.
        predictions = probs[:, :, 1].numpy()

        assert len(preds) == len(actual)
        for row, instance_ids in enumerate(instance_ids_list):
            # Positions past the number of instance ids are padding; drop them.
            preds.extend(predictions[row][:len(instance_ids)].tolist())
            actual.extend(int(labels_dict[instance]) for instance in instance_ids)

            # add to final list of instance ids
            flattened_instance_ids.extend(instance_ids)

    compute_metrics(actual, preds)
    return flattened_instance_ids, actual, preds
Beispiel #4
0
def main():
    """Train an ``EncoderDecoder`` on the es_en data and evaluate it each epoch.

    Steps:
      1. load the es_en splits with ``get_es_en_data``;
      2. build the token vocabulary from the train, dev and test sentences;
      3. convert the train+dev sentences with ``process_sentence`` and build a
         label array zero-padded to ``MAX_TOKEN_SIZE``;
      4. run 10 epochs of mini-batch training, printing the mean loss per
         batch and calling ``predict`` on the test split after every epoch.
    """
    es_en_train_dev, es_en_train, es_en_dev, es_en_test, mappings, u2, c2 = get_es_en_data(
        0, 0)

    # convert the data to token to idx
    all_tokens = np.array(
        list(es_en_train[0]) + list(es_en_dev[0]) + list(es_en_test[0]))
    token_to_idx = create_token_to_idx(all_tokens)

    train_sentence_idx = process_sentence(es_en_train_dev[0], token_to_idx)
    train_metadata = es_en_train_dev[1]

    instance_id_to_dict = es_en_train_dev[3]
    # convert true_labels to idx, right-padding every exercise with zeros
    labels_array = np.zeros((len(es_en_train_dev[0]), MAX_TOKEN_SIZE))
    for row in range(len(labels_array)):
        instance_ids = es_en_train_dev[2][row]
        labels_array[row] = np.array(
            [instance_id_to_dict[i_id] for i_id in instance_ids]
            + [0] * (MAX_TOKEN_SIZE - len(instance_ids)))

    model = EncoderDecoder(len(token_to_idx), mappings, 100, 100, 4, 100, 100)
    for epoch in range(10):
        print("Epoch ", epoch + 1)
        total_loss = 0
        num_batches = 0
        for start in tqdm(range(0, len(train_sentence_idx), BATCH_SIZE)):
            stop = start + BATCH_SIZE
            x_batch = train_sentence_idx[start:stop]
            y_batch = labels_array[start:stop]
            x_metadata_batch = train_metadata[start:stop]

            mask = create_padding_mask(x_batch)
            with tf.GradientTape() as tape:
                logits = model.call(x_batch,
                                    x_metadata_batch,
                                    mask,
                                    training=True)
                loss = model.loss_function(logits, y_batch, mask)
            total_loss += loss.numpy()
            num_batches += 1
            gradients = tape.gradient(loss, model.trainable_variables)
            model.optimizer.apply_gradients(
                zip(gradients, model.trainable_variables))

        # Divide by the number of batches actually processed.  The previous
        # expression `total_loss / i + 1` parsed as `(total_loss / i) + 1` and
        # used the last sample offset instead of the batch count.
        print("Avg batch loss", total_loss / max(num_batches, 1))

        print("====Test====")
        predict(model, es_en_test, token_to_idx)
Beispiel #5
0
    def parse(self,
              sentence,
              verbose=True,
              use_numpy=False,
              num_trees=10,
              root='TOP'):
        """Parse ``sentence`` with CKY and return ``(tree, score)``.

        The sentence is first passed through ``process_sentence`` with the
        grammar vocabulary, and the processed form is what ``self.cky`` sees.
        The tree itself is built from the *original* ``sentence``.

        Arguments:
            sentence: the sentence to parse.
            verbose: when true, print the processed sentence and progress.
            use_numpy: forwarded to ``self.cky``.
            num_trees: accepted but not used by this method.
            root: name of the root nonterminal, looked up in ``grammar.n2i``.

        Returns:
            The tree from ``self.build_tree`` and the chart score stored for
            ``root`` at position ``[0, -1]``.
        """
        tokens = process_sentence(sentence, self.grammar.w2i)
        if verbose:
            print('Processed sentence: `{}`'.format(' '.join(tokens)))
            print('Running CKY...')
        chart, backpointers = self.cky(tokens, use_numpy=use_numpy)

        # Chart entry for the root symbol at span indices [0, -1].
        root_score = chart[self.grammar.n2i[root], 0, -1]

        if verbose:
            print('Building tree...')
        return self.build_tree(backpointers, sentence, root=root), root_score
Beispiel #6
0
def make_dataset(root_path, annotation_path, sample_duration, dictionary):
    """Build, per annotated video, a list of fixed-length clip descriptors.

    Arguments:
        root_path: directory whose entries are treated as videos; each entry
            is itself listed and its number of entries is used as the frame
            count.
        annotation_path: passed to ``load_annotation_data`` to obtain a
            mapping from video name to its text.
        sample_duration: clip length in frames; also the stride between clips.
        dictionary: passed to ``process_sentence`` when encoding the text.

    Returns:
        A list with one element per kept video.  Each element is a list of
        dicts with the keys ``video``, ``segment``, ``n_frames``, ``label`` and
        ``frame_indices``.  Entries named ``.DS_Store`` or missing from the
        annotations are skipped.
    """
    captions = load_annotation_data(annotation_path)
    annotated = captions.keys()

    collected = []
    total = len(os.listdir(root_path))
    for position, name in enumerate(os.listdir(root_path)):
        if name == ".DS_Store" or name not in annotated:
            continue

        # Progress is only reported when a kept entry lands on a multiple of 1000.
        if position % 1000 == 0:
            print('dataset loading [{}/{}]'.format(position, total))

        clip_dir = os.path.join(root_path, name)
        if not os.path.exists(clip_dir):
            print(f"Path {clip_dir} not found!")
            continue

        frame_count = len(os.listdir(clip_dir))
        template = {
            'video': clip_dir,
            'segment': [1, frame_count],
            'n_frames': frame_count,
            'label': process_sentence(captions[name], dictionary),
        }

        clips = []
        # Frame numbers are 1-based; the last clip is truncated at frame_count.
        for first in range(1, frame_count, sample_duration):
            clip = copy.deepcopy(template)
            last = min(frame_count + 1, first + sample_duration)
            clip['frame_indices'] = list(range(first, last))
            clips.append(clip)
        collected.append(clips)
    return collected
Beispiel #7
0
    def perplexity(self, sentence):
        """Return the per-token perplexity of ``sentence`` under the grammar.

        The sentence is passed through ``process_sentence`` with the grammar
        vocabulary, converted to word ids, and handed to ``_cky.inside``.  The
        result is ``exp(-logprob / len(sentence))``.

        Arguments:
            sentence: tokenized sentence; its length is the normaliser.

        Returns:
            The perplexity ``exp(-logprob / len(sentence))``.

        NOTE(review): an empty sentence leads to a division by zero in the
        final expression; callers are presumably expected to pass non-empty
        sentences -- confirm.
        """
        processed_sentence = process_sentence(sentence, self.grammar.w2i)

        sent_len = len(sentence)

        sentence_array = np.array(
            [self.grammar.w2i[word] for word in processed_sentence],
            dtype=np.int32)

        # Chart handed to the inside routine, initialised to -inf.  No
        # backpointer table is allocated: only the total log-probability is
        # needed here, and the one previously built was never used.
        score = -np.inf * np.ones(
            (self.grammar.num_nonterminals, sent_len + 1, sent_len + 1),
            dtype=np.float32)

        # Inside recursion
        logprob = _cky.inside(
            sentence_array,
            sent_len,
            score,
            self.grammar.num_lexical_rules,
            self.grammar.num_unary_rules,
            self.grammar.num_binary_rules,
            self.grammar.num_nonterminals,
            self.grammar.lexical,
            self.grammar.unary,
            self.grammar.binary,
            self.grammar.top,
            self.grammar.lexical_prob,
            self.grammar.unary_prob,
            self.grammar.binary_prob,
            self.grammar.top_prob,
        )

        return np.exp(-logprob / sent_len)
Beispiel #8
0
def syneval(parser,
            indir,
            outpath,
            parallel=False,
            short=False,
            add_period=True):
    """Run the syneval minimal-pair evaluation and write one TSV row per pair.

    For every file name in ``SHORT`` (when ``short``) or ``ALL``, the files
    ``<indir>/<name>.pos`` and ``<indir>/<name>.neg`` are read line by line as
    paired sentences.  A pair counts as correct when the positive sentence has
    strictly lower perplexity than the negative one.

    Arguments:
        parser: object exposing ``perplexity(sentence)`` and ``grammar.w2i``.
        indir: directory holding the ``.pos`` / ``.neg`` files.
        outpath: path of the tab-separated output file (overwritten).
        parallel: score the pairs in separate processes, one per chunk.
        short: evaluate the ``SHORT`` file list instead of ``ALL``.
        add_period: append `` .`` to every sentence before tokenising.

    Raises:
        AssertionError: if a ``.pos`` / ``.neg`` pair differ in line count.
    """
    print(f'Loading syneval examples from directory `{indir}`.')
    print(f'Writing predictions to `{outpath}`.')

    files = SHORT if short else ALL

    with open(outpath, 'w') as outfile:
        print('\t'.join(
            ('name', 'index', 'pos-perplexity', 'neg-perplexity', 'correct',
             'pos-sentence-processed', 'neg-sentence-processed')),
              file=outfile)

        print('Predicting syneval for:', '\n', '\n '.join(files))

        for fname in files:
            print(f'Predicting `{fname}`...')

            inpath = os.path.join(indir, fname)

            with open(inpath + '.pos') as f:
                pos_sents = [line.strip() for line in f]
            with open(inpath + '.neg') as f:
                neg_sents = [line.strip() for line in f]

            if add_period:
                pos_sents = [sent + ' .' for sent in pos_sents]
                neg_sents = [sent + ' .' for sent in neg_sents]

            pos_sents = [sent.split() for sent in pos_sents]
            neg_sents = [sent.split() for sent in neg_sents]

            assert len(pos_sents) == len(neg_sents)

            if parallel:
                size = mp.cpu_count()

                # Guard against a zero step when the file is empty.
                chunk_size = max(1, ceil_div(len(pos_sents), size))
                pos_parts = [
                    pos_sents[i:i + chunk_size]
                    for i in range(0, len(pos_sents), chunk_size)
                ]
                neg_parts = [
                    neg_sents[i:i + chunk_size]
                    for i in range(0, len(neg_sents), chunk_size)
                ]

                # Rounding the chunk size up can yield fewer chunks than CPUs
                # (e.g. 10 sentences on 8 cores -> 5 chunks), so start exactly
                # one process per chunk instead of indexing past the end.
                num_parts = len(pos_parts)
                print(f'Predicting in parallel with {num_parts} processes...')

                # spawn processes
                manager = mp.Manager()
                return_dict = manager.dict()
                processes = []
                for rank in range(num_parts):
                    p = mp.Process(target=worker,
                                   args=(parser, rank, pos_parts[rank],
                                         neg_parts[rank], fname, return_dict))
                    p.start()
                    processes.append(p)
                for p in processes:
                    p.join()

                # merge all results in chunk order (linear-time flatten)
                merged = [
                    row for rank in range(num_parts)
                    for row in return_dict[rank][0]
                ]
                # Workers number their rows from 0 within their own chunk;
                # renumber so the `index` column matches the serial path.
                results = [(row[0], str(index)) + tuple(row[2:])
                           for index, row in enumerate(merged)]
                # sum number of correct results
                num_correct = sum(
                    return_dict[rank][1] for rank in range(num_parts))

            else:
                results = []
                num_correct = 0

                for i, (pos,
                        neg) in enumerate(tqdm(list(zip(pos_sents,
                                                        neg_sents)))):

                    pos_pp = parser.perplexity(pos)
                    neg_pp = parser.perplexity(neg)
                    correct = pos_pp < neg_pp
                    num_correct += correct

                    # see which words are unked during prediction
                    pos = process_sentence(pos, parser.grammar.w2i)
                    neg = process_sentence(neg, parser.grammar.w2i)

                    result = (fname, str(i), str(round(pos_pp, 2)),
                              str(round(neg_pp, 2)), str(int(correct)),
                              ' '.join(pos), ' '.join(neg))
                    results.append(result)

            for result in results:
                print('\t'.join(result), file=outfile)

            # An empty file would otherwise divide by zero here.
            total = len(pos_sents)
            accuracy = num_correct / total if total else 0.0
            print(
                f'{fname}: {num_correct}/{total} = {accuracy:.2%} correct',
                '\n')
Beispiel #9
0
def main():
    """Train one ``EncoderDecoder`` on the en_es, es_en and fr_en data combined.

    Steps:
      1. load the three datasets, threading the ``u`` / ``c`` values returned
         by one loader into the next;
      2. concatenate and jointly shuffle the three train+dev sets and merge
         their label dictionaries and feature mappings;
      3. build the token vocabulary from every split of every dataset;
      4. run 10 epochs of mini-batch training over the first 1/50th of the
         shuffled exercises, printing the mean loss per batch and calling
         ``predict`` on each of the three test splits after every epoch.
    """
    # load the data
    # The first loader's splits must get their own en_es_* names; they used to
    # be bound to es_en_* and were immediately overwritten by the next call,
    # leaving en_es_train / en_es_dev / en_es_test undefined further down.
    en_es_train_dev, en_es_train, en_es_dev, en_es_test, mappings1, u1, c1 = get_en_es_data(0, 0)
    es_en_train_dev, es_en_train, es_en_dev, es_en_test, mappings2, u2, c2 = get_es_en_data(u1, c1)
    fr_en_train_dev, fr_en_train, fr_en_dev, fr_en_test, mappings3, u3, c3 = get_fr_en_data(u2, c2)

    # combine train_dev for all three datasets
    en_es_sentence, en_es_meta, en_es_inst, en_es_label = en_es_train_dev
    es_en_sentence, es_en_meta, es_en_inst, es_en_label = es_en_train_dev
    fr_en_sentence, fr_en_meta, fr_en_inst, fr_en_label = fr_en_train_dev

    sentence_parts = (en_es_sentence, es_en_sentence, fr_en_sentence)
    meta_parts = (en_es_meta, es_en_meta, fr_en_meta)
    inst_parts = (en_es_inst, es_en_inst, fr_en_inst)

    # Report the shapes going into the concatenation.
    for parts in (sentence_parts, meta_parts, inst_parts):
        for part in parts:
            print(part.shape)

    combined_sentence = np.concatenate(sentence_parts, axis=0)
    combined_meta = np.concatenate(meta_parts, axis=0)
    combined_inst = np.concatenate(inst_parts, axis=0)
    # combine labels; later dictionaries win on duplicate keys
    combined_labels = copy.deepcopy(en_es_label)
    combined_labels.update(es_en_label)
    combined_labels.update(fr_en_label)

    # One shared permutation keeps sentences, metadata and ids aligned.
    index = np.random.permutation(combined_sentence.shape[0])
    combined_train_dev = (combined_sentence[index], combined_meta[index],
                          combined_inst[index], combined_labels)

    # combine mappings1, mappings2, mappings3 field by field
    usid1, ctid1, clt1, sessid1, fmatid1, speechid1, dep1, morph1 = mappings1
    usid2, ctid2, clt2, sessid2, fmatid2, speechid2, dep2, morph2 = mappings2
    usid3, ctid3, clt3, sessid3, fmatid3, speechid3, dep3, morph3 = mappings3
    usid = combine_dicts(usid1, usid2, usid3)
    # was combine_dicts(ctid, ...), which read `ctid` before it was assigned
    ctid = combine_dicts(ctid1, ctid2, ctid3)
    clt = combine_dicts(clt1, clt2, clt3)
    sess = combine_dicts(sessid1, sessid2, sessid3)
    fmat = combine_dicts(fmatid1, fmatid2, fmatid3)
    speech = combine_dicts(speechid1, speechid2, speechid3)
    dep = combine_dicts(dep1, dep2, dep3)
    morph = combine_dicts(morph1, morph2, morph3)
    combined_mappings = (usid, ctid, clt, sess, fmat, speech, dep, morph)

    # convert the data to token to idx
    all_tokens = np.array(list(es_en_train[0]) + list(es_en_dev[0]) + list(es_en_test[0]) +
                          list(en_es_train[0]) + list(en_es_dev[0]) + list(en_es_test[0]) +
                          list(fr_en_train[0]) + list(fr_en_dev[0]) + list(fr_en_test[0]))
    token_to_idx = create_token_to_idx(all_tokens)

    # Tokens are converted here, while the metadata arrives already processed.
    # The metadata originally had a shape of (num_of_exercises, MAX_TOKEN_SIZE)
    # but is now (num_of_exercises, MAX_TOKEN_SIZE, num_of_features).
    train_sentence_idx = process_sentence(combined_train_dev[0], token_to_idx)
    train_metadata = combined_train_dev[1]

    instance_id_to_dict = combined_train_dev[3]
    # convert true_labels to idx, right-padding every exercise with zeros
    labels_array = np.zeros((len(combined_train_dev[0]), MAX_TOKEN_SIZE))
    for row in range(len(labels_array)):
        instance_ids = combined_train_dev[2][row]
        labels_array[row] = np.array(
            [instance_id_to_dict[i_id] for i_id in instance_ids]
            + [0] * (MAX_TOKEN_SIZE - len(instance_ids)))

    model = EncoderDecoder(len(token_to_idx), combined_mappings, 300, 300, 4, 100, 100)

    # NOTE(review): only the first 1/50th of the exercises is used per epoch,
    # as in the original `len(...)/50`; confirm this subsampling is intended.
    # Integer division is required because range() rejects floats.
    train_limit = len(train_sentence_idx) // 50

    for epoch in range(10):
        print("Epoch ", epoch + 1)
        # TODO: shuffle training data
        total_loss = 0
        num_batches = 0
        for start in tqdm(range(0, train_limit, BATCH_SIZE)):
            stop = start + BATCH_SIZE
            x_batch = train_sentence_idx[start:stop]
            y_batch = labels_array[start:stop]
            x_metadata_batch = train_metadata[start:stop]

            mask = create_padding_mask(x_batch)
            with tf.GradientTape() as tape:
                logits = model.call(x_batch, x_metadata_batch, mask, training=True)
                loss = model.loss_function(logits, y_batch, mask)
            total_loss += loss.numpy()
            num_batches += 1
            gradients = tape.gradient(loss, model.trainable_variables)
            model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # Average over the batches actually run, not over the last sample
        # offset (which also is undefined when no batch was processed).
        print("Avg batch loss", total_loss / max(num_batches, 1))

        print("====Test====")
        predict(model, es_en_test, token_to_idx)
        predict(model, en_es_test, token_to_idx)
        predict(model, fr_en_test, token_to_idx)
Beispiel #10
0
# Index of the "<pad>" token in the input vocabulary.  It is handed to the
# loss as `ignore_index`; assuming `nn` is `torch.nn`, targets equal to this
# index do not contribute to the loss.
pad_idx = inputText.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# In[12]:

# Loading the pretrained model
# Only done when `load_model` is truthy; the checkpoint is read from a fixed
# relative path and passed to `load_checkpoint` with the model and optimizer.
if load_model:
    load_checkpoint(torch.load("./training/data/model/my_checkpoint.pth.tar"),
                    model, optimizer)

# In[13]:

# Example sentence
src = "The feasibility study estimates that it would take passengers about four minutes to cross the Potomac River on the gondola."

# Run the example sentence through the model.  The last element of the result
# is dropped below; per the original note it is the <eos> token.
prediction = process_sentence(model, src, inputText, outputText, device)
prediction = prediction[:-1]  # remove <eos> token

print(prediction)

# ## Evaluation

# In[14]:

# Loading the test dataset
# Tab-separated UTF-8 file.  Assuming `pd` is pandas, `header=0` together with
# `names=` treats the first row as a header and replaces it with these two
# column names.
test_data = pd.read_csv("./training/test/test-sample.txt",
                        header=0,
                        names=['InputText', 'OutputText'],
                        sep='\t',
                        encoding='utf-8')