Example 1
    def transform(self, X1, Y=None):
        sents = []
        lengths = []

        # transform input sentences X into word ids
        for sent in X1:
            word_ids = []
            for w in sent:
                w = self._lower(w)
                w = self._normalize_num(w)
                if w in self.vocab_word:
                    word_id = self.vocab_word[w]
                else:
                    word_id = self.vocab_word[UNK]
                word_ids.append(word_id)
            lengths.append(len(word_ids))
            sents.append(word_ids)

        # transform label Y
        if Y is not None:
            sent_labels = [[self.vocab_tag[l] for l in labels] for labels in Y]
        else:
            sent_labels = None

        # sequence_length
        sequence_length = np.asarray(lengths)

        # padding (guard against Y=None at prediction time)
        X_result = pad_sequences(sents, 0, max_length=self.max_length)
        if sent_labels is not None:
            Y_result = pad_sequences(sent_labels, 0, max_length=self.max_length)
            input_mask = np.array((Y_result > 0), dtype=np.float32)
        else:
            Y_result = None
            # fall back to masking the padded inputs (assumes word id 0 is the pad index)
            input_mask = np.array((np.asarray(X_result) > 0), dtype=np.float32)
        X_result = [X_result, input_mask, sequence_length]
        return X_result, Y_result
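Across these examples, several unrelated helpers share the name pad_sequences. The version called in Example 1 takes (sequences, pad_tok, max_length=...) and returns only the padded matrix, since the lengths are tracked separately. A minimal sketch consistent with that call site, assuming max_length both truncates and pads, could be:

import numpy as np

def pad_sequences(sequences, pad_tok, max_length=None):
    # Hypothetical helper matching Example 1's call: pad (and truncate) every
    # sequence to max_length with pad_tok and return a 2-D array.
    if max_length is None:
        max_length = max(len(seq) for seq in sequences)
    padded = []
    for seq in sequences:
        seq = list(seq)[:max_length]
        padded.append(seq + [pad_tok] * (max_length - len(seq)))
    return np.asarray(padded)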
Example 2
    def _pad_feed_dict(self, word_ids,
                       char_ids, cap_ids,
                       start_ids, end_ids,
                       gold_labels, att_ids,
                       dropout, hidden_dropout):
        fd = {}
        word_ids, sent_lengths = pad_sequences(word_ids, 0)
        fd[self.word_ids] = word_ids
        fd[self.sent_lengths] = sent_lengths

        if self.parameters['char_dim']:
            char_ids, word_lengths = pad_sequences(char_ids, 0, 2)
            fd[self.char_ids] = char_ids
            fd[self.word_lengths] = word_lengths
        if self.parameters['cap_dim']:
            cap_ids, _ = pad_sequences(cap_ids,0)
            fd[self.cap_ids] = cap_ids
        att_ids, markable_length = pad_sequences(att_ids, [0, 0],max_len=self.parameters['max_len'])
        fd[self.att_ids] = att_ids
        fd[self.markable_lengths] = markable_length
        if self.parameters['len_dim']:
            len_ids = [e[1]-s[1] for s, e in zip(start_ids, end_ids)]
            fd[self.len_ids] = len_ids

        fd[self.start_ids] = start_ids
        fd[self.end_ids] = end_ids
        fd[self.gold_labels] = gold_labels

        fd[self.dropout] = dropout
        fd[self.hidden_dropout] = hidden_dropout

        return fd, sent_lengths
Example 3
    def run_one_epoch(self, ep, train_ds, valid_ds, batch_size):

        losses = []
        i = 0
        acc_score = 0
        for xbatch, ybatch in minibatch(train_ds, batch_size):
            i += 1
            word_seq, sequence_len = pad_sequences(xbatch)
            target_seq, _ = pad_sequences(ybatch)

            # build feed dictionary
            feed = {
                self.word_ids: word_seq,
                self.labels: target_seq,
                self.sequence_lengths: sequence_len,
                self.learning_rate: self.lr,
                self.keep_dropout_rate: self.kdr
            }

            _, train_loss = self.sess.run([self.train_op, self.loss],
                                          feed_dict=feed)
            losses += [train_loss]

            if i % 10 == 0:
                print('ep:', ep, 'iter:', i, 'loss:', np.mean(losses))
            if i % 50 == 0:
                acc_score, _ = self.run_validation(valid_ds, batch_size)
                print('accuracy', acc_score)

        if acc_score == 0:
            acc_score, _ = self.run_validation(valid_ds, batch_size)

        metrics = {}
        metrics['acc'] = acc_score
        return metrics
Example 4
    def transform(self, X):
        results1 = []
        results2 = []
        feature_set1 = ["amod", "nsubj", "dep"]
        feature_set2 = ["dobj", "nsubj", "dep"]
        for sent, dependency in X:
            result1 = [0] * len(sent)
            result2 = [0] * len(sent)
            dependents, governor = dependency
            for i in range(1, len(governor)):
                gov_idx, relation = governor[i]
                if relation in feature_set2:
                    result2[i - 1] = 1
                if relation in feature_set1:
                    result1[gov_idx - 1] = 1
            results1.append(result1)
            results2.append(result2)

        padded_result1, sequence_length = pad_sequences(results1, pad_tok=0)
        padded_result2, _ = pad_sequences(results2, pad_tok=0)
        padded_result1_np = np.array(padded_result1, dtype=np.float32).reshape(
            len(padded_result1), len(padded_result1[0]), 1)
        padded_result2_np = np.array(padded_result2, dtype=np.float32).reshape(
            len(padded_result2), len(padded_result2[0]), 1)
        xxx = np.concatenate((padded_result1_np, padded_result2_np), axis=-1)
        return xxx, sequence_length
Example 5
def pad_features(features, params):
    maxlen = params['max_len']
    new_features = {}
    for fea in features.keys():
        if fea == 'dep_path':
            continue
        if fea == 'has_dep':  # not helping, not used
            continue
            # unreachable: kept only to document how the one-hot dep features were built
            num_example = len(features[fea])
            seq = np.zeros([num_example, maxlen, params['dep_size']])
            for i in range(num_example):
                for j, deps in zip(range(maxlen), features[fea][i]):
                    for dep in deps:
                        seq[i, j, dep] = 1.0
        elif fea == 'dep':
            seq = utils.pad_sequences(features[fea], 5, value=0)
        elif fea == 'pos1' or fea == 'pos2':
            seq = utils.pad_sequences(features[fea],
                                      maxlen,
                                      value=maxlen * 2 - 1)
        elif fea == 'bag_size' or fea == 'mask' or fea == 'weight':
            seq = np.array(features[fea])  # no change
        else:
            seq = utils.pad_sequences(features[fea], maxlen, value=0)
        new_features[fea] = seq
    return new_features
Example 6
    def _next_batch(self, data):
        """

        :param dataset.Dataset data:
        :return:
        """
        start = 0
        idx = 0
        while start < len(data.words):
            l_batch = data.labels[start:start + self.batch_size]
            labels, _ = pad_sequences(l_batch, pad_tok=0, nlevels=1)

            w_batch = data.words[start:start + self.batch_size]
            c_batch = data.chars[start:start + self.batch_size]
            pos_batch = data.poses[start:start + self.batch_size]
            word_ids, sequence_lengths = pad_sequences(w_batch,
                                                       pad_tok=0,
                                                       nlevels=1)
            char_ids, word_lengths = pad_sequences(c_batch,
                                                   pad_tok=0,
                                                   nlevels=2)
            pos_ids, _ = pad_sequences(pos_batch, pad_tok=0, nlevels=1)

            start += self.batch_size
            idx += 1
            batch_data = {
                self.sequence_lens: sequence_lengths,
                self.labels: labels,
                self.word_ids: word_ids,
                self.char_ids: char_ids,
                self.word_lengths: word_lengths,
                self.pos_ids: pos_ids,
            }
            yield batch_data
Example 7
    def get_feed_dict(self, words, labels=None, LR=None, dropout=None):
        if self.config.chars:
            char_ids, word_ids = zip(*words)  # words is a list of (char_ids, word_ids) pairs
            word_ids, sequence_lengths = pad_sequences(word_ids, pad_tok=0)
            char_ids, word_lengths = pad_sequences(
                char_ids, pad_tok=0, nlevels=2)
        else:
            word_ids, sequence_lengths = pad_sequences(words, pad_tok=0)

        feed_dict = {
            self.word_ids: word_ids,
            self.sequence_lengths: sequence_lengths
        }
        if self.config.chars:
            feed_dict[self.char_ids] = char_ids
            feed_dict[self.word_lengths] = word_lengths
        if labels is not None:
            labels, _ = pad_sequences(labels, pad_tok=0)
            feed_dict[self.labels] = labels
        if LR is not None:
            feed_dict[self.LR] = LR
        if dropout is not None:
            feed_dict[self.dropout] = dropout

        return feed_dict, sequence_lengths
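Examples 2, 6 and 7 (and several snippets below) instead expect a helper that returns both the padded batch and the original lengths, with nlevels=2 padding character ids nested inside words. The names and keywords below are taken from those call sites; the body is an assumed sketch, not the repositories' actual implementation:

def _pad(sequences, pad_tok, max_length):
    # pad/truncate each sequence to max_length and record the kept lengths
    padded, lengths = [], []
    for seq in sequences:
        seq = list(seq)[:max_length]
        padded.append(seq + [pad_tok] * (max_length - len(seq)))
        lengths.append(len(seq))
    return padded, lengths

def pad_sequences(sequences, pad_tok, nlevels=1):
    if nlevels == 1:
        max_len = max(len(seq) for seq in sequences)
        return _pad(sequences, pad_tok, max_len)
    # nlevels == 2: each element is a sentence of words, each word a list of char ids
    max_word = max(len(word) for sent in sequences for word in sent)
    char_padded, word_lengths = [], []
    for sent in sequences:
        chars, lens = _pad(sent, pad_tok, max_word)
        char_padded.append(chars)
        word_lengths.append(lens)
    max_sent = max(len(sent) for sent in sequences)
    char_padded, _ = _pad(char_padded, [pad_tok] * max_word, max_sent)
    word_lengths, _ = _pad(word_lengths, 0, max_sent)
    return char_padded, word_lengths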
Example 8
def load_te_dataset(filename, token2id, label2id):
    labels = []
    padded_premises = []
    padded_hypotheses = []
    original_premises = []
    original_hypotheses = []

    with open(filename) as in_file:
        reader = csv.reader(in_file, delimiter="\t")

        for row in reader:
            label = row[0].strip()
            premise_tokens = row[1].strip().split()
            hypothesis_tokens = row[2].strip().split()
            premise = row[4].strip()
            hypothesis = row[5].strip()
            labels.append(label2id[label])
            padded_premises.append([token2id.get(token, token2id["#unk#"]) for token in premise_tokens])
            padded_hypotheses.append([token2id.get(token, token2id["#unk#"]) for token in hypothesis_tokens])
            original_premises.append(premise)
            original_hypotheses.append(hypothesis)

        padded_premises = pad_sequences(padded_premises, padding="post", value=token2id["#pad#"], dtype=np.long)
        padded_hypotheses = pad_sequences(padded_hypotheses, padding="post", value=token2id["#pad#"], dtype=np.long)
        labels = np.array(labels)

        return labels, padded_premises, padded_hypotheses, original_premises, original_hypotheses
Example 9
    def _get_feed_dict(self, words, labels=None, lr=None, is_train=None):
        '''Build the TensorFlow feed dict from padded word (and char) ids.

        :param words: sequence of (word_ids, char_ids) pairs, one per sentence
        :param labels:
        :param lr:
        :param is_train:
        :return: feed dict
        '''
        word_ids, char_ids = zip(*words)
        word_ids, seq_len = pad_sequences(word_ids,
                                          max_length=None,
                                          pad_tok=0,
                                          nlevels=1)
        feed_dict = {self.word_ids: word_ids, self.seq_len: seq_len}
        if self.cfg.use_char_emb:
            char_ids, word_len = pad_sequences(char_ids,
                                               max_length=None,
                                               pad_tok=0,
                                               max_length_2=None,
                                               nlevels=2)
            # the call above returns a 3-D array [batch_size, max_len_sen, max_len_word]
            feed_dict[self.char_ids] = char_ids
            feed_dict[self.word_len] = word_len
        if labels is not None:
            feed_dict[self.labels] = labels
        if lr is not None:
            feed_dict[self.lr] = lr
        if is_train is not None:
            feed_dict[self.is_train] = is_train
        return feed_dict
Example 10
 def transform(self, X):
     results1 = []
     results2 = []
     fea_keys1 = self.features_dict.keys()
     fea_keys2 = self.features_dict2.keys()
     for words, dependency in X:
         result1 = [0] * len(words)
         result2 = [0] * len(words)
         pre_words = [re.sub(r"\d{1,10}", "0", word) for word in words]
         for key in fea_keys1:
             asp_words = key.split()
             idents_idx = self.identify2(pre_words, asp_words)
             if len(idents_idx) > 0:
                 for ident_idx in idents_idx:
                     from_idx, to_idx = ident_idx
                     for j in range(from_idx, to_idx):
                         result1[j] = 1
         for idx, word in enumerate(pre_words):
             if word in fea_keys2:
                 result2[idx] = 1
         results1.append(result1)
         results2.append(result2)
     padded_result1, sequence_length = pad_sequences(results1, pad_tok=0)
     padded_result2, _ = pad_sequences(results2, pad_tok=0)
     padded_result1_np = np.array(padded_result1, dtype=np.float32).reshape(
         len(padded_result1), len(padded_result1[0]), 1)
     padded_result2_np = np.array(padded_result2, dtype=np.float32).reshape(
         len(padded_result2), len(padded_result2[0]), 1)
     xxx = np.concatenate((padded_result1_np, padded_result2_np), axis=-1)
     return xxx, sequence_length
Example 11
    def predict_embeddings(self, words_to_drop):
        batches_i, sents_i, words, left_contexts, right_contexts = list(
            zip(*words_to_drop))

        vectorized_words = [[self.comick.characters_vocabulary[c] for c in w]
                            for w in words]
        words_lengths = torch.LongTensor([len(w) for w in words])
        padded_words = pad_sequences(vectorized_words, words_lengths)

        vectorized_left_contexts = [l.data for l in left_contexts]
        left_contexts_length = torch.LongTensor(
            [len(c) for c in left_contexts])
        padded_left = pad_sequences(vectorized_left_contexts,
                                    left_contexts_length)

        vectorized_right_contexts = [l.data for l in right_contexts]
        right_contexts_length = torch.LongTensor(
            [len(c) for c in right_contexts])
        padded_right = pad_sequences(vectorized_right_contexts,
                                     right_contexts_length)

        use_gpu = torch.cuda.is_available()
        if use_gpu:
            padded_left = padded_left.cuda()
            padded_words = padded_words.cuda()
            padded_right = padded_right.cuda()

        embeddings = self.comick(
            (Variable(padded_left), Variable(padded_words),
             Variable(padded_right)))
        attentions = self.comick.get_attentions()

        for si, i, embedding, attention in zip(batches_i, sents_i, embeddings,
                                               attentions):
            yield (si, i, embedding, attention)
Example 12
    def predict_embeddings(self, words_to_drop):
        batches_i, sents_i, words, left_contexts, right_contexts, contexts = list(
            zip(*words_to_drop))

        vectorized_words = self.comick.vectorize_words(words)
        words_lengths = torch.LongTensor([len(w) for w in vectorized_words])
        padded_words = pad_sequences(vectorized_words, words_lengths)

        vectorized_contexts = [l.data.cpu() for l in contexts]
        contexts_length = torch.LongTensor([len(c) for c in contexts])
        padded = pad_sequences(vectorized_contexts, contexts_length)

        # vectorized_left_contexts = [l.data.cpu() for l in left_contexts]
        # left_contexts_length = torch.LongTensor([len(c) for c in left_contexts])
        # padded_left = pad_sequences(vectorized_left_contexts, left_contexts_length)

        # vectorized_right_contexts = [l.data.cpu() for l in right_contexts]
        # right_contexts_length = torch.LongTensor([len(c) for c in right_contexts])
        # padded_right = pad_sequences(vectorized_right_contexts, right_contexts_length)

        use_gpu = torch.cuda.is_available()
        if use_gpu:
            padded = padded.cuda()
            padded_words = padded_words.cuda()
            # padded_right = padded_right.cuda()

        embeddings, attentions = self.comick((padded, padded_words))

        if self.comick.attention:
            for si, i, embedding, attention, in zip(batches_i, sents_i,
                                                    embeddings, attentions):
                yield (si, i, embedding, attention)
        else:
            for si, i, embedding, in zip(batches_i, sents_i, embeddings):
                yield (si, i, embedding, [])
Example 13
def generate_answers(sess, model, dataset, rev_vocab):
    """
    Loop over the dev or test dataset and generate answer.

    Note: output format must be answers[uuid] = "real answer"
    You must provide a string of words instead of just a list, or start and end index

    In main() function we are dumping onto a JSON file

    evaluate.py will take the output JSON along with the original JSON file
    and output a F1 and EM

    You must implement this function in order to submit to Leaderboard.

    :param sess: active TF session
    :param model: a built QASystem model
    :param rev_vocab: this is a list of vocabulary that maps index to actual words
    :return:
    """
    answers = {}
    zipped = list(zip(*dataset))  # materialize so len() and slicing work under Python 3
    num_batches = (len(zipped) + FLAGS.batch_size - 1) // FLAGS.batch_size
    for batch_idx, batch in enumerate(get_minibatches(zipped, FLAGS.batch_size)):
        context_data, question_data, question_uuid_data, context_tokens = zip(
            *batch)
        p, q = [], []
        p_len, q_len = [], []
        for i in range(len(context_data)):
            q.append(question_data[i].split())
            q_len.append(
                min(FLAGS.question_size, len(question_data[i].split())))

            p.append(context_data[i].split())
            p_len.append(
                min(FLAGS.paragraph_size, len(context_data[i].split())))

        q = pad_sequences(q,
                          maxlen=FLAGS.question_size,
                          value=PAD_ID,
                          padding="post")
        p = pad_sequences(p,
                          maxlen=FLAGS.paragraph_size,
                          value=PAD_ID,
                          padding="post")

        ys, ye = model.predict_batch(sess, p, q, p_len, q_len)

        a_s_pred = np.argmax(ys, axis=1)
        a_e_pred = np.argmax(ye, axis=1)
        for i in range(len(context_data)):
            #predicted a_s and a_e
            s_pred = a_s_pred[i]
            e_pred = a_e_pred[i]

            uuid = question_uuid_data[i]
            pred_raw = ' '.join(context_tokens[i][s_pred:e_pred + 1])
            answers[uuid] = pred_raw
        print("Finished answering batch {} of {}".format(i + 1, num_batches))
    return answers
Example 14
def load_vte_dataset(nli_dataset_filename,
                     token2id,
                     label2id,
                     keep_neutrals=True):
    labels = []
    padded_premises = []
    padded_hypotheses = []
    image_names = []
    original_premises = []
    original_hypotheses = []

    with open(nli_dataset_filename) as in_file:

        reader = csv.reader(in_file, delimiter="\t")

        next(reader, None)  #skip header

        for row in reader:
            label = row[0].strip()

            if not keep_neutrals and label == 'neutral':
                continue

            premise_tokens = row[1].strip().split()
            hypothesis_tokens = row[2].strip().split()
            image = row[3].strip().split("#")[0]

            premise = row[4].strip()
            hypothesis = row[5].strip()
            labels.append(label2id[label])

            padded_premises.append([
                token2id.get(token, token2id["#unk#"])
                for token in premise_tokens
            ])
            padded_hypotheses.append([
                token2id.get(token, token2id["#unk#"])
                for token in hypothesis_tokens
            ])
            image_names.append(image)
            original_premises.append(premise)
            original_hypotheses.append(hypothesis)

        padded_premises = pad_sequences(padded_premises,
                                        padding="post",
                                        value=token2id["#pad#"],
                                        dtype=np.long)
        padded_hypotheses = pad_sequences(padded_hypotheses,
                                          padding="post",
                                          value=token2id["#pad#"],
                                          dtype=np.long)
        labels = np.array(labels)

    return labels, padded_premises, padded_hypotheses, image_names, original_premises, original_hypotheses
Example 15
def collate_examples(samples):
    words, labels = list(zip(*samples))

    seq_lengths = torch.LongTensor([len(s) for s in words])
    padded_words = pad_sequences(words, seq_lengths)
    padded_labels = pad_sequences(labels, seq_lengths)

    return (
        padded_words,
        padded_labels
    )
Example 16
 def _get_feed_dict(self, words, labels=None, lr=None, is_train=None):
     word_ids, char_ids = zip(*words)
     word_ids, seq_len = pad_sequences(word_ids, max_length=None, pad_tok=0, nlevels=1)
     feed_dict = {self.word_ids: word_ids, self.seq_len: seq_len}
     if self.cfg.use_char_emb:
         char_ids, word_len = pad_sequences(char_ids, max_length=None, pad_tok=0, max_length_2=None, nlevels=2)
         feed_dict[self.char_ids] = char_ids
         feed_dict[self.word_len] = word_len
     if labels is not None:
         feed_dict[self.labels] = labels
     if lr is not None:
         feed_dict[self.lr] = lr
     if is_train is not None:
         feed_dict[self.is_train] = is_train
     return feed_dict
Example 17
def init_data_provider(ngrams=False):
    logging.info('Data provider, initializing: ngrams = {}'.format(ngrams))
    logging.info('Data provider, loading file: ' + STATE['source'])
    with open(STATE['source']) as s:
        data = json.load(s)

    data['jokes'] = data['jokes'][:STATE['max_jokes']]

    random.shuffle(data['jokes'])

    logging.info('Data provider, extracting categories...')
    STATE['classes'] = extract_categories(data['jokes'], STATE['stemmer'])
    logging.info('Data provider, tokenizing data...')
    STATE['data'], STATE['tokenizer'] = \
        (get_data_as_ngrams if ngrams else get_data_as_bag_of_words)(data, STATE['stemmer'], STATE['classes'])

    X, Y = zip(*STATE['data'])
    STATE['X']['hot_vector'] = np.empty((len(X), STATE['tokenizer'].index))
    for i, e in enumerate(X):
        STATE['X']['hot_vector'][i] = to_hot_vector(e, STATE['tokenizer'].index)
    STATE['X']['sequential'] = utils.pad_sequences(X)
    STATE['Y']['categorical'] = np.array([to_categorical(y, STATE['classes'], STATE['stemmer']) for y in Y])
    STATE['Y']['numerical'] = np.array([to_numerical(y, STATE['classes'], STATE['stemmer']) for y in Y])

    STATE['model_params']['input_length']['hot_vector'] = len(STATE['X']['hot_vector'][0])
    STATE['model_params']['input_length']['sequential'] = len(STATE['X']['sequential'][0])
    STATE['model_params']['output_length']['categorical'] = len(STATE['Y']['categorical'][0])
    STATE['model_params']['output_length']['numerical'] = 1

    logging.info(
        'Data provider, finished loading [' + str(len(data['jokes'])) + ' jokes] from file: ' + STATE['source'])
Example 18
def collate_fn(batch):
    x, y = zip(*batch)

    x_lengths = torch.LongTensor([len(item) for item in x])
    padded_x = pad_sequences(x, x_lengths)

    return (padded_x, torch.FloatTensor(np.array(y)))
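The PyTorch collate helpers (Examples 11, 12, 18, 20 and the collate_examples* functions) pass the lengths in explicitly and expect a padded LongTensor back. A minimal torch-based sketch of that interface, assumed from the call sites rather than taken from the original code:

import torch

def pad_sequences(vectorized_seqs, seq_lengths):
    # right-pad with zeros up to the longest sequence in the batch
    padded = torch.zeros(len(vectorized_seqs), int(seq_lengths.max()), dtype=torch.long)
    for i, (seq, length) in enumerate(zip(vectorized_seqs, seq_lengths)):
        padded[i, :int(length)] = torch.as_tensor(seq, dtype=torch.long)
    return padded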
Example 19
def Tokenize_with_note_id_hour(df, max_length, tokenizer):
    labels = df.Label.values
    note_ids = df.Note_ID.values
    times = pd.to_datetime(df.charttime.values)
    times = times - times.min()
    times = times / pd.Timedelta(days=1)
    if 'TEXT' in df.columns:
        sen = df.TEXT.values
        labels = df.Label.values
        sen = ["[CLS] " + x + " [SEP]" for x in sen]
        tokenized_texts = [tokenizer.tokenize(x) for x in sen]
        print("First sentence tokenized")
        print(tokenized_texts[0])
        input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    else:
        assert 'Input_ID' in df.columns
        input_ids = df.Input_ID.apply(lambda x: x.split(' '))
        input_ids = input_ids.apply(lambda x: [int(i) for i in x])
        input_ids = input_ids.values
    input_ids = pad_sequences(input_ids, maxlen=max_length, dtype="long", truncating="post", padding="post")
    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)
    return labels, input_ids, attention_masks, note_ids, times
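Example 19 uses the Keras pad_sequences helper, whose keyword arguments differ from the homegrown versions above. A toy call showing how maxlen, truncating and padding interact, and how the attention mask is derived (the import path is an assumption; the original imports it elsewhere):

from tensorflow.keras.preprocessing.sequence import pad_sequences

input_ids = [[101, 2023, 102],
             [101, 2023, 2003, 1037, 2936, 6251, 102]]
padded = pad_sequences(input_ids, maxlen=5, dtype="long",
                       truncating="post", padding="post")
# padded -> [[ 101 2023  102    0    0]
#            [ 101 2023 2003 1037 2936]]
attention_masks = [[float(i > 0) for i in seq] for seq in padded]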
Example 20
def collate_x(batch):
    x, y = zip(*batch)

    x_lengths = torch.LongTensor([len(item) for item in x])
    padded_x = pad_sequences(x, x_lengths)

    return padded_x, y
Example 21
def test(model, sess):
    reviews, ent2idx, attr2idx, polarity2idx = load_semeval_reviews(
        constants.test_filename)

    # list of (ids, ent, attr, pol)
    tuples = []
    for review in reviews:
        if len(review.tokens) <= 1:
            # ids = [0] + [word2idx[tok] for tok in review.tokens]
            ids = [word2idx[tok] for tok in review.tokens]
        else:
            ids = [word2idx[tok] for tok in review.tokens]
        tuples_ = [(ids, ent2idx[op.ent], attr2idx[op.attr],
                    polarity2idx[op.polarity]) for op in review.opinions]
        tuples.extend(tuples_)

    unzipped = list(zip(*tuples))  # materialize so it can be indexed under Python 3
    ids = utils.pad_sequences(unzipped[0], maxlen=constants.max_sent_len)
    sent_lens = np.array(
        [min(len(x), constants.max_sent_len) for x in unzipped[0]],
        dtype='int32')
    ents, attrs, pols = (np.array(x, dtype='int32') for x in unzipped[1:])

    acc = model.eval(sess, ids, ents, attrs, sent_lens,
                     utils.to_categorical(pols))
    utils.log('test accuracy: {}'.format(acc), True)
Example 22
 def transform(self, X, one_hot=False):
     if one_hot:
         return self.sents_2_onehost(X)
     else:
         poses = self.sents_2_posid(X)
         poses, length = pad_sequences(poses, pad_tok=0)
         return np.array(poses, dtype=np.int32)
Example 23
    def _next_batch(self, dataset, batch_size):
        """

        :param dataset.Dataset dataset:
        :return:
        """
        start = 0
        while start < len(dataset.words):
            w_batch = dataset.words[start:start + batch_size]
            word_ids, _ = pad_sequences(w_batch,
                                        pad_tok=0,
                                        max_sent_length=self.max_length)

            if dataset.labels is not None:
                labels = dataset.labels[start:start + batch_size]
            else:
                labels = None

            start += batch_size
            feed = {self.word_ids: word_ids}
            if labels is not None:
                feed[self.labels] = labels
            yield feed
Example 24
def load_ic_dataset(ic_dataset_filename, token2id, label2id):
    labels = []
    padded_sentences = []
    images_filenames = []
    original_sentences = []

    with open(ic_dataset_filename) as in_file:
        reader = csv.reader(in_file, delimiter="\t")

        for row in reader:
            # each row is of the form:
            # label \t sentence tokens \t image filename \t source \t original sentence
            label = row[0].strip()
            sentence_tokens = row[1].strip().split()
            image_filename = row[2].strip()
            sentence = row[5].strip()
            labels.append(label2id[label])
            padded_sentences.append([
                token2id.get(token, token2id["#unk#"])
                for token in sentence_tokens
            ])
            images_filenames.append(image_filename)
            original_sentences.append(sentence)

        padded_sentences = pad_sequences(padded_sentences,
                                         padding="post",
                                         value=token2id["#pad#"],
                                         dtype=np.long)
        labels = np.array(labels)

    return labels, padded_sentences, images_filenames, original_sentences
Example 25
def load_foil_dataset(filename, token2id, label2id):
    labels = []
    padded_sentences = []
    image_names = []

    with open(filename) as in_file:
        reader = csv.reader(in_file, delimiter="\t")

        for row in reader:
            label = row[0].strip()
            sentence_tokens = row[1].strip().split()
            image = row[2].strip().split("_")[2]
            labels.append(label2id[label])
            padded_sentences.append([
                token2id.get(token, token2id["#unk#"])
                for token in sentence_tokens
            ])
            image_names.append(image)

        padded_sentences = pad_sequences(padded_sentences,
                                         padding="post",
                                         value=token2id["#pad#"],
                                         dtype=np.long)
        labels = np.array(labels)

    return labels, padded_sentences, image_names
Example 26
    def get_feed(self, seqs, labels=None, lr=None, dropout=None):
        word_ids, seq_len_list = pad_sequences(seqs, pad_mark=0)

        feed_dict = {
            self.word_ids: word_ids,
            self.sequence_lengths: seq_len_list
        }
        if labels is not None:
            labels_, _ = pad_sequences(labels, pad_mark=0)
            feed_dict[self.labels] = labels_
        if lr is not None:
            feed_dict[self.lr_pl] = lr
        if dropout is not None:
            feed_dict[self.dropout_pl] = dropout

        return feed_dict, seq_len_list
Example 27
def _input_data(wavfiles):

    textfiles = [file.replace('.wav', '.txt') for file in wavfiles]
    audio = []
    audio_len = []
    transcript = []
    transcript_len = []

    for target_filename, audio_filename in zip(textfiles, wavfiles):

        Sxx = preprocess_audio_spectrogram(audio_filename)
        inputs_data = 20 * np.log10(Sxx).T.astype('float32')
        inputs_data = (inputs_data - np.mean(inputs_data, axis=0)) / np.std(
            inputs_data, axis=0)
        audio.append(inputs_data)
        audio_len.append(np.int32(len(inputs_data)))
        # Readings targets
        # load text transcription and convert to numerical array
        targets = normalize_txt_file(target_filename)
        targets = text_to_char_array(targets)
        transcript.append(targets)
        transcript_len.append(len(targets))

    # dtype=object because utterances and transcripts have different lengths (ragged)
    audio = np.asarray(audio, dtype=object)
    audio_len = np.asarray(audio_len)
    transcript = np.asarray(transcript, dtype=object)
    transcript_len = np.asarray(transcript_len)

    train_inputs, train_seq_len = pad_sequences(audio)
    # Creating sparse representation to feed the placeholder
    train_targets = sparse_tuple_from(transcript)

    return train_inputs, train_targets, train_seq_len
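Example 27 also relies on sparse_tuple_from to build CTC targets. A sketch of the usual (indices, values, dense_shape) construction that a tf.SparseTensor placeholder expects; this is an assumed reference implementation, not code from this repository:

import numpy as np

def sparse_tuple_from(sequences, dtype=np.int32):
    # COO-style indices/values for a batch of variable-length label sequences
    indices, values = [], []
    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)
    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), max(len(seq) for seq in sequences)],
                       dtype=np.int64)
    return indices, values, shape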
Example 28
    def pad_sequence(self, word_ids, char_ids, labels=None):
        if labels:
            labels, _ = pad_sequences(labels, 0)
            labels = np.asarray(labels)
            labels = dense_to_one_hot(labels, len(self.vocab_tag), nlevels=2)

        word_ids, sequence_lengths = pad_sequences(word_ids, 0)
        word_ids = np.asarray(word_ids)

        if self.char_feature:
            char_ids, word_lengths = pad_sequences(char_ids,
                                                   pad_tok=0,
                                                   nlevels=2)
            char_ids = np.asarray(char_ids)
            return [word_ids, char_ids], labels
        else:
            return [word_ids], labels
Example 29
 def preprocess_input(self, state):
     new_input = []
     for l in state:
         new_input.append(letter_dict[l])
     state = pad_sequences([new_input], maxlen=self.maxlen)
     if self.is_training:
         self.episode_memory.append((state, self.get_guessed_mat()))
     return state, self.get_guessed_mat()
Example 30
def collate_examples_multiple_tags(samples):
    examples, labels = list(zip(*samples))

    words = list()
    chars = list()
    bos = list()
    for e in examples:
        words.append(e[0])
        chars.append(e[1])
        bos.append(e[2])

    seq_lengths = torch.LongTensor([len(s) for s in words])
    padded_words = pad_sequences(words, seq_lengths)

    padded_chars = list()
    for char_list in chars:
        chars_seq_lengths = torch.LongTensor([len(s) for s in char_list])
        padded_chars.append(pad_sequences(char_list, chars_seq_lengths))

    padded_bos = list()
    for bos_list in bos:
        bos_seq_lengths = torch.LongTensor([len(s) for s in bos_list])
        padded_bos.append(pad_sequences(bos_list, bos_seq_lengths))

    tags_to_produce = set()
    for example in labels:
        tags_to_produce.update(example.keys())

    labels_splitted = defaultdict(list)
    for tag in tags_to_produce:
        for example in labels:
            if tag in example:
                labels_splitted[tag].append(example[tag])
            else:
                labels_splitted[tag].append([0])

    padded_labels = dict()
    for label, tags in labels_splitted.items():
        padded_labels[label] = pad_sequences(tags, seq_lengths)
        tags_to_produce.add(label)

    return (
        (padded_words, padded_chars, padded_bos, tags_to_produce),
        padded_labels
    )
Example 31
def select_model():
  """Use validation set to tune"""

  tr_texts, tr_labels = datareader.DirDataReader.read(
    os.path.join(base, cfg.get('data', 'train')),
    label2int)

  tr_texts, val_texts, tr_labels, val_labels = train_test_split(
    tr_texts, tr_labels, test_size=0.20, random_state=2020)

  tok = tokenizer.Tokenizer(cfg.getint('data', 'vocab_size'))
  tok.fit_on_texts(tr_texts)

  tr_texts = tok.texts_as_sets_to_seqs(tr_texts)
  val_texts = tok.texts_as_sets_to_seqs(val_texts)

  # todo: what's up with max length?

  train_loader = make_data_loader(
    utils.pad_sequences(tr_texts),
    tr_labels,
    cfg.getint('model', 'batch_size'),
    'train')
  val_loader = make_data_loader(
    utils.pad_sequences(val_texts),
    val_labels,
    cfg.getint('model', 'batch_size'),
    'dev')
  print('loaded %d training and %d validation samples' % \
        (len(tr_texts), len(val_texts)))

  model = TransformerClassifier()

  label_counts = torch.bincount(torch.tensor(tr_labels))
  weights = len(tr_labels) / (2.0 * label_counts)

  best_roc, optimal_epochs = fit(
    model,
    train_loader,
    val_loader,
    weights,
    cfg.getint('model', 'n_epochs'))
  print('roc auc %.3f after %d epochs' % (best_roc, optimal_epochs))

  return optimal_epochs
Example 32
 def getBatch(self, batchIndex):
     batchX = []
     batchY = []
     startKeyIndex = batchIndex * self.batchSize
     endKeyIndex = (batchIndex + 1) * self.batchSize
     for i in range(startKeyIndex, endKeyIndex, 1):
         j = i % len(self.keyList)
         if j == 0:
             self.completedEpoch += 1
         sentenceX = self.dataFile[self.keyList[j] + "/input"]
         sentenceY = self.dataFile[self.keyList[j] + "/label"]
         batchX.append(sentenceX)
         batchY.append(sentenceY)
     batchX, _ = pad_sequences(batchX, maxlen=self.maxTimeStep)
     batchY, _ = pad_sequences(batchY, maxlen=self.maxTimeStep)
     return (batchX, batchY)
Example 33
    # Initialize the weights and biases
    tf.global_variables_initializer().run()


    for curr_epoch in range(num_epochs):
        train_cost = train_ler = 0
        start = time.time()

        for batch in range(num_batches_per_epoch):

            # Getting the index
            indexes = [i % num_examples for i in range(batch * batch_size, (batch + 1) * batch_size)]

            batch_train_inputs = train_inputs[indexes]
            # Padding input to max_time_step of this batch
            batch_train_inputs, batch_train_seq_len = pad_sequences(batch_train_inputs)

            # Converting to sparse representation so as to feed the SparseTensor input
            batch_train_targets = sparse_tuple_from(train_targets[indexes])

            feed = {inputs: batch_train_inputs,
                    targets: batch_train_targets,
                    seq_len: batch_train_seq_len}

            batch_cost, _ = session.run([cost, optimizer], feed)
            train_cost += batch_cost*batch_size
            train_ler += session.run(ler, feed_dict=feed)*batch_size


        # Shuffle the data
        shuffled_indexes = np.random.permutation(num_examples)