Example #1
def loaddatawithtokenize(i=0, nb_words=20000, start_char=1, oov_char=2, index_from=3, withraw=False, datalen=500):
    (traindata,adversarialdata,testdata,numclass) = loaddata(i)
    rawtrain = traindata.content[:]
    rawadversarial = adversarialdata.content[:]
    rawtest = testdata.content[:]
    tokenizer = Tokenizer(lower=True)
    tokenizer.fit_on_texts(traindata.content + testdata.content)
    adversarialdata.content = tokenizer.texts_to_sequences(adversarialdata.content)
    traindata.content = tokenizer.texts_to_sequences(traindata.content)
    testdata.content  = tokenizer.texts_to_sequences(testdata.content)
    
    if start_char is None:
        adversarialdata.content = [[w + index_from for w in x] for x in adversarialdata.content]
        traindata.content = [[w + index_from for w in x] for x in traindata.content]
        testdata.content = [[w + index_from for w in x] for x in testdata.content]
    else:
        adversarialdata.content = [[start_char]+[w + index_from for w in x] for x in adversarialdata.content]
        traindata.content = [[start_char]+[w + index_from for w in x] for x in traindata.content]
        testdata.content = [[start_char]+[w + index_from for w in x] for x in testdata.content]

    adversarialdata.content = [[w if w < nb_words else oov_char for w in x] for x in adversarialdata.content]
    traindata.content = [[w if w < nb_words else oov_char for w in x] for x in traindata.content]
    testdata.content = [[w if w < nb_words else oov_char for w in x] for x in testdata.content]
    
    adversarialdata.content = pad_sequences(adversarialdata.content, maxlen=datalen)
    traindata.content = pad_sequences(traindata.content, maxlen=datalen)
    testdata.content = pad_sequences(testdata.content, maxlen=datalen)

    if withraw:
        return traindata,adversarialdata,testdata,tokenizer,numclass,rawtrain,rawadversarial,rawtest
    else:
        return traindata,adversarialdata,testdata,tokenizer,numclass
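A minimal usage sketch (assumed: `loaddata`, the custom `Tokenizer`, and `pad_sequences` are importable from the surrounding module, and dataset index 0 exists):

# Hypothetical call: dataset 0, 20k-word vocabulary, documents padded/truncated to 500 tokens.
traindata, adversarialdata, testdata, tokenizer, numclass = loaddatawithtokenize(
    i=0, nb_words=20000, datalen=500)
print(numclass, traindata.content.shape)  # content is now an (n_samples, 500) int matrix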
Example #2
def predict_splitter(X,batchsize):
    X_bucket=[]
    data_set = [[] for _ in _buckets]
    tot_index=np.zeros(0)
    for i, x in enumerate(X):
        for b_id, bucket_len in enumerate(_buckets):
            if len(x) <= bucket_len:
                data_set[b_id].append(i)
                break
    
    for b_id, bucket_indices in enumerate(data_set):
        if len(bucket_indices) == 0:
            continue
        n_samples = len(bucket_indices)
        x_index = np.array(bucket_indices)
        tot_index = np.concatenate([tot_index, x_index], axis=0)

        index = np.arange(n_samples)
        X_prime = X[x_index[index]]
        orf_prime=batchfindORF(X_prime)
        orf_prime=preprocessing.pad_sequences(orf_prime,maxlen=_buckets[b_id])
        orf_prime=(np.arange(orf_prime.max()+1) == orf_prime[:,:,None]).astype(dtype='float32')
        orf_prime=np.delete(orf_prime,0,axis=-1)
        X_prime=preprocessing.pad_sequences(X_prime,maxlen=_buckets[b_id])
        X_prime=(np.arange(X_prime.max()+1) == X_prime[:,:,None]).astype(dtype='float32') #one_hot
        X_prime=np.delete(X_prime,0,axis=-1)
        X_bucket.append([X_prime,orf_prime])
    
    
    return X_bucket,tot_index
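A hedged sketch of consuming the bucketed output; `X_test`, the module-level `_buckets` list, and the trained two-input Keras `model` are assumptions not shown in the snippet:

X_bucket, tot_index = predict_splitter(X_test, batchsize=32)
# predict bucket by bucket, then restore the original sample order via tot_index
preds = np.concatenate([model.predict(batch) for batch in X_bucket], axis=0)
preds = preds[np.argsort(tot_index)]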
Example #3
def bucket_generator_ORF(X, Y, batchsize):
    data_set = [[] for _ in _buckets]
    for i, x in enumerate(X):
        for b_id, bucket_len in enumerate(_buckets):
            if len(x) <= bucket_len:
                data_set[b_id].append(i)
                break
    k = 0
    # number of batches per bucket, counting the final partial batch
    len_set = [
        len(bucket) // batchsize + (1 if len(bucket) % batchsize else 0)
        for bucket in data_set
    ]
    tot_batch = sum(len_set)
    while True:
        if k % tot_batch == 0:
            k = 0
            shuffled_batch = np.arange(tot_batch)  #batches shuffle
            shuffled_data_set = []
            for data in data_set:
                b_data = data
                shuffle(b_data)
                shuffled_data_set.append(b_data)  #bucket shuffle

            np.random.shuffle(shuffled_batch)

        cur_batch = shuffled_batch[k]
        for s_i, l_b in enumerate(len_set):
            if cur_batch < l_b:
                batch_index = np.array(
                    shuffled_data_set[s_i][cur_batch *
                                           batchsize:(cur_batch + 1) *
                                           batchsize])
                X_batch = X[batch_index]
                orf_batch = batchfindORF(X_batch)
                orf_batch = preprocessing.pad_sequences(orf_batch,
                                                        maxlen=_buckets[s_i])
                orf_batch = (np.arange(orf_batch.max() +
                                       1) == orf_batch[:, :, None]).astype(
                                           dtype='float32')
                orf_batch = np.delete(orf_batch, 0, axis=-1)
                X_batch = preprocessing.pad_sequences(X_batch,
                                                      maxlen=_buckets[s_i])
                X_batch = (np.arange(X_batch.max() +
                                     1) == X_batch[:, :, None]).astype(
                                         dtype='float32')
                X_batch = np.delete(X_batch, 0, axis=-1)
                Y_batch = Y[batch_index]
                #Y_batch=(np.arange(2)==Y_batch).astype(dtype='float32')

                yield [X_batch, orf_batch], Y_batch
                break
            else:
                cur_batch -= l_b
        k += 1
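A sketch of wiring the generator into Keras training; `model`, `X_train`, and `Y_train` are assumptions, and the batch count simply mirrors the generator's own per-bucket counting so that `steps_per_epoch` matches its `tot_batch`:

from math import ceil

batchsize = 64
bucket_counts = [0] * len(_buckets)
for x in X_train:
    for b_id, bucket_len in enumerate(_buckets):
        if len(x) <= bucket_len:
            bucket_counts[b_id] += 1
            break
steps = sum(int(ceil(c / float(batchsize))) for c in bucket_counts)

model.fit_generator(bucket_generator_ORF(X_train, Y_train, batchsize),
                    steps_per_epoch=steps, epochs=10)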
Example #4
def findORF(seq):
    orflen=0
    orf=""
    o_s=0
    o_e=0
    length=len(seq)
    seq=[seq]
    seq=preprocessing.pad_sequences(seq,maxlen=length,padding='post')
    seq=(np.arange(seq.max()+1) == seq[:,:,None]).astype(dtype='float32')
    seq=np.delete(seq,0,axis=-1)
    if seq.shape[2]==3:
        zeros_col = np.zeros((seq.shape[0],seq.shape[1],1))
        seq = np.concatenate((seq,zeros_col),axis=2)
    for frame in range(3):
        tseq=stopmodel.predict(seq[:,frame:])[:,:(length-frame)//3]
        tseq=np.argmax(tseq,axis=-1)-1
        sseq=np.append(-1,np.where(tseq==1)[1])
        sseq=np.append(sseq,tseq.shape[1])
        lseq=np.diff(sseq)-1
        flenp=np.argmax(lseq)
        flen=lseq[flenp]
        n_s=frame+3*sseq[flenp]+3
        n_e=frame+3*sseq[flenp+1]
        
        if flen>orflen or ((orflen==flen) and n_s<o_s):
            orflen=flen
            o_s=n_s
            o_e=n_e
                
    return o_s,o_e
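The `(np.arange(a.max() + 1) == a[:, :, None])` expression used here and in the other examples is a broadcast one-hot encoding; a small self-contained illustration (the array values are made up for the demo):

import numpy as np

seq = np.array([[1, 2, 3, 0]])                            # 0 is the padding index
one_hot = (np.arange(seq.max() + 1) == seq[:, :, None]).astype('float32')
print(one_hot.shape)                                      # (1, 4, 4): batch x length x classes 0..3
one_hot = np.delete(one_hot, 0, axis=-1)                  # drop the padding channel, as the snippets do
print(one_hot.shape)                                      # (1, 4, 3)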
Example #5
    def __init__(self, opts):

        if opts.task == 'POS_models':
            data_dir = 'data/pos_data'
            self.kfold = False
        elif opts.jackknife:
            data_dir = 'data/super_data'
            jk_data_dir = 'data/pos_data'
            path_to_k_fold = os.path.join(jk_data_dir, 'train_y.txt')
            path_to_k_fold_test = os.path.join(jk_data_dir, 'test_y.txt')
            self.kfold = True

        else:
            data_dir = 'data/super_data'
            self.kfold = False

        path_to_text = os.path.join(data_dir, 'train_x.txt')
        path_to_text_test = os.path.join(data_dir, 'test_x.txt')
        path_to_POS = os.path.join(data_dir, 'train_y.txt')
        path_to_POS_test = os.path.join(data_dir, 'test_y.txt')

        self.MAX_NB_WORDS = 200000000000

        # first, build index mapping words in the embeddings set
        # to their embedding vector

        f_train = open(path_to_text)
        f_test = open(path_to_text_test)

        texts = f_train.readlines()
        nb_train_samples = len(texts)
        self.nb_train_samples = nb_train_samples
        texts = texts + f_test.readlines()

        f_train.close()
        f_test.close()

        print('length', len(texts))


        # finally, vectorize the text samples into a 2D integer tensor
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)
        indicator = tokenizer.cap_indicator(texts)
        num_indicator = tokenizer.num_indicator(texts)
        suffix = tokenizer.suffix_extract(texts)
        suffix_tokenizer = Tokenizer()
        suffix_tokenizer.fit_on_texts(suffix, non_split=True)

        suffix_sequences = suffix_tokenizer.texts_to_sequences(suffix,
                                                               non_split=True)
        # debugging
        #        for i in xrange(len(sequences)):
        #            assert len(sequences[i]) == len(suffix_sequences[i])

        word_index = tokenizer.word_index
        self.word_index = word_index

        suffix_index = suffix_tokenizer.word_index
        print('Found %s unique words.' % len(word_index))
        data = pad_sequences(sequences, opts, True)
        suffix_data = pad_sequences(suffix_sequences, opts)
        cap_indicator = pad_sequences(indicator, opts)
        num_indicator = pad_sequences(num_indicator, opts)

        f_train = open(path_to_POS)
        f_test = open(path_to_POS_test)
        texts = f_train.readlines() + f_test.readlines()
        f_train.close()
        f_test.close()
        lab_tokenizer = Tokenizer()
        lab_tokenizer.fit_on_texts(texts)
        lab_sequences = lab_tokenizer.texts_to_sequences(texts)
        tag_index = lab_tokenizer.word_index
        self.tag_index = tag_index
        self.tag_size = len(tag_index)
        print('Found %s unique tags.' % len(tag_index))
        labels = pad_sequences(lab_sequences, opts)
        #labels = np.expand_dims(labels, -1)  do not need it for tensorflow

        if opts.jackknife:

            f_train = open(path_to_k_fold)
            f_test = open(path_to_k_fold_test)
            texts = f_train.readlines() + f_test.readlines()
            f_train.close()
            f_test.close()
            jk_tokenizer = Tokenizer()
            jk_tokenizer.fit_on_texts(texts)
            jk_sequences = jk_tokenizer.texts_to_sequences(texts)
            jk_index = jk_tokenizer.word_index
            self.jk_index = jk_index
            self.jk_size = len(jk_index)
            print('Found %s unique jackknife tags.' % len(jk_index))
            jk_labels = pad_sequences(jk_sequences, opts)

        indices = np.arange(nb_train_samples)
        np.random.shuffle(indices)

        nb_validation_samples = data.shape[0] - nb_train_samples
        self.nb_validation_samples = nb_validation_samples
        ### define zero matrix first for splitting

        seq_length = labels.shape[1]
        if opts.attention in [100, 101, 102, 103]:
            self.nb_train_added = nb_train_samples // 10 * 10 + 10
            self.nb_validation_added = nb_validation_samples // 10 * 10 + 10
        else:
            self.nb_train_added = nb_train_samples
            self.nb_validation_added = nb_validation_samples

        self.X_train = np.zeros([self.nb_train_added, seq_length])
        self.X_train[:nb_train_samples] = data[:-nb_validation_samples][indices]
        if opts.jackknife:
            self.jk_labels = np.zeros([self.nb_train_added, seq_length])
            self.jk_labels[:nb_train_samples] = jk_labels[indices]
            self.jk_labels_test = np.zeros([self.nb_validation_added, seq_length])
            self.jk_labels_test[:nb_validation_samples] = jk_labels[-nb_validation_samples:]

        self.train_cap_indicator = np.zeros([self.nb_train_added, seq_length])
        self.train_cap_indicator[:nb_train_samples] = cap_indicator[:-nb_validation_samples][indices]
        self.train_num_indicator = np.zeros([self.nb_train_added, seq_length])
        self.train_num_indicator[:nb_train_samples] = num_indicator[:-nb_validation_samples][indices]
        self.suffix_train = np.zeros([self.nb_train_added, seq_length])
        self.suffix_train[:nb_train_samples] = suffix_data[:-nb_validation_samples][indices]
        self.y_train = np.zeros([self.nb_train_added, seq_length])
        self.y_train[:nb_train_samples] = labels[:-nb_validation_samples][indices]
        if opts.joint:
            self.pos_train = self.jk_labels
        self.X_test = np.zeros([self.nb_validation_added, seq_length])
        self.X_test[:nb_validation_samples] = data[-nb_validation_samples:]
        self.test_cap_indicator = np.zeros([self.nb_validation_added, seq_length])
        self.test_cap_indicator[:nb_validation_samples] = cap_indicator[-nb_validation_samples:]
        self.test_num_indicator = np.zeros([self.nb_validation_added, seq_length])
        self.test_num_indicator[:nb_validation_samples] = num_indicator[-nb_validation_samples:]
        self.suffix_test = np.zeros([self.nb_validation_added, seq_length])
        self.suffix_test[:nb_validation_samples] = suffix_data[-nb_validation_samples:]
        self.y_test = np.zeros([self.nb_validation_added, seq_length])
        self.y_test[:nb_validation_samples] = labels[-nb_validation_samples:]
        if opts.joint:
            self.pos_test = self.jk_labels_test

        if opts.jackknife:
            K = 10
            #k_fold_samples = nb_train_samples//K*K
            samples_per_group = (nb_train_samples // K) + 1

            print('splitting into {} folds'.format(K))

            ## don't get rid of the remainders. We will save all of them

            #            self.X_train = self.X_train[:k_fold_samples] # get rid of the remaining examples for kfold
            #            self.train_cap_indicator = self.train_cap_indicator[:k_fold_samples]
            #            self.train_num_indicator = self.train_num_indicator[:k_fold_samples]
            #            self.suffix_train = self.suffix_train[:k_fold_samples]
            #
            #            self.X_train_k_fold = np.split(self.X_train[:k_fold_samples], K)
            #            self.train_cap_indicator_k_fold = np.split(self.train_cap_indicator[:k_fold_samples], K)
            #            self.train_num_indicator_k_fold = np.split(self.train_num_indicator[:k_fold_samples], K)
            #            self.suffix_train_k_fold = np.split(self.suffix_train[:k_fold_samples], K)
            #            self.y_train_k_fold = np.split(self.jk_labels[:k_fold_samples], K)
            #
            # adding everything back

            self.X_train_k_fold = []
            self.train_cap_indicator_k_fold = []
            self.train_num_indicator_k_fold = []
            self.suffix_train_k_fold = []
            self.y_train_k_fold = []

            for k in xrange(K):
                start = samples_per_group * k
                end = samples_per_group * (k + 1)
                self.X_train_k_fold.append(self.X_train[start:end])
                self.train_cap_indicator_k_fold.append(self.train_cap_indicator[start:end])
                self.train_num_indicator_k_fold.append(self.train_num_indicator[start:end])
                self.suffix_train_k_fold.append(self.suffix_train[start:end])
                self.y_train_k_fold.append(self.jk_labels[start:end])
                #if opts.joint:
                #    self.pos_train = self.pos_train[:k_fold_samples]

            print('end splitting')

        self.nb_suffix = len(suffix_index)
        self.suffix_embedding_mat = np.random.randn(self.nb_suffix + 1, 10)
        self.nb_words = min(self.MAX_NB_WORDS, len(word_index))
        ## cond entropy
        self.cond_matrix = np.ones((self.nb_words + 1, 1))
        with open('certain.pkl') as fhand:
            certain = pickle.load(fhand)
        self.certain_words = []
        for certain_word in certain:
            self.certain_words.append(self.word_index[certain_word])
        for certain_word in self.certain_words:
            self.cond_matrix[certain_word] = 0.0
        ### cond entropy ends
        if opts.embedding_name == 'random':
            np.random.seed(opts.seed)
            self.embedding_matrix = np.random.uniform(
                -2, 2, size=(self.nb_words + 1, opts.embedding_dim))
        elif opts.embedding_name == 'word2vec':
            if opts.embedding_dim != 300:  # word2vec vectors are 300-dimensional
                sys.exit('error in dim')
            filename = os.path.join('../word2vec',
                                    'GoogleNews-vectors-negative300.bin')

            import gensim

            self.embedding_matrix = np.zeros(
                (self.nb_words + 1, opts.embedding_dim))
            self.word2vec_model = gensim.models.word2vec.Word2Vec.load_word2vec_format(
                filename, binary=True)
            print('Found %s word vectors.' % len(self.word2vec_model.vocab))
            for word, i in word_index.items():
                # keep only words within the MAX_NB_WORDS budget that have a word2vec vector
                if i <= self.MAX_NB_WORDS and word in self.word2vec_model.vocab:
                    self.embedding_matrix[i] = self.word2vec_model[word]
        else:
            self.embeddings_index = {}
            print('Indexing word vectors.')
            f = open(opts.embedding_name)
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                self.embeddings_index[word] = coefs
            f.close()

            print('Found %s word vectors.' % len(self.embeddings_index))

            self.embedding_matrix = np.zeros(
                (self.nb_words + 1, opts.embedding_dim))
            for word, i in word_index.items():
                if i > self.MAX_NB_WORDS:
                    continue
                embedding_vector = self.embeddings_index.get(word)
                if embedding_vector is not None:
                    # words not found in embedding index will be all-zeros.
                    if not self.embedding_matrix.shape[1] == len(
                            embedding_vector):
                        sys.exit('error in dim')

                    self.embedding_matrix[i] = embedding_vector

        # load pre-trained word embeddings into an Embedding layer

        self._index_in_epoch = 0
        self._num_examples = self.X_train.shape[0]
        self._num_test_examples = self.X_test.shape[0]
        self._epoch_completed = 0
        self._index_in_test = 0
        if opts.jackknife:
            #            self._num_hold_in_examples = self.X_train_k_fold[0].shape[0]*(K-1)
            #            self._num_hold_out_examples = self.X_train_k_fold[0].shape[0]
            self.k = 0
        self.opts = opts
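A minimal sketch of how the jackknife folds built above might be consumed; `loader` (an instance of this class) and the 10-fold training loop are assumptions, not part of the original code:

K = 10
for k in range(K):
    X_hold_out = loader.X_train_k_fold[k]
    X_hold_in = np.concatenate(
        loader.X_train_k_fold[:k] + loader.X_train_k_fold[k + 1:], axis=0)
    y_hold_in = np.concatenate(
        loader.y_train_k_fold[:k] + loader.y_train_k_fold[k + 1:], axis=0)
    # train a POS model on (X_hold_in, y_hold_in), then predict tags for X_hold_out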
Example #6
train_label = train_data_df.iloc[:, -1].values
dev_data = dev_data_df.iloc[:, -2].values
dev_label = dev_data_df.iloc[:, -1].values
test_data = test_data_df.iloc[:, -1].values

# Load the vocabulary and the word vectors
pretrained_embedding_file_path = base_path + "/glove/glove.6B.50d.txt"
word2idx, embedding_matrix = load_pretrained_embedding(
    pretrained_embedding_file_path=pretrained_embedding_file_path)

# Vectorize the texts (convert tokens to ids)
train_data = texts_convert_to_ids(train_data, word2idx)
dev_data = texts_convert_to_ids(dev_data, word2idx)
test_data = texts_convert_to_ids(test_data, word2idx)

train_data = torch.from_numpy(pad_sequences(train_data))
dev_data = torch.from_numpy(pad_sequences(dev_data))
test_data = torch.from_numpy(pad_sequences(test_data))


# Generate batch data
class my_dataset(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        item_data = self.data[item]
        item_label = self.label[item]
        return item_data, item_label
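A hedged usage sketch for `my_dataset` with PyTorch's `DataLoader`; the label tensor conversion and the batch size are assumptions:

from torch.utils.data import DataLoader

train_set = my_dataset(train_data, torch.from_numpy(train_label))
train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
for batch_data, batch_label in train_loader:
    pass  # feed batch_data / batch_label to the model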
Example #7
        print('Epoch {}'.format(epoch))
        training_loss = 0.0

        predicted_values_train = []
        true_values_train = []

        predicted_values_test = []
        true_values_test = []

        for batch_idx, ex in enumerate(train_data):
            tokenized_tweets = tokenize_tweets2(ex['tokenized_tweet'],
                                                text_preprocessor)

            indexed_labels = [labels_dict[i] for i in ex['label']]

            tensor_dictionary = pad_sequences(indexed_labels, tokenized_tweets,
                                              dictionary)

            data = tensor_dictionary['tweet_tensor'].cuda()
            label = tensor_dictionary['label_tensor'].cuda()
            model.cuda().train()

            predictions = model(data, tensor_dictionary['length'])
            predictions = F.log_softmax(predictions, dim=1)

            #indexed_labels = torch.LongTensor([labels_dict[i] for i in ex['label']]).cuda()
            loss = F.nll_loss(predictions, label)

            training_loss += loss.data
            # optimizer = optim.SGD(filter(lambda x: x.requires_grad, model.parameters()), lr=0.001,  momentum=0.9, nesterov = True)
            optimizer = optim.Adam(filter(lambda x: x.requires_grad,
                                          model.parameters()),
Example #8
    def __init__(self, opts, test_opts=None):
        path_to_text = opts.text_train
        path_to_tag = opts.tag_train
        path_to_jk = opts.jk_train
        if test_opts is None:
            path_to_text_test = opts.text_test
            path_to_tag_test = opts.tag_test
            path_to_jk_test = opts.jk_test
        else:
            path_to_text_test = test_opts.text_test
            path_to_tag_test = test_opts.tag_test
            path_to_jk_test = test_opts.jk_test

        self.inputs_train = {}
        self.inputs_test = {}

        ## indexing sents files
        f_train = io.open(path_to_text, encoding='utf-8')
        texts = f_train.readlines()
        self.nb_train_samples = len(texts)
        f_train.close()
        tokenizer = Tokenizer(lower=True)
        tokenizer.fit_on_texts(texts)
        #print(tokenizer.word_index['-unseen-'])
        self.word_index = tokenizer.word_index
        sorted_freqs = tokenizer.sorted_freqs
        self.nb_words = len(self.word_index)
        print('Found {} unique lowercased words including -unseen-.'.format(self.nb_words))

        # lookup the glove word embeddings
        # need to reserve indices for testing file. 
        glove_size = opts.embedding_dim
        self.embeddings_index = {}
        print('Indexing word vectors.')
        #f = open('glovevector/glove.6B.{}d.txt'.format(glove_size))
        f = io.open(opts.word_embeddings_file, encoding='utf-8')
        for line in f:
            values = line.strip().split(' ')
            if len(values) == opts.embedding_dim+1:
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                self.embeddings_index[word] = coefs
        f.close()

        print('Found {} word vectors.'.format(len(self.embeddings_index)))

        unseens = list(set(self.embeddings_index.keys()) - set(self.word_index.keys())) ## list of words that appear in glove but not in the training set
        nb_unseens = len(unseens)
        print('Found {} words not in the training set'.format(nb_unseens))

        self.word_embeddings = np.zeros((self.nb_words+1+nb_unseens, glove_size)) ## +1 for padding (idx 0)

        ## Get Frequencies for Adversarial Training (Yasunaga et al. 2017)
        self.word_freqs = np.zeros([self.nb_words+1+nb_unseens])
        self.word_freqs[1:self.nb_words] = sorted_freqs ## Skip Zero Padding (Index 0)
        self.word_freqs = self.word_freqs.astype(np.float32)
        self.word_freqs = self.word_freqs/np.sum(self.word_freqs)
        for word, i in self.word_index.items(): ## first index the words in the training set
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None: ## otherwise zero vector
                self.word_embeddings[i] = embedding_vector
        for unseen in unseens:
            self.word_index[unseen] = len(self.word_index) + 1 ## add unseen words to the word_index dictionary
            self.word_embeddings[self.word_index[unseen]] = self.embeddings_index[unseen]
        self.idx_to_word = invert_dict(self.word_index)
        print('end glove indexing')
        f_test = io.open(path_to_text_test, encoding='utf-8')
        texts = texts +  f_test.readlines()
        self.nb_validation_samples = len(texts) - self.nb_train_samples
        f_test.close()
        text_sequences = tokenizer.texts_to_sequences(texts)
        #print(map(lambda x: self.idx_to_word[x], text_sequences[self.nb_train_samples]))
        self.inputs_train['words'] = text_sequences[:self.nb_train_samples]
        self.inputs_test['words'] = text_sequences[self.nb_train_samples:]
        ## indexing sents files ends
        ## indexing suffixes 
        if opts.suffix_dim > 0:
            suffix = tokenizer.suffix_extract(texts)
            suffix_tokenizer = Tokenizer()
            suffix_tokenizer.fit_on_texts(suffix[:self.nb_train_samples], non_split=True)
            self.suffix_index = suffix_tokenizer.word_index
            self.nb_suffixes = len(self.suffix_index)
            sorted_freqs = suffix_tokenizer.sorted_freqs
            self.suffix_freqs = np.zeros([self.nb_suffixes+1]).astype(np.float32) ## +1 for zero padding
            self.suffix_freqs[1:self.nb_suffixes] = sorted_freqs ## Skip Zero Padding (Index 0)
            self.suffix_freqs = self.suffix_freqs/np.sum(self.suffix_freqs)
            self.idx_to_suffix = invert_dict(self.suffix_index)
            print('Found {} unique suffixes including -unseen-.'.format(self.nb_suffixes))
            suffix_sequences = suffix_tokenizer.texts_to_sequences(suffix, non_split=True)
            #print(map(lambda x: self.idx_to_suffix[x], suffix_sequences[self.nb_train_samples]))
            self.inputs_train['suffix'] = suffix_sequences[:self.nb_train_samples]
            self.inputs_test['suffix'] = suffix_sequences[self.nb_train_samples:]
            ## indexing suffixes ends
        ## indexing capitalization 
        if opts.cap:
            cap_sequences = tokenizer.cap_indicator(texts)
            #print(cap_sequences[self.nb_train_samples])
            self.inputs_train['cap'] = cap_sequences[:self.nb_train_samples]
            self.inputs_test['cap'] = cap_sequences[self.nb_train_samples:]
            ## indexing capitalization ends
            ## indexing numbers
        if opts.num:
            num_sequences = tokenizer.num_indicator(texts)
            #print(num_sequences[self.nb_train_samples])
            self.inputs_train['num'] = num_sequences[:self.nb_train_samples]
            self.inputs_test['num'] = num_sequences[self.nb_train_samples:]
            ## indexing numbers ends
        ## indexing jackknife files
        if opts.jk_dim > 0:
            f_train = io.open(path_to_jk, encoding='utf-8')
            texts = f_train.readlines()
            f_train.close()
            tokenizer = Tokenizer(lower=False) 
            tokenizer.fit_on_texts(texts)
            self.jk_index = tokenizer.word_index
            self.nb_jk = len(self.jk_index)
            sorted_freqs = tokenizer.sorted_freqs
            self.jk_freqs = np.zeros([self.nb_jk+1]).astype(np.float32) ## +1 for zero padding
            self.jk_freqs[1:self.nb_jk] = sorted_freqs ## Skip Zero Padding (Index 0)
            self.jk_freqs = self.jk_freqs/np.sum(self.jk_freqs)
            self.idx_to_jk = invert_dict(self.jk_index)
            print('Found {} unique tags including -unseen-.'.format(self.nb_jk))
            f_test = io.open(path_to_jk_test, encoding='utf-8')
            texts = texts + f_test.readlines() ## do not lowercase tCO
            f_test.close()
            jk_sequences = tokenizer.texts_to_sequences(texts)
            #print(map(lambda x: self.idx_to_jk[x], jk_sequences[self.nb_train_samples]))
            self.inputs_train['jk'] = jk_sequences[:self.nb_train_samples]
            self.inputs_test['jk'] = jk_sequences[self.nb_train_samples:]
            ## indexing jackknife files ends
        ## indexing char files
        if opts.chars_dim > 0:
            f_train = io.open(path_to_text, encoding='utf-8')
            texts = f_train.readlines()
            f_train.close()
            tokenizer = Tokenizer(lower=False,char_encoding=True) 
            tokenizer.fit_on_texts(texts)
            self.char_index = tokenizer.word_index
            self.nb_chars = len(self.char_index)
            sorted_freqs = tokenizer.sorted_freqs
            self.char_freqs = np.zeros([self.nb_chars+1]).astype(np.float32) ## +1 for zero padding
            self.char_freqs[1:self.nb_chars] = sorted_freqs ## Skip Zero Padding (Index 0)
            self.char_freqs = self.char_freqs/np.sum(self.char_freqs)
            self.idx_to_char = invert_dict(self.char_index)
            print('Found {} unique characters including -unseen-.'.format(self.nb_chars))
            f_test = io.open(path_to_text_test, encoding='utf-8')
            texts = texts + f_test.readlines() ## do not lowercase tCO
            f_test.close()
            char_sequences = tokenizer.texts_to_sequences(texts)
            #print(map(lambda x: self.idx_to_jk[x], jk_sequences[self.nb_train_samples]))
            self.inputs_train['chars'] = char_sequences[:self.nb_train_samples]
            self.inputs_test['chars'] = char_sequences[self.nb_train_samples:]
            ## indexing char files ends
        ## indexing stag files
        f_train = open(path_to_tag)
        texts = f_train.readlines()
        f_train.close()
        tokenizer = Tokenizer(lower=False) ## for tCO
        tokenizer.fit_on_texts(texts, zero_padding=False)
        #print(tokenizer.word_index['-unseen-'])
        self.tag_index = tokenizer.word_index
        self.nb_tags = len(self.tag_index)
        self.idx_to_tag = invert_dict(self.tag_index)
        print('Found {} unique tags including -unseen-.'.format(self.nb_tags))
        f_test = open(path_to_tag_test)
        texts = texts + f_test.readlines() ## do not lowercase tCO
        f_test.close()
        tag_sequences = tokenizer.texts_to_sequences(texts)
        #print(map(lambda x: self.idx_to_tag[x], tag_sequences[self.nb_train_samples+8]))
        self.inputs_train['tags'] = tag_sequences[:self.nb_train_samples]
        self.inputs_test['tags'] = tag_sequences[self.nb_train_samples:]

        ## indexing stag files ends
        self.test_gold = np.hstack(tag_sequences[self.nb_train_samples:]) ## for calculation of accuracy
        ## padding the train inputs and test inputs
        #self.inputs_train = [pad_sequences(x) for x in self.inputs_train]
        self.inputs_train = {key: pad_sequences(x, key) for key, x in self.inputs_train.items()}
        random.seed(0)
        perm = np.arange(self.nb_train_samples)
        random.shuffle(perm)
        self.inputs_train = {key: x[perm] for key, x in self.inputs_train.items()}
        #self.inputs_train = [x[perm] for x in self.inputs_train]

        #self.inputs_test = [pad_sequences(x) for x in self.inputs_test]
        self.inputs_test = {key: pad_sequences(x, key) for key, x in self.inputs_test.items()}

        ## setting the current indices
        self._index_in_epoch = 0
        self._epoch_completed = 0
        self._index_in_test = 0
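A hedged sketch of a mini-batch step over the padded `inputs_train` dict; the `next_batch` name and the slicing scheme are assumptions (the constructor above only initializes the counters):

    def next_batch(self, batch_size):
        # assumed epoch bookkeeping over the shuffled, padded training inputs
        start = self._index_in_epoch
        end = min(start + batch_size, self.nb_train_samples)
        if end >= self.nb_train_samples:
            self._epoch_completed += 1
            self._index_in_epoch = 0
        else:
            self._index_in_epoch = end
        return {key: x[start:end] for key, x in self.inputs_train.items()}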
Example #9
        trained_model = evaluate_model.compile_train(nmt_model, train_english_input, train_german_output, test_english_input, test_german_output)
        
        
        #evaluate model
        evaluate_model.model_speech_evaluation(trained_model, german_tokenizer, train_english_input, train, role='Train')
        
        evaluate_model.model_speech_evaluation(trained_model, german_tokenizer, test_english_input, test, role='Test')
        '''

    #------------------------------------------------------------------------------------------------------------------------------------> GERMAN TO ENGLISH
    #prepare train data
    train_english_output = preprocessing.encode_sequences(
        english_tokenizer, train[:, 0])
    #print(train_english_input)

    train_english_output = preprocessing.pad_sequences(
        english_max_sentence_length, train_english_output)
    #print(train_english_input)

    train_german_input = preprocessing.encode_sequences(
        german_tokenizer, train[:, 1])
    train_german_input = preprocessing.pad_sequences(
        german_max_sentence_length, train_german_input)

    #make the target as an one hot encoding
    #train_english_output = preprocessing.oneHotEncoding(train_english_output, english_vocabulary_size)

    #and one for english
    #train_english_output = oneHotEncoding(train_english_input, english_vocabulary_size)

    #print(train_german_output)
    #print(train_german_output[0].shape)
Example #10
    def __init__(self, opts, test_opts=None):
        path_to_text = opts.text_train
        path_to_tag = opts.tag_train
        path_to_jk = opts.jk_train
        path_to_arc = opts.arc_train
        path_to_rel = opts.rel_train
        if test_opts is None:
            path_to_text_test = opts.text_test
            path_to_tag_test = opts.tag_test
            path_to_jk_test = opts.jk_test
            path_to_arc_test = opts.arc_test
            path_to_rel_test = opts.rel_test
            path_to_punc_test = opts.punc_test
        else:
            path_to_text_test = test_opts.text_test
            path_to_tag_test = test_opts.tag_test
            path_to_jk_test = test_opts.jk_test
            path_to_arc_test = test_opts.arc_test
            path_to_rel_test = test_opts.rel_test
            path_to_punc_test = test_opts.punc_test

        self.inputs_train = {}
        self.inputs_test = {}

        ## indexing sents files
        f_train = open(path_to_text)
        texts = f_train.readlines()
        self.nb_train_samples = len(texts)
        f_train.close()
        tokenizer = Tokenizer(lower=True)
        tokenizer.fit_on_texts(texts)
        #print(tokenizer.word_index['-unseen-'])
        self.word_index = tokenizer.word_index
        self.nb_words = len(self.word_index)
        print(
            'Found {} unique lowercased words including -unseen- and <-root->.'
            .format(self.nb_words))

        # lookup the glove word embeddings
        # need to reserve indices for testing file.
        glove_size = opts.embedding_dim
        self.embeddings_index = {}
        print('Indexing word vectors.')
        f = open(opts.word_embeddings_file)
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.embeddings_index[word] = coefs
        f.close()

        print('Found {} word vectors.'.format(len(self.embeddings_index)))

        unseens = list(
            set(self.embeddings_index.keys()) - set(self.word_index.keys())
        )  ## list of words that appear in glove but not in the training set
        nb_unseens = len(unseens)
        print('Found {} words not in the training set but in the glove data'.
              format(nb_unseens))

        self.word_embeddings = np.zeros(
            (self.nb_words + 1 + nb_unseens, glove_size))  ## +1 for padding (idx 0)
        for word, i in self.word_index.items():  ## first index the words in the training set
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None:  ## otherwise zero vector
                self.word_embeddings[i] = embedding_vector
        for unseen in unseens:
            ## add unseen words to the word_index dictionary
            self.word_index[unseen] = len(self.word_index) + 1
            self.word_embeddings[self.word_index[unseen]] = self.embeddings_index[unseen]
        self.idx_to_word = invert_dict(self.word_index)
        print('end glove indexing')
        f_test = open(path_to_text_test)
        texts = texts + f_test.readlines()
        self.nb_validation_samples = len(texts) - self.nb_train_samples
        f_test.close()
        text_sequences = tokenizer.texts_to_sequences(texts)
        #print(map(lambda x: self.idx_to_word[x], text_sequences[self.nb_train_samples]))
        self.inputs_train['words'] = text_sequences[:self.nb_train_samples]
        self.inputs_test['words'] = text_sequences[self.nb_train_samples:]
        ## indexing sents files ends
        ## indexing char files
        if opts.chars_dim > 0:
            f_train = io.open(path_to_text, encoding='utf-8')
            texts = f_train.readlines()
            f_train.close()
            tokenizer = Tokenizer(lower=False, char_encoding=True, root=False)
            ## char embedding for <-root-> does not make sense
            tokenizer.fit_on_texts(texts)
            self.char_index = tokenizer.word_index
            self.nb_chars = len(self.char_index)
            self.idx_to_char = invert_dict(self.char_index)
            print(
                'Found {} unique characters including -unseen-. NOT including <-root->.'
                .format(self.nb_chars))
            f_test = io.open(path_to_text_test, encoding='utf-8')
            texts = texts + f_test.readlines()  ## do not lowercase tCO
            f_test.close()
            char_sequences = tokenizer.texts_to_sequences(texts)
            #print(map(lambda x: self.idx_to_jk[x], jk_sequences[self.nb_train_samples]))
            self.inputs_train['chars'] = char_sequences[:self.nb_train_samples]
            self.inputs_test['chars'] = char_sequences[self.nb_train_samples:]
            ## indexing char files ends

        ## indexing jackknife files
        if (opts.jk_dim > 0) or (opts.model in ['Parsing_Model_Joint_Both']):
            f_train = open(path_to_jk)
            texts = f_train.readlines()
            f_train.close()
            tokenizer = Tokenizer(lower=False)
            tokenizer.fit_on_texts(texts, zero_padding=False)
            self.jk_index = tokenizer.word_index
            self.nb_jk = len(self.jk_index)
            self.idx_to_jk = invert_dict(self.jk_index)
            print('Found {} unique POS tags including -unseen- and <-root->.'.
                  format(self.nb_jk))
            f_test = open(path_to_jk_test)
            texts = texts + f_test.readlines()  ## do not lowercase tCO
            f_test.close()
            jk_sequences = tokenizer.texts_to_sequences(texts)
            self.inputs_train['jk'] = jk_sequences[:self.nb_train_samples]
            self.inputs_test['jk'] = jk_sequences[self.nb_train_samples:]
            self.gold_jk = np.hstack(
                map(lambda x: x[1:], jk_sequences[self.nb_train_samples:]))
            ## indexing jackknife files ends
        ## indexing stag files
        if (opts.stag_dim > 0) or (opts.model in [
                'Parsing_Model_Joint', 'Parsing_Model_Shuffle',
                'Parsing_Model_Joint_Both'
        ]):
            f_train = open(path_to_tag)
            texts = f_train.readlines()
            f_train.close()
            tokenizer = Tokenizer(lower=False)  ## for tCO
            tokenizer.fit_on_texts(texts, zero_padding=False)
            ## if zero_padding is True, index 0 is reserved, never assigned to an existing word
            self.tag_index = tokenizer.word_index
            self.nb_stags = len(self.tag_index)
            self.idx_to_tag = invert_dict(self.tag_index)
            print('Found {} unique supertags including -unseen- and <-root->.'.
                  format(self.nb_stags))
            f_test = open(path_to_tag_test)
            texts = texts + f_test.readlines()  ## do not lowercase tCO
            f_test.close()
            tag_sequences = tokenizer.texts_to_sequences(texts)
            #print(map(lambda x: self.idx_to_tag[x], tag_sequences[self.nb_train_samples+8]))
            self.inputs_train['stags'] = tag_sequences[:self.nb_train_samples]
            self.inputs_test['stags'] = tag_sequences[self.nb_train_samples:]
            self.gold_stags = np.hstack(
                map(lambda x: x[1:], tag_sequences[self.nb_train_samples:]))
            ## indexing stag files ends

        ## indexing rel files
        f_train = open(path_to_rel)
        texts = f_train.readlines()
        f_train.close()
        tokenizer = Tokenizer(lower=False)
        tokenizer.fit_on_texts(texts, zero_padding=False)
        self.rel_index = tokenizer.word_index
        self.nb_rels = len(self.rel_index)
        self.idx_to_rel = invert_dict(self.rel_index)
        print(
            'Found {} unique rels including -unseen-, NOT including <-root->.'.
            format(self.nb_rels))
        f_test = open(path_to_rel_test)
        texts = texts + f_test.readlines()  ## do not lowercase tCO
        f_test.close()
        rel_sequences = tokenizer.texts_to_sequences(texts)
        #print(map(lambda x: self.idx_to_tag[x], tag_sequences[self.nb_train_samples+8]))
        self.inputs_train['rels'] = rel_sequences[:self.nb_train_samples]
        self.inputs_test['rels'] = rel_sequences[self.nb_train_samples:]
        self.gold_rels = np.hstack(
            map(lambda x: x[1:], rel_sequences[self.nb_train_samples:]))
        ## indexing rel files ends

        ## indexing arc files
        ## Notice arc sequences are already integers
        f_train = open(path_to_arc)
        arc_sequences = f_train.readlines()
        f_train.close()
        f_test = open(path_to_arc_test)
        arc_sequences = arcs2seq(arc_sequences + f_test.readlines())
        f_test.close()
        self.inputs_train['arcs'] = arc_sequences[:self.nb_train_samples]
        self.inputs_test['arcs'] = arc_sequences[self.nb_train_samples:]
        ## indexing arc files ends
        self.gold_arcs = np.hstack(arc_sequences[self.nb_train_samples:])
        if path_to_punc_test is not None:
            self.punc = arc_sequences[self.nb_train_samples:]
            with open(path_to_punc_test) as fhand:
                for sent_idx, line in zip(xrange(len(self.punc)), fhand):
                    self.punc[sent_idx] = [
                        True for _ in xrange(len(self.punc[sent_idx]))
                    ]
                    for punc_idx in map(int, line.split()):
                        self.punc[sent_idx][punc_idx - 1] = False
            self.punc = np.hstack(self.punc)  #.astype(bool)

        ## padding the train inputs and test inputs
        self.inputs_train = {
            key: pad_sequences(x, key)
            for key, x in self.inputs_train.items()
        }
        self.inputs_train['arcs'] = np.hstack([
            np.zeros([self.inputs_train['arcs'].shape[0], 1]).astype(int),
            self.inputs_train['arcs']
        ])
        ## dummy parents for the roots
        random.seed(0)
        perm = np.arange(self.nb_train_samples)
        random.shuffle(perm)
        self.inputs_train = {
            key: x[perm]
            for key, x in self.inputs_train.items()
        }

        self.inputs_test = {
            key: pad_sequences(x, key)
            for key, x in self.inputs_test.items()
        }
        ## dummy parents for the roots
        self.inputs_test['arcs'] = np.hstack([
            np.zeros([self.inputs_test['arcs'].shape[0], 1]).astype(int),
            self.inputs_test['arcs']
        ])

        ## padding ends

        ## setting the current indices
        self._index_in_epoch = 0
        self._epoch_completed = 0
        self._index_in_test = 0