Example #1
    def getVocab(self, vocab_path, max_document_length, filter_h_pad):
        if self.vocab_processor is None:
            print('loading vocab')
            vocab_processor = MyVocabularyProcessor(max_document_length -
                                                    filter_h_pad,
                                                    min_frequency=0)
            self.vocab_processor = vocab_processor.restore(vocab_path)
        return self.vocab_processor
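A minimal usage sketch of the lazy-loading pattern above; `helper` stands for an already constructed instance of the surrounding class, and the vocabulary path is a made-up placeholder:

# Hedged sketch: load the saved vocabulary once, then reuse the cached processor.
helper.vocab_processor = None                            # nothing cached yet
vp = helper.getVocab("runs/example/checkpoints/vocab",   # hypothetical path
                     max_document_length=30,
                     filter_h_pad=0)
print(len(vp.vocabulary_))                               # size of the restored vocabulary
vp_again = helper.getVocab("runs/example/checkpoints/vocab", 30, 0)
assert vp_again is vp                                    # second call returns the cached processor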
Example #2
    def getAquaintTestDataSet(self, data_path, vocab_path, max_document_length):
        x1_temp, x2_temp, y = self.getAquaintData(data_path)
        # Build vocabulary
        vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
        vocab_processor = vocab_processor.restore(vocab_path)
        print(len(vocab_processor.vocabulary_))
        x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
        x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
        del vocab_processor
        gc.collect()
        return x1, x2, y
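The loaders in these examples share one pattern: restore a MyVocabularyProcessor that was saved at training time, then call transform() to turn raw strings into fixed-length arrays of word ids. A minimal sketch of that pattern, assuming a hypothetical vocabulary path and made-up sentences:

# Hedged sketch of the restore/transform pattern; the path and sentences are
# illustrative only. transform() yields one id array per input string, padded
# or truncated to the configured max_document_length.
import numpy as np

vocab_processor = MyVocabularyProcessor(30, min_frequency=0)
vocab_processor = vocab_processor.restore("runs/example/checkpoints/vocab")

sentences = np.asarray(["how are you", "what is your name"])
ids = np.asarray(list(vocab_processor.transform(sentences)))
print(ids.shape)   # (2, 30): one row of word ids per sentence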
Example #3
    def getPCADataSet(self, data_path, vocab_path, max_document_length):
        x1_temp = self.getJsonPCAData(data_path)

        # Build vocabulary
        vocab_processor = MyVocabularyProcessor(max_document_length,min_frequency=0)
        vocab_processor = vocab_processor.restore(vocab_path)
        print(len(vocab_processor.vocabulary_))

        x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
        # Randomly shuffle data
        del vocab_processor
        gc.collect()
        return x1,x1, np.ones(len(x1))
Example #4
    def getTestDataSet_infer(self, x1_infer, x2_infer, vocab_path, max_document_length):
        x1_temp,x2_temp = np.asarray(x1_infer), np.asarray(x2_infer)  #, = self.getTsvTestData_infer(x1_infer, x2_infer)
        #print('THIS is x1_temp: ', type(x1_temp), x1_temp)
        # Build vocabulary
        vocab_processor = MyVocabularyProcessor(max_document_length,min_frequency=0)
        vocab_processor = vocab_processor.restore(vocab_path)
        print ('len vocab: ', len(vocab_processor.vocabulary_))

        x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
        x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
        # Randomly shuffle data
        del vocab_processor
        gc.collect()
        return x1,x2
    def getTestDataSet(self, data_path, vocab_path, max_document_length):
        x1_temp,x2_temp,y = self.getTsvTestData(data_path)

        # Build vocabulary
        vocab_processor = MyVocabularyProcessor(max_document_length,min_frequency=0)
        vocab_processor = vocab_processor.restore(vocab_path)
        print(len(vocab_processor.vocabulary_))

        x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
        x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
        # Randomly shuffle data
        del vocab_processor
        gc.collect()
        return x1,x2, y
Example #6
    def getTestDataSet(self, data_path, ent_path, vocab_path, max_document_length):
        x1_temp,x2_temp,y = self.getTsvTestData(data_path)
        ent_x1,ent_x2=self.getEntData(x1_temp,x2_temp, ent_path, max_document_length)
        add_fea_test = self.getAdditionalFeature(x1_temp,x2_temp)

        # Build vocabulary
        vocab_processor = MyVocabularyProcessor(max_document_length,min_frequency=0)
        vocab_processor = vocab_processor.restore(vocab_path)
        print(len(vocab_processor.vocabulary_))

        x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
        x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
        # Randomly shuffle data
        del vocab_processor
        gc.collect()
        return x1,x2,ent_x1,ent_x2, y, x1_temp, x2_temp,add_fea_test
Example #7
    def getDataSets(self, training_paths, max_document_length, percent_dev, batch_size, is_char_based, vocab_path):
        if is_char_based:
            x1_text, x2_text, y = self.getTsvDataCharBased(training_paths)
        else:
            x1_text, x2_text, y = self.getTsvData(training_paths)

        vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0, is_char_based=is_char_based)

        if os.path.exists(vocab_path):
            print("Loading vocab from: {}".format(vocab_path))
            vocab_processor = vocab_processor.restore(vocab_path)
        else:
            # Build vocabulary
            print("Building vocabulary")
            vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
            print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
            # Write vocabulary
            vocab_processor.save(vocab_path)

        i1 = 0
        train_set = []
        dev_set = []
        x1 = np.asarray(list(vocab_processor.transform(x1_text)))
        x2 = np.asarray(list(vocab_processor.transform(x2_text)))
        # Randomly shuffle data
        np.random.seed(131)
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x1_shuffled = x1[shuffle_indices]
        x2_shuffled = x2[shuffle_indices]
        y_shuffled = y[shuffle_indices]
        dev_idx = -1 * len(y_shuffled) * percent_dev // 100
        del x1
        del x2
        # Split train/test set
        self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0)
        # TODO: This is very crude, should use cross-validation
        x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:]
        x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:]
        y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:]
        print("Train/Dev split for {}: {:d}/{:d}".format(training_paths, len(y_train), len(y_dev)))
        sum_no_of_batches = len(y_train) // batch_size
        train_set = (x1_train, x2_train, y_train)
        dev_set = (x1_dev, x2_dev, y_dev)
        gc.collect()
        return train_set, dev_set, vocab_processor, sum_no_of_batches
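A hedged usage sketch for getDataSets; the zero-argument InputHelper() construction, the training file name, and the vocabulary path are illustrative assumptions, not taken from the source:

# Hypothetical call: builds (or loads) the vocabulary, transforms both sentence
# columns, shuffles, and splits off a dev portion.
inpH = InputHelper()
train_set, dev_set, vocab_processor, n_batches = inpH.getDataSets(
    "train.tsv",                      # hypothetical TSV of (sent1, sent2, label)
    max_document_length=15,
    percent_dev=10,
    batch_size=64,
    is_char_based=False,
    vocab_path="runs/example/checkpoints/vocab")

x1_train, x2_train, y_train = train_set
x1_dev, x2_dev, y_dev = dev_set
print("batches per epoch:", n_batches)
print("vocab size:", len(vocab_processor.vocabulary_))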
Example #8
    def getWords(self, word1, word2, vocab_path, max_document_length):
        temp1 = []
        temp2 = []
        temp1.append(word1.lower())
        temp2.append(word2.lower())

        x1_temp = np.asarray(temp1)
        x2_temp = np.asarray(temp2)
        # Build vocabulary
        vocab_processor = MyVocabularyProcessor(max_document_length,
                                                min_frequency=0)
        vocab_processor = vocab_processor.restore(vocab_path)

        x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
        x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
        # Randomly shuffle data
        del vocab_processor
        gc.collect()
        return x1, x2, np.asarray(-1)
    def toVocabularyIndexVector(self, datax1, datax2, vocab_path,
                                max_document_length):
        """
        Transform the word list to vocabulary_index vectors
        :param datax1:
        :param datax2:
        :param vocab_path:
        :param max_document_length:
        :return:
        """
        # Build vocabulary
        vocab_processor = MyVocabularyProcessor(max_document_length,
                                                min_frequency=0)
        vocab_processor = vocab_processor.restore(vocab_path)
        print(len(vocab_processor.vocabulary_))

        datax1 = preprocess_arr(datax1)
        datax2 = preprocess_arr(datax2)
        x1 = np.asarray(list(vocab_processor.transform(datax1)))
        x2 = np.asarray(list(vocab_processor.transform(datax2)))
        # Randomly shuffle data
        del vocab_processor
        gc.collect()
        return x1, x2
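A short, hedged sketch of how toVocabularyIndexVector might be called; `helper`, the vocabulary path, and the sentences below are illustrative assumptions:

# Hypothetical call: both question lists are mapped to aligned id matrices.
q1 = ["what is machine learning", "how old are you"]
q2 = ["define machine learning", "what is your age"]
x1, x2 = helper.toVocabularyIndexVector(q1, q2,
                                        "runs/example/checkpoints/vocab",
                                        max_document_length=30)
print(x1.shape, x2.shape)   # both (2, 30); row i of x1 pairs with row i of x2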
Example #10
class InputHelper():
    def __init__(self,
                 data_dir,
                 input_file,
                 batch_size,
                 sequence_length,
                 is_train=True):
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.sequence_length = sequence_length

        vocab_file = os.path.join(data_dir, 'useWords.model.vec')
        # vocab_file = os.path.join(data_dir, 'vocab.pkl')
        input_file = os.path.join(data_dir, input_file)

        if not (os.path.exists(vocab_file)):
            print('reading train file')
            self.preprocess(input_file, vocab_file)
        else:
            print('loading vocab file')
            self.load_vocab(vocab_file)

        if is_train:
            self.create_batches(input_file)
            self.reset_batch()

    def preprocess(self, input_file, vocab_file, min_freq=2):

        token_freq = defaultdict(int)

        for line in open(input_file):
            seq1, seq2, label = line.rstrip().split('\t')
            seq = seq1 + ' ' + seq2
            for token in seq.split(' '):
                token_freq[token] += 1

        token_list = [
            w for w in token_freq.keys() if token_freq[w] >= min_freq
        ]
        token_list.append('<pad>')
        token_dict = {token: index for index, token in enumerate(token_list)}

        with open(vocab_file, 'w') as f:
            cPickle.dump(token_dict, f)

        self.token_dictionary = token_dict
        self.vocab_size = len(self.token_dictionary)

    def load_vocab(self, vocab_file):
        self.token_dictionary = dict()
        for line in open(vocab_file):
            l = line.strip().split()
            st = l[0].decode('utf-8')
            self.token_dictionary[st] = np.asarray(l[1:])
            # self.vocab_size = len(self.token_dictionary)

            # with open(vocab_file, 'rb') as f:
            #    self.token_dictionary = cPickle.load(f)
            #    self.vocab_size = len(self.token_dictionary)

    def text_to_array(self, text, is_clip=True):

        words = [w for w in jieba.cut(text[0]) if w.strip()]
        words1 = [
            TOKENIZER_RE.findall(w)[0] for w in words
            if TOKENIZER_RE.findall(w)
        ]
        if is_clip:
            words1 = words1[:self.sequence_length]
        return words1

        # seq_ids = [int(self.token_dictionary.get(token)) for token in text if
        #            self.token_dictionary.get(token) is not None]
        # if is_clip:
        #    seq_ids = seq_ids[:self.sequence_length]
        # return seq_ids

    def getTsvData(self, filepath):
        print("Loading training data from " + filepath)
        x1 = []
        x2 = []
        y = []
        # positive samples from file
        for line in open(filepath):
            l = line.strip().split("\t")
            if len(l) < 4:  # need the id, both sentences, and the label column
                continue
            if random() > 0.5:
                x1.append(l[1])
                x2.append(l[2])
            else:
                x1.append(l[2])
                x2.append(l[1])
            y.append(int(l[3]))
        return np.asarray(x1), np.asarray(x2), np.asarray(y)

    def getTsvTestData(self, filepath):
        print("Loading testing/labelled data from " + filepath)
        x1 = []
        x2 = []
        # positive samples from file
        for line in open(filepath):
            l = line.strip().split("\t")
            if len(l) < 3:  # need the id column plus both sentences
                continue
            x1.append(l[1])
            x2.append(l[2])
        return np.asarray(x1), np.asarray(x2)

    def getTestDataSet(self, data_path, vocab_path, max_document_length):
        x1_temp, x2_temp = self.getTsvTestData(data_path)

        # Build vocabulary
        self.vocab_processor = MyVocabularyProcessor(max_document_length,
                                                     min_frequency=0)
        self.vocab_processor = self.vocab_processor.restore(vocab_path)
        print(len(self.vocab_processor.vocabulary_))

        x1 = np.asarray(list(self.vocab_processor.transform(x1_temp)))
        x2 = np.asarray(list(self.vocab_processor.transform(x2_temp)))
        # Randomly shuffle data
        return x1, x2

    def batch_iter(self, data, batch_size, num_epochs, shuffle=True):
        """
        Generates a batch iterator for a dataset.
        """
        data = np.asarray(data)
        print(data)
        print(data.shape)
        data_size = len(data)
        num_batches_per_epoch = int(len(data) / batch_size) + 1
        for epoch in range(num_epochs):
            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_data = data[shuffle_indices]
            else:
                shuffled_data = data
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                yield shuffled_data[start_index:end_index]

    def padding_seq(self, seq_array, padding_index):

        for i in range(len(seq_array), self.sequence_length):
            seq_array.append(padding_index)

    def create_batches(self, text_file):

        x1 = []
        x2 = []
        y = []
        seq1_array = []
        seq2_array = []

        # padding_index = self.vocab_size - 1
        for line in open(text_file):
            _, seq1, seq2, label = line.rstrip().split('\t')

            # seq1_array = self.text_to_array(seq1.decode('utf-8').split(' '))
            # seq2_array = self.text_to_array(seq2.decode('utf-8').split(' '))

            # self.padding_seq(seq1_array, padding_index)
            # self.padding_seq(seq2_array, padding_index)

            label = int(label)
            x1.append(seq1)
            x2.append(seq2)
            y.append(label)

        self.vocab_processor = MyVocabularyProcessor(self.sequence_length,
                                                     min_frequency=0)
        self.vocab_processor.fit_transform(np.concatenate((x2, x1), axis=0))

        x1_1 = np.asarray(list(self.vocab_processor.transform(x1)))
        x2_1 = np.asarray(list(self.vocab_processor.transform(x2)))

        # x1 = np.array(x1)
        # x2 = np.array(x2)
        y = np.array(y)

        self.num_samples = len(y)
        self.num_batches = self.num_samples // self.batch_size
        indices = np.random.permutation(self.num_samples)
        self.x1 = x1_1[indices]
        self.x2 = x2_1[indices]
        self.y = y[indices]

    def next_batch(self):

        begin = self.pointer
        end = self.pointer + self.batch_size
        x1_batch = self.x1[begin:end]
        x2_batch = self.x2[begin:end]
        y_batch = self.y[begin:end]

        new_pointer = self.pointer + self.batch_size

        if new_pointer >= self.num_samples:
            self.eos = True
        else:
            self.pointer = new_pointer

        return x1_batch, x2_batch, y_batch

    def reset_batch(self):
        self.pointer = 0
        self.eos = False
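A hedged sketch of how the class above might be driven during training; the data directory, file name, and hyperparameters are placeholders:

# Hypothetical training loop over the batch interface above.
loader = InputHelper(data_dir="data",
                     input_file="train.tsv",
                     batch_size=64,
                     sequence_length=30,
                     is_train=True)

for epoch in range(10):
    loader.reset_batch()
    while not loader.eos:
        x1_batch, x2_batch, y_batch = loader.next_batch()
        # feed x1_batch, x2_batch, y_batch to the model here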
tf.flags.DEFINE_string("vocab_filepath", "runs/1541748108/checkpoints/vocab",
                       "Load training time vocabulary (Default: None)")
tf.flags.DEFINE_string("model", "runs/1541748108/checkpoints/model-33000",
                       "Load trained model checkpoint (Default: None)")

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS

# Build vocabulary
vocab_processor = MyVocabularyProcessor(30, min_frequency=0)
vocab_processor = vocab_processor.restore(FLAGS.vocab_filepath)


def char2vec(arr):
    return np.asarray(list(vocab_processor.transform(arr)))
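A hedged usage sketch for char2vec; the input strings are made up. Note that get_test_data_set below relies on abbr_vec_arr, which is defined elsewhere in the original script and is not shown here.

# Hypothetical call: the vocab_processor restored above pads/truncates to 30 ids.
vecs = char2vec(["heart attack", "myocardial infarction"])
print(vecs.shape)   # (2, 30)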


def get_test_data_set(text):
    return char2vec(np.full([len(abbr_vec_arr)], text))


def batch_iter(data, batch_size):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.asarray(data)