Code Example #1
def build_evaluate_data(lines, tid=0):
    with open('worddata/word_dict.pkl', 'rb') as f:
        word_dict = pickle.load(f)

    def word2id(c):
        if c in word_dict:
            return word_dict[c]
        else:
            return 0

    cnt = 0
    history = []
    true_utt = []
    for line in lines:
        fields = line.rstrip().lower().split('\t')
        utterance = fields[-1].split('###')
        history.append([
            list(map(word2id, text_to_word_sequence(each_utt)))
            for each_utt in utterance
        ])
        true_utt.append(list(map(word2id, text_to_word_sequence(fields[0]))))
        cnt += 1
        if cnt % 10000 == 0:
            print(tid, cnt)
    return history, true_utt
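Below is a minimal, hypothetical invocation sketch for build_evaluate_data; the path data/biglearn_eval.txt is an assumption, worddata/word_dict.pkl must already exist, and each tab-separated line is expected to hold the true utterance in its first field and the '###'-joined context in its last field, as the code above implies.

# Hypothetical evaluation file; line format inferred from the function above.
with open('data/biglearn_eval.txt', encoding='utf8') as f:
    eval_lines = f.readlines()

history, true_utt = build_evaluate_data(eval_lines, tid=0)
print(len(history), len(true_utt))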
Code Example #2
def hierarchical_tokenize_and_pad(data, tokenizer=None, max_sequence_len=200, max_sequences=20,
                                  enforce_max_len=False, filter_words=False):
    """

    :param data:
    :param tokenizer:
    :param max_sequence_len:
    :param max_sequences:
    :param enforce_max_len:
    :param filter_words:
    :return:
    """
    temp_data = list()
    for seq in data[:,0]:
        temp_data.append(' '.join(seq.split()))
    if tokenizer is None:
        tokenizer = Tokenizer(filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~', lower=True)
        tokenizer.fit_on_texts(temp_data)

    raw_data = list()
    max_sequences_actual = -1
    max_sequence_len_actual = -1
    for seq in data[:, 0]:
        sentences = nltk.tokenize.sent_tokenize(seq)
        raw_data.append(sentences)
        max_sequences_actual = max(len(sentences), max_sequences_actual)
        for sentence in sentences:
            word_tokens = text_to_word_sequence(sentence, filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~', lower=True)
            max_sequence_len_actual = max(len(word_tokens), max_sequence_len_actual)

    if not enforce_max_len:
        max_sequence_len = min(max_sequence_len, max_sequence_len_actual)
        max_sequences = min(max_sequences, max_sequences_actual)

    data_x = np.zeros((len(data), max_sequences, max_sequence_len), dtype='int32')
    print("Max. Seq. Length: %d; Max Seq.: %d" %(max_sequence_len, max_sequences))

    index_filter = set()
    if filter_words:
        for word, i in tokenizer.word_index.items():
            if not (word.isalpha() or "'" in word or "-" in word):
                index_filter.add(i)

    for i, sentences in enumerate(raw_data):
        for j, sentence in enumerate(sentences):
            if j < max_sequences:
                k = 0
                word_tokens = text_to_word_sequence(' '.join(sentence.split()), filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~', lower=True)
                for word in word_tokens:
                    if k < max_sequence_len:
                        if not filter_words or tokenizer.word_index[word] not in index_filter:
                            data_x[i, j, k] = tokenizer.word_index[word]
                        k = k + 1
    return data_x, tokenizer, max_sequence_len, max_sequences
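A minimal toy sketch of calling hierarchical_tokenize_and_pad as defined above; it assumes numpy, nltk (with the 'punkt' sentence tokenizer downloaded) and the Keras Tokenizer/text_to_word_sequence imports used by the function are already in scope.

import nltk
import numpy as np

nltk.download('punkt')  # required by nltk.tokenize.sent_tokenize

docs = np.array([
    ["This is the first document. It has two sentences."],
    ["A second, much shorter document."],
], dtype=object)

data_x, tok, seq_len, n_sents = hierarchical_tokenize_and_pad(
    docs, max_sequence_len=10, max_sequences=5)
print(data_x.shape)  # (num_documents, n_sents, seq_len)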
Code Example #3
def load_data(total_words):
    process_num = 10
    executor = concurrent.futures.ProcessPoolExecutor(process_num)
    base = 0
    results = []
    history = []
    true_utt = []
    word_dict = dict()
    vectors = []
    with open('data/glove.twitter.27B.200d.txt', encoding='utf8') as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.split(' ')
            word_dict[line[0]] = i
            vectors.append(line[1:])
            if i > total_words:
                break
    with open('worddata/embedding_matrix.pkl', "wb") as f:
        pickle.dump(vectors, f)
    with open("data/biglearn_train.old.txt", encoding="utf8") as f:
        lines = f.readlines()
        total_num = 1000000
        print(total_num)
        low = 0
        step = total_num // process_num
        print(step)
        while True:
            if low < total_num:
                results.append(
                    executor.submit(build_data, lines[low:low + step],
                                    word_dict, base))
            else:
                break
            base += 1
            low += step

        for result in results:
            h, t = result.result()
            history += h
            true_utt += t
    print(len(history))
    print(len(true_utt))
    with open("worddata/train.pkl", "wb") as f:
        pickle.dump([history, true_utt], f)
    actions_id = []
    with open('emb/actions.txt', encoding='utf8') as f:
        actions = f.readlines()

    def word2id(c):
        if c in word_dict:
            return word_dict[c]
        else:
            return 0

    for action in actions:
        actions_id.append(
            [word2id(word) for word in text_to_word_sequence(action)])
    with open('worddata/actions_embeddings.pkl', 'wb') as f:
        pickle.dump(actions_id, f)
Code Example #4
def build_data(lines, word_dict, tid=0):
    def word2id(c):
        if c in word_dict:
            return word_dict[c]
        else:
            return 0

    cnt = 0
    history = []
    true_utt = []
    for line in lines:
        fields = line.rstrip().lower().split('\t')
        utterance = fields[1].split('###')
        history.append([list(map(word2id, text_to_word_sequence(each_utt))) for each_utt in utterance])
        true_utt.append(list(map(word2id, text_to_word_sequence(fields[2]))))
        cnt += 1
        if cnt % 10000 == 0:
            print(tid, cnt)
    return history, true_utt
Code Example #5
File: textgenrnn.py Project: kurecka/textgenrnn
    def encode_text_vectors(self,
                            texts,
                            pca_dims=50,
                            tsne_dims=None,
                            tsne_seed=None,
                            return_pca=False,
                            return_tsne=False):

        # if a single text, force it into a list:
        if isinstance(texts, str):
            texts = [texts]

        vector_output = Model(inputs=self.model.input,
                              outputs=self.model.get_layer('attention').output)
        encoded_vectors = []
        maxlen = self.config['max_length']
        for text in texts:
            if self.config['word_level']:
                text = text_to_word_sequence(text, filters='')
            text_aug = [self.META_TOKEN] + list(text[0:maxlen])
            encoded_text = textgenrnn_encode_sequence(text_aug, self.vocab,
                                                      maxlen)
            encoded_vector = vector_output.predict(encoded_text)
            encoded_vectors.append(encoded_vector)

        encoded_vectors = np.squeeze(np.array(encoded_vectors), axis=1)
        if pca_dims is not None:
            assert len(texts) > 1, "Must use more than 1 text for PCA"
            pca = PCA(pca_dims)
            encoded_vectors = pca.fit_transform(encoded_vectors)

        if tsne_dims is not None:
            tsne = TSNE(tsne_dims, random_state=tsne_seed)
            encoded_vectors = tsne.fit_transform(encoded_vectors)

        return_objects = encoded_vectors
        if return_pca or return_tsne:
            return_objects = [return_objects]
        if return_pca:
            return_objects.append(pca)
        if return_tsne:
            return_objects.append(tsne)

        return return_objects
Code Example #6
def load_seq2vec_data(account, mod):

    data = joblib.load(os.path.join(accounts_dir,
                                    account + '_tweets.pkl')).text.to_list()
    date = joblib.load(os.path.join(accounts_dir,
                                    account + '_tweets.pkl')).index.to_list()

    word_index, max_len = joblib.load(
        os.path.join(pre_data_dir, mod, '{0}_word_index.pkl'.format(mod)))
    new = [
        text_pre.text_to_word_sequence(
            tweet, filters='!"#$%&()*+,-./:;<=>?[\\]^_`{|}~\t\n')
        for tweet in data
    ]

    x_pred_seq2vec = []
    id_to_drop = []
    counter_ind = 0

    for tweet in new:
        sentence = [
            word_index.get(i) for i in tweet if word_index.get(i) is not None
        ]

        if len(sentence) < 1:
            x_pred_seq2vec.append(['1', '1', '1', '1'])
            id_to_drop.append(date[counter_ind])
        else:
            x_pred_seq2vec.append(sentence)

        counter_ind += 1

    x_pred_seq2vec = sequence.pad_sequences(x_pred_seq2vec, maxlen=max_len)

    joblib.dump(x_pred_seq2vec,
                os.path.join(pre_pred_dir, account, mod, 'x_pred_seq2vec.pkl'))
    joblib.dump(id_to_drop,
                os.path.join(pre_pred_dir, account, mod, 'id_to_drop.pkl'))

    return x_pred_seq2vec
Code Example #7
def process(sentence):
    """ Applies word tokenizing to input sentence and removes the stopwords. """
    tokenized_words = text_to_word_sequence(sentence)
    words = [word for word in tokenized_words if word not in STOPWORDS]
    return ' '.join(words)
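A minimal usage sketch for process(); STOPWORDS is whatever stopword set the surrounding module defines, here assumed to be NLTK's English stopword list.

import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import text_to_word_sequence

nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

print(process("This is an example of a sentence with stopwords"))
# e.g. -> 'example sentence stopwords'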
Code Example #8
File: textgenrnn.py Project: kurecka/textgenrnn
    def train_on_texts(self,
                       texts,
                       context_labels=None,
                       batch_size=128,
                       num_epochs=50,
                       verbose=1,
                       new_model=False,
                       gen_epochs=1,
                       train_size=1.0,
                       max_gen_length=300,
                       validation=True,
                       dropout=0.0,
                       via_new_model=False,
                       save_epochs=0,
                       multi_gpu=False,
                       **kwargs):

        if new_model and not via_new_model:
            self.train_new_model(texts,
                                 context_labels=context_labels,
                                 num_epochs=num_epochs,
                                 gen_epochs=gen_epochs,
                                 train_size=train_size,
                                 batch_size=batch_size,
                                 dropout=dropout,
                                 validation=validation,
                                 save_epochs=save_epochs,
                                 multi_gpu=multi_gpu,
                                 **kwargs)
            return

        if context_labels:
            context_labels = LabelBinarizer().fit_transform(context_labels)

        if 'prop_keep' in kwargs:
            train_size = kwargs['prop_keep']

        if self.config['word_level']:
            texts = [text_to_word_sequence(text, filters='') for text in texts]

        # calculate all combinations of text indices + token indices
        indices_list = [
            np.meshgrid(np.array(i), np.arange(len(text) + 1))
            for i, text in enumerate(texts)
        ]
        indices_list = np.block(indices_list)

        # If a single text, there will be 2 extra indices, so remove them
        # Also remove first sequences which use padding
        if self.config['single_text']:
            indices_list = indices_list[self.config['max_length']:-2, :]

        indices_mask = np.random.rand(indices_list.shape[0]) < train_size

        if multi_gpu:
            num_gpus = len(K.tensorflow_backend._get_available_gpus())
            batch_size = batch_size * num_gpus

        gen_val = None
        val_steps = None
        if train_size < 1.0 and validation:
            indices_list_val = indices_list[~indices_mask, :]
            gen_val = generate_sequences_from_texts(texts, indices_list_val,
                                                    self, context_labels,
                                                    batch_size)
            val_steps = max(
                int(np.floor(indices_list_val.shape[0] / batch_size)), 1)

        indices_list = indices_list[indices_mask, :]

        num_tokens = indices_list.shape[0]
        assert num_tokens >= batch_size, "Fewer tokens than batch_size."

        level = 'word' if self.config['word_level'] else 'character'
        print("Training on {:,} {} sequences.".format(num_tokens, level))

        steps_per_epoch = max(int(np.floor(num_tokens / batch_size)), 1)

        gen = generate_sequences_from_texts(texts, indices_list, self,
                                            context_labels, batch_size)

        base_lr = 4e-3

        # scheduler function must be defined inline.
        def lr_linear_decay(epoch):
            return (base_lr * (1 - (epoch / num_epochs)))

        if context_labels is not None:
            if new_model:
                weights_path = None
            else:
                weights_path = "{}_weights.hdf5".format(self.config['name'])
                self.save(weights_path)

            self.model = textgenrnn_model(self.num_classes,
                                          dropout=dropout,
                                          cfg=self.config,
                                          context_size=context_labels.shape[1],
                                          weights_path=weights_path)

        model_t = self.model

        if multi_gpu:
            # Do not locate model/merge on CPU since sample sizes are small.
            parallel_model = multi_gpu_model(self.model,
                                             gpus=num_gpus,
                                             cpu_merge=False)
            parallel_model.compile(loss='categorical_crossentropy',
                                   optimizer=RMSprop(lr=4e-3, rho=0.99))

            model_t = parallel_model
            print("Training on {} GPUs.".format(num_gpus))

        model_t.fit_generator(gen,
                              steps_per_epoch=steps_per_epoch,
                              epochs=num_epochs,
                              callbacks=[
                                  LearningRateScheduler(lr_linear_decay),
                                  generate_after_epoch(self, gen_epochs,
                                                       max_gen_length),
                                  save_model_weights(self, num_epochs,
                                                     save_epochs)
                              ],
                              verbose=verbose,
                              max_queue_size=10,
                              validation_data=gen_val,
                              validation_steps=val_steps)

        # Keep the text-only version of the model if using context labels
        if context_labels is not None:
            self.model = Model(inputs=self.model.input[0],
                               outputs=self.model.output[1])
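A hypothetical call sketch for train_on_texts, assuming the textgenrnn package's class of the same name (the snippet above comes from the kurecka/textgenrnn project); the toy corpus and hyperparameters are assumptions, and the corpus must contain more tokens than batch_size.

from textgenrnn import textgenrnn

texts = ["an example training sentence for the character-level model"] * 200  # toy corpus (assumption)
textgen = textgenrnn()
textgen.train_on_texts(texts, num_epochs=2, gen_epochs=1, batch_size=32)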
Code Example #9
 def split_and_pad(input_string):
     tokenised_string = text_to_word_sequence(input_string)
     return ' '.join(tokenised_string[:min(self.max_seq_len, len(tokenised_string))])
Code Example #10
 def test_text_to_word_sequence_unicode_multichar_split(self):
   text = u'ali!stopveli?stopkırkstopdokuzstopelli'
   seq = preprocessing_text.text_to_word_sequence(text, split='stop')
   self.assertEqual(seq, [u'ali', u'veli', u'kırk', u'dokuz', u'elli'])
Code Example #11
 def test_text_to_word_sequence_unicode(self):
   text = u'ali! veli? kırk dokuz elli'
   seq = preprocessing_text.text_to_word_sequence(text)
   self.assertEqual(seq, [u'ali', u'veli', u'kırk', u'dokuz', u'elli'])
Code Example #12
 def test_text_to_word_sequence_multichar_split(self):
   text = 'hello!stop?world!'
   seq = preprocessing_text.text_to_word_sequence(text, split='stop')
   self.assertEqual(seq, ['hello', 'world'])
Code Example #13
 def test_text_to_word_sequence(self):
   text = 'hello! ? world!'
   seq = preprocessing_text.text_to_word_sequence(text)
   self.assertEqual(seq, ['hello', 'world'])
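For reference, the defaults that the tests above rely on look roughly like this (the import path assumes the TensorFlow-bundled Keras preprocessing module):

from tensorflow.keras.preprocessing.text import text_to_word_sequence

seq = text_to_word_sequence(
    'Hello, World!',
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',  # default punctuation filter
    lower=True,                                      # lowercase before splitting
    split=' ',                                       # the multichar tests above override this with split='stop'
)
print(seq)  # ['hello', 'world']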
Code Example #14
import re

import nltk
from nltk.corpus import stopwords

# `data` below is assumed to be a pandas DataFrame with a 'Description' column,
# loaded earlier in the original script.
try:
    stopword = stopwords.words('english')
except:
    nltk.download('stopwords')
    nltk.download('punkt')
    stopword = stopwords.words('english')

x = ' '.join(stopword)
nt = re.findall(r"\w+'t", x)
stopword = set(stopword) - ({'no', 'nor', 'not', 'only', 'too'} | set(nt))

# preprocessing of description
data['clean_docs'] = data['Description'].str.replace("[^a-zA-Z']", ' ', regex=True)
data['clean_docs'] = data['clean_docs'].str.replace("'[a-su-zA-SU-Z]", ' ', regex=True)
data['clean_docs'] = data['clean_docs'].str.replace(' +', ' ', regex=True)
data['clean_docs'] = data['clean_docs'].str.lower()

from tensorflow.keras.preprocessing.text import text_to_word_sequence
data['clean_docs'] = data['clean_docs'].apply(
    lambda x:
    [t for t in text_to_word_sequence(x) if t not in stopword and len(t) > 1])

# remove rows whose cleaned token list ended up empty
data = data[data['clean_docs'].map(len) > 0]

# concatenating all clean docs
data['clean_docs'] = data['clean_docs'].apply('#'.join)

data.to_csv('../src/database/final_perfume_data_clean_with_clean_docs.csv',
            index=False)
Code Example #15
def user_input_preprocessing(user_input):
    res = re.sub("[^a-zA-Z']", ' ', user_input)
    res = re.sub("'[a-su-zA-SU-Z]", ' ', res)
    res = text_to_word_sequence(res)
    return res
 def clean_sentence(self, sentence):
     return text_to_word_sequence(sentence, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')
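A small usage sketch for user_input_preprocessing from the example above, assuming `re` and text_to_word_sequence are imported as elsewhere in this listing.

tokens = user_input_preprocessing("I don't like overly-sweet scents!")
print(tokens)  # e.g. ['i', "don't", 'like', 'overly', 'sweet', 'scents']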