Example 1
def give_vocabulary(sentences_df):
    '''
    @param sentences_df: the dataframe read from the json file with the 5 columns we need
    @returns: the vocabulary as a set.
    '''
    vocabulary = []
    list_of_sentences1 = sentences_df['sentence1'].tolist()
    list_of_sentences2 = sentences_df['sentence2'].tolist()
    list_sentence_words = []
    '''
    # Do same with keras
    for sentence in list_of_sentences1:
        sentence.lower()
        #tokenize or split by " "
        tokens1 = sentence.split(" ")
        for token1 in tokens1:
            if token1 not in vocabulary:
                vocabulary.append(token1)
    '''
    list_sentence_word_tmp = []
    for s1, s2 in zip (list_of_sentences1, list_of_sentences2):
        sentence_unicode1 = make_unicode(s1)
        sentence_unicode2 = make_unicode(s2)
        #print sentence_no_unicode
        list_sentence_word_tmp += text_to_word_sequence(sentence_unicode1.encode('ascii'), filters=base_filter(), lower=True, split=" ")
        list_sentence_word_tmp += text_to_word_sequence(sentence_unicode2.encode('ascii'), filters=base_filter(), lower=True, split=" ")

    set_words = set(list_sentence_word_tmp)
    #print word2idx
    print "length of vocabulary: %d"%len(set_words)
    return set_words
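A hypothetical call for orientation (added here, not part of the original project); it assumes a pandas DataFrame with the two sentence columns read above, plus the project's make_unicode helper and the legacy Keras base_filter import.

import pandas as pd

# toy dataframe with the two columns give_vocabulary actually reads
sentences_df = pd.DataFrame({'sentence1': ['A man plays a guitar.'],
                             'sentence2': ['Someone is playing music.']})
vocab = give_vocabulary(sentences_df)   # e.g. set(['a', 'man', 'plays', ...])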
Example 2
def get_docs_and_intervention_summaries(pico_elem_str="CHAR_INTERVENTIONS"):
    pairs = []
    p = PDFBiViewer()
    for study in p: 
        cdsr_entry = study.cochrane 
        text = study.studypdf['text']
        intervention_text = cdsr_entry["CHARACTERISTICS"][pico_elem_str]
        if intervention_text is not None:
            #pairs.append((nltk.word_tokenize(text), 
            #              nltk.word_tokenize(intervention_text)))
            pairs.append((text_to_word_sequence(text), 
                          text_to_word_sequence(intervention_text)))

    return pairs 
def sentence2sequence(sentence, w2i):
    sequence = []
    words = text_to_word_sequence(sentence)
    for word in words:
        if word in w2i:
            sequence.append(w2i[word])
    return sequence
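A small illustration (added here, not from the original source) of the lookup above; it assumes text_to_word_sequence is imported as in the surrounding snippets. Words missing from w2i are silently dropped.

w2i = {'deep': 1, 'learning': 2, 'is': 3, 'fun': 4}
print(sentence2sequence("Deep learning is really fun", w2i))   # [1, 2, 3, 4] ('really' is skipped)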
Example 4
    def fit(self,texts):
        sequences=[]
        for text in texts:
            sequences.append(text_to_word_sequence(text))
        for seq in sequences:
            for word in seq:
                if word in self.wordCount:
                    self.wordCount[word]+=1
                else:
                    self.wordCount[word]=1
        wcounts = list(self.wordCount.items())
        wcounts.sort(key = lambda x: x[1], reverse=True)

        self.encoder['<PAD>']=0
        self.encoder['<END>']=1
        self.encoder['<UNK>']=2

        self.decoder[0]='<PAD>'
        self.decoder[1]='<END>'
        self.decoder[2]='<UNK>'

        self.wordCount.clear()
        for i in range(len(wcounts)):
            pair=wcounts[i]
            self.encoder[pair[0]]=i+3
            self.decoder[i+3]=pair[0]
            if i<self.maxWords:
                self.wordCount[pair[0]]=pair[1]

        print('Most Frequent 20 words:')
        for i in range(min(20,len(wcounts))):
            print(wcounts[i])
def convert_text_to_index_array(text):
    words = kpt.text_to_word_sequence(text)
    wordIndices = []
    for word in words:
        if word in dictionary:
            wordIndices.append(dictionary[word])
        else:
            print("'%s' not in training corpus; ignoring." %(word))
    return wordIndices
 def get_vectors_from_text(dataset_list,word_to_ind=imdb.get_word_index(),
                         start_char=1,
                         index_from=3,
                         maxlen=400,
                         num_words=5000,
                         oov_char=2,skip_top=0):
     '''
     Gets the list vector mapped according to the word to indices dictionary.
     
     @param
         dataset_list = list of review texts in unicode format
         word_to_ind = word to indices dictionary
         hyperparameters: start_char --> character prepended to mark the start of each sequence.
                          index_from --> offset added to every word index (lower indices are reserved).
                          maxlen --> maximum length of the sequence to be considered.
                          num_words --> number of words to keep, ranked by frequency of occurrence.
                          oov_char --> out-of-vocabulary character.
                          skip_top --> number of top-ranked (most frequent) words to be skipped.
     @returns:
         x_train:       Final list of vectors(as list) of the review texts
     '''
     x_train = []
     for review_string in dataset_list:
         review_string_list = text_to_word_sequence(review_string)
         review_string_list = [ele for ele in review_string_list]
         
         x_predict = []
         for i in range(len(review_string_list)):
             if review_string_list[i] not in word_to_ind:
                 continue
             x_predict.append(word_to_ind[review_string_list[i]])
         x_train.append((x_predict))
     # add the start char and apply the index_from offset
     if start_char is not None:
         x_train = [[start_char] + [w + index_from for w in x] for x in x_train]
     elif index_from:
         x_train = [[w + index_from for w in x] for x in x_train]
     # truncate each sequence to maxlen
     x_train=[ele[:maxlen] for ele in x_train]
     # if num_words is not given, infer it from the data
     if not num_words:
         num_words = max([max(x) for x in x_train])
     # by convention, use 2 as OOV word
     # reserve 'index_from' (=3 by default) characters:
     # 0 (padding), 1 (start), 2 (OOV)
     if oov_char is not None:
         x_train = [[w if (skip_top <= w < num_words) else oov_char for w in x] for x in x_train]
     else:
         x_train = [[w for w in x if (skip_top <= w < num_words)] for x in x_train]
     # pad the sequences to maxlen
     x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
     # return the vectors form of the text
     return x_train
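A hedged usage sketch (not in the original code) of the function above; it assumes the module-level imports the function already relies on (text_to_word_sequence, sequence from keras.preprocessing and imdb from keras.datasets) are in place.

reviews = [u"this movie was great fun", u"terrible acting and a boring plot"]
vectors = get_vectors_from_text(reviews, maxlen=10)
print(vectors.shape)   # (2, 10): one padded row of word indices per review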
def generate_words_set(data_file):
    words = set()
    with open(data_file, 'r') as fr:
        lines = fr.readlines()
        for i in xrange(len(lines)):
            splits = lines[i].split('\t')
            text = splits[0] + ' ' + splits[1]
            for word in text_to_word_sequence(text):   # when training the model, text must be converted to indices with this same text_to_word_sequence function
                words.add(word)
    words = sorted(words)
    return words
Example 8
 def transform(self,texts):
     rv=[]
     for i in range(len(texts)):
         sequence=text_to_word_sequence(texts[i])
         if len(sequence)==0:
             rv.append([0])
             continue
         list_of_scalars=[]
         for j in range(len(sequence)):
             if sequence[j] not in self.wordCount:
                 list_of_scalars.append(self.encoder['<UNK>'])
             else:
                 list_of_scalars.append(self.encoder[sequence[j]])
         rv.append(list_of_scalars)
     return rv
    def _handle_rare_words(self, captions):
        if self._rare_words_handling == 'nothing':
            return captions
        elif self._rare_words_handling == 'discard':
            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(captions)
            new_captions = []
            for caption in captions:
                words = text_to_word_sequence(caption)
                new_words = [w for w in words
                             if tokenizer.word_counts.get(w, 0) >=
                             self._words_min_occur]
                new_captions.append(' '.join(new_words))
            return new_captions

        raise NotImplementedError('rare_words_handling={} is not implemented '
                                  'yet!'.format(self._rare_words_handling))
Example 10
    def encode_text_vectors(self, texts, pca_dims=50, tsne_dims=None,
                            tsne_seed=None, return_pca=False,
                            return_tsne=False):

        # if a single text, force it into a list:
        if isinstance(texts, str):
            texts = [texts]

        vector_output = Model(inputs=self.model.input,
                              outputs=self.model.get_layer('attention').output)
        encoded_vectors = []
        maxlen = self.config['max_length']
        for text in texts:
            if self.config['word_level']:
                text = text_to_word_sequence(text, filters='')
            text_aug = [self.META_TOKEN] + list(text[0:maxlen])
            encoded_text = textgenrnn_encode_sequence(text_aug, self.vocab,
                                                      maxlen)
            encoded_vector = vector_output.predict(encoded_text)
            encoded_vectors.append(encoded_vector)

        encoded_vectors = np.squeeze(np.array(encoded_vectors), axis=1)
        if pca_dims is not None:
            assert len(texts) > 1, "Must use more than 1 text for PCA"
            pca = PCA(pca_dims)
            encoded_vectors = pca.fit_transform(encoded_vectors)

        if tsne_dims is not None:
            tsne = TSNE(tsne_dims, random_state=tsne_seed)
            encoded_vectors = tsne.fit_transform(encoded_vectors)

        return_objects = encoded_vectors
        if return_pca or return_tsne:
            return_objects = [return_objects]
        if return_pca:
            return_objects.append(pca)
        if return_tsne:
            return_objects.append(tsne)

        return return_objects
Example 11
X_val_text = list(X_val_text)

#------------get split point-----------
train_val_split = len(X_train_text)
val_test_split = int(len(X_val_text) / 2)

train_val = X_train_text + X_val_text

#------get the word index and split into in train,val,test set

data = np.zeros((len(train_val), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, sentences in enumerate(train_val):
    for j, sent in enumerate(sentences):
        if j < MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
                if k < MAX_SENT_LENGTH and tokenizer.word_index[
                        word] < MAX_NB_WORDS:
                    data[i, j, k] = tokenizer.word_index[word]
                    k = k + 1

#--------------now set data ready--------------------------------------

X_train = data[:train_val_split]
X_val = data[train_val_split:train_val_split + val_test_split]
X_test = data[train_val_split + val_test_split:]

#sys.exit(0)
Example 12
# #                     print (indices)
#                 except:
#                     continue
#     train_out[i] = indices
#     i=i+1
import nltk
from keras.preprocessing.text import text_to_word_sequence
raw_output = corpus.findall('.//sentence')
train_out = []
delet = []
print(data.shape)
data = np.array(data)
print(data.shape)
i=0
for output in raw_output:
    s = text_to_word_sequence(output.find('text').text, lower=True)
    indices = np.zeros(MAX_SEQ_LENGTH)
    
    aspectTerms = output.find('aspectTerms')
    if (aspectTerms):
        aspectTerm = aspectTerms.findall('aspectTerm')
        k=0
        if (len(aspectTerm)>0):
            for aspect_term in aspectTerm:
                try:
                    aspt = text_to_word_sequence(aspect_term.attrib['term'])
                    if(len(aspt) < 2):
                        indices[s.index(aspt[0])] = 1
                    else:
                        k=1
                        break
Example 13
#------------------------------------------------------------------------------#

#------------------------------------------------------------------------------#
########## Data Processing starts here #########################################
#------------------------------------------------------------------------------#
load_training_data()
#------------------------------------------------------------------------------#

#------------------------------------------------------------------------------#
text_words = []
for i in range(0, len(data)):
    temp_data = data['text'][i]
    temp_data_word = text_to_word_sequence(
        temp_data,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,
        split=" ")
    text_words.append(temp_data_word)
#print (text_words)

word_counts = collections.Counter(itertools.chain(*text_words))
#print (word_counts)
vocabulary_size = len(word_counts)
print("vocabulary_size = %s" % vocabulary_size)
"""
## for debugging only
temp_list = []
for i in range (0, len(text_words)):
    temp_list.append(len(text_words[i]))
print (heapq.nlargest(10, temp_list))
Example 14
embedding_model.compile(loss='categorical_crossentropy',
                        optimizer='rmsprop',
                        metrics=['acc'])
embedding_output = embedding_model.predict(data)
print('Generated word Embeddings..')
print('Shape of Embedding_output', embedding_output.shape)

train_input = np.zeros(shape=(len(data), 69, 306))
le = preprocessing.LabelEncoder()
tags = ["CC", "NN", "JJ", "VB", "RB", "IN"]
le.fit(tags)
i = 0

for sent in textPost:
    s = text_to_word_sequence(sent)
    tags_for_sent = nltk.pos_tag(s)
    sent_len = len(tags_for_sent)
    ohe = [0] * 6

    for j in range(69):
        if j < len(tags_for_sent) and tags_for_sent[j][1][:2] in tags:
            ddLe = le.transform([tags_for_sent[j][1][:2]])
            ohe[ddLe[0]] = 1
        train_input[i][j] = np.concatenate([embedding_output[i][j], ohe])
    i = i + 1
print('Concatenated Word-Embeddings and POS Tag Features...')

print('Training Model...')
model = Sequential()
model.add(Conv1D(100, 5, padding="same", input_shape=(69, 306)))
Example 15
def textprcoessingforword2vec(input_data):
    sentences = []
    for i in range(len(input_data)):
        sentences.append(text_to_word_sequence(input_data[i],filters='\t\n'+"'",split=" "))
    return sentences
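Added illustration (not from the original project): because the filter here is restricted to tabs, newlines and apostrophes, ordinary punctuation stays attached to the tokens.

print(textprcoessingforword2vec(["First doc, here.", "Second one!"]))
# [['first', 'doc,', 'here.'], ['second', 'one!']]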
Example 16
 def similarity(self, x, y):
     x_vec = self.doc2vec.infer_vector(text_to_word_sequence(x),
                                       steps=self.steps)
     y_vec = self.doc2vec.infer_vector(text_to_word_sequence(y),
                                       steps=self.steps)
     return 1 - cosine(x_vec, y_vec)
Example 17
def clean(text):
    return text_to_word_sequence(
        text,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,
        split=" ")
Example 18
    if (record.find('[report_end]') != -1):
        content = record[record.find('\n') +
                         1:record.find('[report_end]')].strip()
        content = expand_abbr(content)
        content = content.replace('\'s', " 's").replace("'d", " 'd")
        content = content.replace("'s", " 's")
        content = content.replace("can't", "cannot")
        content = content.replace("couldn't", "could not")
        content = content.replace("won't", "will not")
        content = content.replace("wasn't", "was not")
        content = content.replace("hasn't", "has not")
        content = content.replace("don't", "do not")
        content = content.replace("didn't", "did not")
        content = content.replace("doesn't", "does not")

        word_list = text.text_to_word_sequence(content, lower=True, split=" ")
        word_list = clean_wds(word_list)
        #filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
        str_to_write = ' '
        str_to_write = str_to_write.join(word_list)
        corpus.append(str_to_write)
        corpus_file.write(str_to_write + '\n')
print(len(corpus))
corpus_file.close()
train_dic = get_dic('data/Obesity_data/train_groundtruth.xml')
test_dic = get_dic('data/Obesity_data/test_groundtruth.xml')

# Read Word Vectors
word_vector_file = 'data/mimic3_pp100.txt'
vocab, embd, word_vector_map = loadWord2Vec(word_vector_file)
embedding_dim = len(embd[0])
Example 19
    if not os.path.isdir(fpath):
        program = []
        with open(fpath) as fr:
            lines = fr.readlines()
            for row in range(len(lines)):
                if "-------" not in lines[row]:
                    program.append(lines[row].strip("\n"))
                else:
                    labels.append(int(program[-1]))
                    program.pop(0)
                    program.pop(-1)
                    texts.append(" ".join(program))
                    program = []
print("Found %s programs." % len(texts))

texts = text_to_word_sequence(" ".join(texts),
                              filters='!"#$%&()*+,./:;<=>?@[]^`|~')
with open("word1.txt", "w") as fw:
    fw.write(" ".join(texts))

texts = list(set(texts))
i = 0
sequences = word2vec.Text8Corpus("./word1.txt")
model = word2vec.Word2Vec(sequences, size=150, min_count=1)
with open("word2vector1.txt", "w") as fw:
    writer = csv.writer(fw, delimiter=" ")
    for word in texts:
        # print(model[word].tolist())
        try:
            print(i)
            i += 1
            writer.writerow([word] + model[word].tolist())
Example 20
def context_question_text_preprocess(cnt_max_len, qn_max_len):
    """
    get corpus
    """
    # file path name
    file_train_context = '../data/train.context'
    file_train_question = '../data/train.question'
    file_dev_context = '../data/dev.context'
    file_dev_question = '../data/dev.question'
    file_train_span = '../data/train.span'
    file_dev_span = '../data/dev.span'

    # text and index list
    txt_train_cnt = read_txt_from_file(file_train_context)
    txt_train_qst = read_txt_from_file(file_train_question)
    txt_dev_cnt = read_txt_from_file(file_dev_context)
    txt_dev_qst = read_txt_from_file(file_dev_question)
    idx_train_beg, idx_train_end = read_index_from_file(
        file_train_span, cnt_max_len)
    idx_dev_beg, idx_dev_end = read_index_from_file(file_dev_span, cnt_max_len)

    cnt_all_txt = txt_train_cnt + txt_dev_cnt
    qst_all_txt = txt_train_qst + txt_dev_qst

    # from keras.preprocessing.text import Tokenizer
    # length (in words) of every context and question
    l_cnt = list(map(lambda x: len(T.text_to_word_sequence(x)), cnt_all_txt))
    l_qst = list(map(lambda x: len(T.text_to_word_sequence(x)), qst_all_txt))

    # average length (in words) of contexts and questions
    import functools
    l_all_cnt = functools.reduce(lambda x, y: x + y, l_cnt)
    l_all_qst = functools.reduce(lambda x, y: x + y, l_qst)
    l_average_cnt = l_all_cnt / len(cnt_all_txt)
    l_average_qst = l_all_qst / len(qst_all_txt)

    # tokenization
    t = Tokenizer()  # tokenizer
    txt_list = txt_train_cnt + txt_train_qst + txt_dev_cnt + txt_dev_qst
    t.fit_on_texts(txt_list)
    vocab_size = len(t.word_index) + 1

    enc_txt_train_cnt = t.texts_to_sequences(txt_train_cnt)
    enc_txt_train_qst = t.texts_to_sequences(txt_train_qst)
    enc_txt_dev_cnt = t.texts_to_sequences(txt_dev_cnt)
    enc_txt_dev_qst = t.texts_to_sequences(txt_dev_qst)

    pad_txt_train_cnt = pad_sequences(enc_txt_train_cnt,
                                      maxlen=cnt_max_len,
                                      padding='post')
    pad_txt_train_qst = pad_sequences(enc_txt_train_qst,
                                      maxlen=qn_max_len,
                                      padding='post')
    pad_txt_dev_cnt = pad_sequences(enc_txt_dev_cnt,
                                    maxlen=cnt_max_len,
                                    padding='post')
    pad_txt_dev_qst = pad_sequences(enc_txt_dev_qst,
                                    maxlen=qn_max_len,
                                    padding='post')

    # load embedding
    embeddings_index = load_emb()

    # create a weight matrix for words in training docs
    embedding_matrix = zeros((vocab_size, 50))
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    print("context average number of character is {}".format(l_average_cnt))
    print("context max number of character is {}".format(cnt_max_len))
    print("question average number of character is {}".format(l_average_qst))
    print("question max number of character is {}".format(qn_max_len))

    print("index of answer is index of word, not character")

    return embedding_matrix, vocab_size, pad_txt_train_cnt, pad_txt_train_qst, pad_txt_dev_cnt, pad_txt_dev_qst, \
           idx_train_beg, idx_train_end, idx_dev_beg, idx_dev_end
Example 21
import keras.preprocessing.text as T


text = '''
本文介绍keras提供的预处理包keras.preproceing下的text与序列处理模块sequence模块 2 text模块提供的方法 text_to_word_sequence(text,fileter) 可以
'''
print(T.text_to_word_sequence(text=text))

df = drop_useless_features(df, [
    "helpful", "reviewTime", "reviewerID", "unixReviewTime", "asin",
    "reviewerName", "overall", "summary"
])

# Drop NaN values; they are mostly rows where the overall column has
# neutral sentiment (3.0), which is not our target anyway
df = df.dropna()

# Removing punctuation
# Converting to lowercase and cleaning punctuation

df['reviewText'] = df['reviewText'].apply(
    lambda x: ' '.join(text_to_word_sequence(x)))

# removing numbers from the column of reviewText

df['reviewText'] = df['reviewText'].str.replace('\d+', '')

# Plot positive and negative rating
plot_size = plt.rcParams["figure.figsize"]
plot_size[0] = 8
plot_size[1] = 6
plt.rcParams["figure.figsize"] = plot_size

df.rating.value_counts().plot(kind='pie', autopct='%1.0f%%')
df.reviewText.str.len().max()

# prepare tokenizer
Example 23
final_stop_words = set(
    [word for word in stopword if word not in not_stopwords])
speller = Speller()

for i in range(len(df['comments'])):
    df['comments'][i] = re.sub("[0-9]+", " ", str(
        df['comments'][i]))  #removing digits, since they're not important
    df['comments'][i] = deEmojify(df['comments'][i])
    df['comments'][i] = strip_punctuation(df['comments'][i])
    df['comments'][i] = ' '.join(
        speller(word) for word in df['comments'][i].split() if word not in
        final_stop_words)  #removing stopwords and spell-correcting

max_sent_len = 100
max_vocab_size = 1500
word_seq = [text_to_word_sequence(comment) for comment in df['comments']]
# print(word_seq)

# vectorizing the text corpus, turning each text into a sequence of integers (each integer being the index of a token in a dictionary)
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(
    [' '.join(seq[:max_sent_len]) for seq in word_seq]
)  #Updates internal vocabulary based on a list of texts up to the max_sent_len.
# print("vocab size: ", len(tokenizer.word_index)) #vocab size: 949

#converting sequence of words to sequence of indices
X = tokenizer.texts_to_sequences(
    [' '.join(seq[:max_sent_len]) for seq in word_seq])
X = pad_sequences(X, maxlen=max_sent_len, padding='post', truncating='post')
# X = np.expand_dims(X, axis =2) #reshape X to 3 dimensions
# X = np.reshape(X, (X.shape[0], X.shape[1], 1))
def convert_text_to_index_array(text):
    # `text_to_word_sequence` lowercases the text, strips punctuation and splits
    # it into a list of word tokens; each token is then looked up in `dictionary`
    # (a KeyError is raised for words that were not in the training corpus).
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]
Example 25
 def tokenize(self, text):
     return text_to_word_sequence(text)
Example 26
def convert_text_to_index_array(text):
    # `text_to_word_sequence` lowercases the text, strips punctuation and splits
    # it into a list of word tokens; each token is then looked up in `dictionary`
    # (a KeyError is raised for words that were not in the training corpus).
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]
Example 27
def get_nomalized_test_data(train_vocab):
    #get the train data
    data = get_data.Datasets()
    test_data = data.get_test_data()
    test_reviews = []
    test_sentences = []
    test_labels = []

    #test data preprocessing ----------

    #clean the test dataset
    for test in test_data["review"]:
        cleaned_test = data.clean_text_to_text(test)
        test_reviews.append(cleaned_test)
        sentences = tokenize.sent_tokenize(cleaned_test)
        test_sentences.append(sentences)

    #define the label
    for id in test_data["id"]:
        # print(id)
        id = id.strip('"')
        # print(id)
        id, score = id.split('_')
        score = int(score)
        if (score < 5):
            test_labels.append(0)
        if (score >= 7):
            test_labels.append(1)

    #test----
    #create a tokenizer and limit only dealt with top 20000 words
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(test_reviews)

    print("train_vocab")
    print(train_vocab)
    print(len(train_vocab))

    #define the test_matrix
    test_matrix = np.zeros((len(test_reviews), SEN_NUM, WORDS_NUM),
                           dtype='int32')  #(250000,15,100)

    #print(test_matrix.shape)
    non_exist = 0
    for review_index, review in enumerate(test_sentences):

        for sentence_index, sentence in enumerate(review):

            if (sentence_index < SEN_NUM):
                #print(sentence)
                tokens = text_to_word_sequence(sentence)

                num = 0

                for _, token in enumerate(tokens):
                    #see if the token is in the vocab

                    if (token not in train_vocab.keys()):
                        print(token)
                        non_exist += 1
                        continue
                    if (num < WORDS_NUM and train_vocab[token] < MAX_NB_WORDS):
                        test_matrix[review_index, sentence_index,
                                    num] = train_vocab[token]
                        num += 1

    print(non_exist)
    #test_labels-> tocategory

    predicted_labels = to_categorical(np.asarray(test_labels))

    return test_matrix, predicted_labels
Example 28
 def similarity(self, x, y):
     x = text_to_word_sequence(x)
     y = text_to_word_sequence(y)
     return -self.word2vec.wv.wmdistance(x, y)
Example 29
def run_model(df, MAX_WORDS, MAX_SENTS, MAX_SENT_LENGTH, VALIDATION_SPLIT, EMBEDDING_DIM):
    x = df['Text']
    y = df['Class']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1000, stratify=y)

    # split data into sentences, using spacy sentencizer
    # since we are only using spacy for sentencizing, we only invoke the sentencizer
    nlp = spacy.blank("en")
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    x_train_sentences = pd.Series([doc for doc in nlp.pipe(x_train)]).apply(lambda x: [sent for sent in x.sents])

    # oov_token=True reserves a token for unknown words (rather than ignoring the word)
    tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token=True)
    tokenizer.fit_on_texts(x_train.values)

    data = np.zeros((len(x_train_sentences), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

    # i is each sample index number
    # samples is each sample
    for i, samples in enumerate(x_train_sentences):
        # j is the sentence index number
        # sentences is each sentence
        for j, sentences in enumerate(samples):
            if j < MAX_SENTS:
                # wordTokens is list of tokens
                wordTokens = text_to_word_sequence(str(sentences))
                k = 0
                # word is each individual token
                for _, word in enumerate(wordTokens):
                    if k < MAX_SENT_LENGTH:
                        if word not in tokenizer.word_index:  # remove special characters
                            continue
                        if tokenizer.word_index[word] < MAX_WORDS:
                            data[i, j, k] = tokenizer.word_index[word]
                            k += 1

    print('Total %s unique tokens.' % len(tokenizer.word_index))

    labels = pd.get_dummies(y_train.values)

    print('Shape of samples tensor:', data.shape)
    print('Shape of labels tensor:', labels.shape)

    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]

    print('Number of samples of each class in training and validation set')
    print(y_train.sum(axis=0))
    print(y_val.sum(axis=0))

    embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                                EMBEDDING_DIM,
                                input_length=MAX_SENT_LENGTH,
                                trainable=True)

    word_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
    word_sequences = embedding_layer(word_input)
    word_lstm = Bidirectional(LSTM(100, return_sequences=True))(word_sequences)
    word_dense = TimeDistributed(Dense(200))(word_lstm)
    word_att = AttentionWithContext()(word_dense)
    wordEncoder = Model(word_input, word_att)

    sent_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    sent_encoder = TimeDistributed(wordEncoder)(sent_input)
    sent_lstm = Bidirectional(LSTM(100, return_sequences=True))(sent_encoder)
    sent_dense = TimeDistributed(Dense(200))(sent_lstm)
    sent_att = AttentionWithContext()(sent_dense)
    predictions = Dense(20, activation='softmax')(sent_att)
    model = Model(sent_input, predictions)

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])

    print(model.summary())

    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)

    history = model.fit(x_train, y_train, validation_data=(x_val, y_val),
                        epochs=50, batch_size=50, callbacks=[es])

    # vectorize test set the same way (keep the sentences grouped per document so
    # the tensor shape matches the training data and the labels)
    x_test_sentences = pd.Series([doc for doc in nlp.pipe(x_test)]).apply(lambda x: [sent for sent in x.sents])

    test_data = np.zeros((len(x_test_sentences), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    test_labels = pd.get_dummies(y_test.values)

    print('Shape of test samples tensor:', test_data.shape)
    print('Shape of test labels tensor:', test_labels.shape)
    print('Number of samples of each class in test set: ', test_labels.sum(axis=0))

    for i, samples in enumerate(x_test_sentences):
        # j is the sentence number
        # sentences is each sentence
        for j, sentences in enumerate(samples):
            if j < MAX_SENTS:
                # wordTokens is list of tokens
                wordTokens = text_to_word_sequence(str(sentences))
                k = 0
                # word is each individual token
                for _, word in enumerate(wordTokens):
                    if k < MAX_SENT_LENGTH:
                        if word not in tokenizer.word_index:  # remove special characters
                            continue
                        if tokenizer.word_index[word] < MAX_WORDS:
                            test_data[i, j, k] = tokenizer.word_index[word]
                            k = k + 1

    x_test = test_data
    y_test_single = y_test # need this for classification report f1
    y_test = test_labels

    loss, accuracy = model.evaluate(x_train, y_train, verbose=2)
    print("Training Accuracy: {:.4f}".format(accuracy))

    loss, accuracy = model.evaluate(x_test, y_test, verbose=2)
    print("Testing Accuracy:  {:.4f}".format(accuracy))

    plot_history(history)

    y_pred = np.argmax(model.predict(x_test), axis=1) + 1
    cm = confusion_matrix(y_test_single, y_pred, labels=None, sample_weight=None)
    print("\nClassification report summary:")
    print(classification_report(y_test_single, y_pred, labels=[i + 1 for i in range(20)], digits=3))

    return model, cm
Example 30
#import data after preprocessing
filename="C:/Documents/AI/RNN ass/cleanedup-news-file-test.csv"
#read data into raw text
raw_text = open(filename, encoding="utf8").read()
#lowercase
raw_text = raw_text.lower()
#build a translation table that strips punctuation
dropPunctuation = str.maketrans("", "", string.punctuation)
raw_text = raw_text.translate(dropPunctuation)
#define start time
start_time1 = datetime.datetime.now()
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Pretreat Data Section
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# create mapping of unique words to integers
lines = text_to_word_sequence(raw_text)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
encoded = tokenizer.texts_to_sequences(lines)
encoded_sequence=[]
for seq in encoded:
    encoded_sequence.append(seq[0])
sequences=[]
#create sequence for training
for i in range(0, len(encoded_sequence)-seq_length-1):
	sequence = encoded_sequence[i:i+seq_length+1]
	sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
Example 31
File: CBOW.py Project: tung2921/NLP
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence
from nltk.corpus import gutenberg
from string import punctuation
bible = gutenberg.sents('bible-kjv.txt')
remove_terms = punctuation + '0123456789'

norm_bible = [[word.lower() for word in sent if word not in remove_terms]
              for sent in bible]

norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible]

norm_bible = filter(None, normalize_corpus(norm_bible))
norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split()) > 2]

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)
word2id = tokenizer.word_index
# build vocabulary of unique words
word2id['PAD'] = 0
id2word = {v: k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)]
        for doc in norm_bible]
vocab_size = len(word2id)
embed_size = 100
window_size = 2  # context window size
print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])
Example 32
def create_dataset():
    """
    Create the IMDB dataset as numpy arrays.

    """
    st = time.time()
    print('Constructing dataset...')
    data_train = pd.read_csv('data/labeledTrainData.tsv', sep='\t')
    data_test = pd.read_csv('data/testData.tsv', sep='\t')

    from nltk import tokenize
    get_punkt_if_necessary(tokenize)

    reviews = []
    labels = []
    texts = []

    for idx in range(data_train.review.shape[0]):
        text = BeautifulSoup(data_train.review[idx], features="html.parser")
        text = clean_str(text.get_text().encode('ascii','ignore').decode())
        texts.append(text)
        sentences = tokenize.sent_tokenize(text)
        reviews.append(sentences)

        labels.append(data_train.sentiment[idx])


    for idx in range(data_test.review.shape[0]):
        text = BeautifulSoup(data_test.review[idx], features="html.parser")
        text = clean_str(text.get_text().encode('ascii','ignore').decode())
        texts.append(text) # texts is the raw text
        sentences = tokenize.sent_tokenize(text)
        reviews.append(sentences)

        if data_test.id[idx][-1] in "12345":
            labels.append(0)
        else:
            labels.append(1)

    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)

    print('Tokenizing...')
    data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

    for i, sentences in enumerate(reviews):
        for j, sent in enumerate(sentences):
            if j< MAX_SENTS:
                wordTokens = text_to_word_sequence(sent)
                k=0
                for _, word in enumerate(wordTokens):
                    if k<MAX_SENT_LENGTH and tokenizer.word_index[word]<=MAX_NUM_WORDS:
                        data[i,j,k] = tokenizer.word_index[word]
                        k=k+1

    word_index = tokenizer.word_index
    print('Total %s unique tokens.' % len(word_index))

    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    x_train = data[:25000]
    y_train = labels[:25000]
    x_val = data[25000:]
    y_val = labels[25000:]

    print('Number of positive and negative reviews in training and validation set')

    print('Creating dataset takes {}s.'.format(time.time()-st))
    print('Storing dataset...')

    np.save('data/x_train.npy', x_train)
    np.save('data/y_train.npy', y_train)
    np.save('data/x_val.npy', x_val)
    np.save('data/y_val.npy', y_val)

    with open('data/word_index.pkl','wb') as f:
        pkl.dump(word_index, f)
Example 33
with open('data.txt', 'r') as file:
    text = file.read()
    lines = text.lower().split('\n')

from keras.preprocessing.text import text_to_word_sequence, Tokenizer

words = text_to_word_sequence(text)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(words)

vocabulary_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(lines)

subsequences = []
for sequence in sequences:
    for i in range(1, len(sequence)):
        subsequence = sequence[:i + 1]
        subsequences.append(subsequence)

from keras.preprocessing.sequence import pad_sequences

sequence_length = max([len(sequence) for sequence in sequences])
sequences = pad_sequences(subsequences, maxlen=sequence_length, padding='pre')

from keras.utils import to_categorical

x, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=vocabulary_size)

from keras.models import Sequential
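The snippet breaks off right after importing Sequential; the lines below are an assumed continuation for illustration only (layer sizes are placeholders, not taken from the original source).

from keras.layers import Embedding, LSTM, Dense

# each input row has sequence_length - 1 word indices after slicing off the target word
model = Sequential()
model.add(Embedding(vocabulary_size, 50, input_length=sequence_length - 1))
model.add(LSTM(100))
model.add(Dense(vocabulary_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.fit(x, y, epochs=100, verbose=2)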
Example 34
    def parse(self):
        """
		parse json file to generate table and vocab dict
		"""
        print('start loading json file...')
        train_ann = json.load(open(self.config.train_annFile, 'r'))
        train_ques = json.load(open(self.config.train_questionFile, 'r'))
        val_ann = json.load(open(self.config.val_annFile, 'r'))
        val_ques = json.load(open(self.config.val_questionFile, 'r'))
        print('load completed!')

        questions_train_ls = []
        questions_val_ls = []
        answers_train_ls = []
        answers_val_ls = []

        self.parse_answer(train_ann, answers_train_ls)
        self.parse_answer(val_ann, answers_val_ls)
        print('complete parser train data')
        self.parse_question(train_ques, questions_train_ls)
        self.parse_question(val_ques, questions_val_ls)
        print('complete parser val data')

        assert len(questions_train_ls) == len(answers_train_ls)
        assert len(questions_val_ls) == len(answers_val_ls)
        # check the data integrity

        questions = ' '.join([x[2] for x in questions_train_ls])
        questions = questions + ' ' + ' '.join(
            [x[2] for x in questions_val_ls])
        q_counter = Counter(text_to_word_sequence(questions))

        questions = set(dict(q_counter.most_common(6000)).keys())
        # questions = set()
        an_ls = [x[2]
                 for x in answers_train_ls] + [x[2] for x in answers_val_ls]
        a_counter = Counter(an_ls)
        answers = set(dict(a_counter.most_common(3000)).keys())

        self.build_vocab(questions, answers)
        print('complete build vocab')

        question_id_answer_train_df = pd.DataFrame(
            data=answers_train_ls,
            columns=['image_id', 'question_id', 'answer'])
        question_id_answer_val_df = pd.DataFrame(
            data=answers_val_ls, columns=['image_id', 'question_id', 'answer'])
        question_id_question_train_df = pd.DataFrame(
            data=questions_train_ls,
            columns=['image_id', 'question_id', 'question'])
        question_id_question_val_df = pd.DataFrame(
            data=questions_val_ls,
            columns=['image_id', 'question_id', 'question'])

        question_id_answer_train_df['answer'] = question_id_answer_train_df[
            'answer'].apply(self.encode_answer)
        question_id_answer_val_df['answer'] = question_id_answer_val_df[
            'answer'].apply(self.encode_answer)

        question_id_question_train_df[
            'question'] = question_id_question_train_df['question'].apply(
                self.encode_question)
        question_id_question_val_df['question'] = question_id_question_val_df[
            'question'].apply(self.encode_question)

        self.train_data = pd.merge(question_id_question_train_df,
                                   question_id_answer_train_df,
                                   on=['image_id',
                                       'question_id']).drop(['question_id'],
                                                            axis=1)
        self.val_data = pd.merge(question_id_question_val_df,
                                 question_id_answer_val_df,
                                 on=['image_id',
                                     'question_id']).drop(['question_id'],
                                                          axis=1)
        self.train_sample_size = len(question_id_answer_train_df)
        self.val_sample_size = len(question_id_answer_val_df)
        print('train_sample_size:%d\n val_sample_size:%d ' %
              (self.train_sample_size, self.val_sample_size))
        self.data_cleaning('train')
        self.data_cleaning('val')
        print('removed data for which no picture could be found')
#!/usr/bin/python3
# coding: utf-8
# https://github.com/EliasCai/sentiment/blob/master/sentiment_words.py#L78
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import hashing_trick
##################################################################
## 1. text_to_word_sequence, one_hot, hashing_trick
texts = ['some thing to eat', 'some thing to drink']
print(text_to_word_sequence(texts[0]))  # ['some', 'thing', 'to', 'eat']; a simple whitespace split
print(one_hot(texts[0], 10))  # [5, 7, 5, 7]; (10 means the resulting indices stay below 10)
print(one_hot(texts[1], 10))  # [5, 7, 5, 5]; hash is used internally, so for a fixed (text, n) each str always gets the same value
# This is a wrapper to the `hashing_trick` function using `hash` as the hashing function, unicity of word to index mapping non-guaranteed.
##################################################################
## 2. Tokenizer: indices are assigned by frequency rank (here this matches the order of appearance)
# keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n', lower=True, split=" ", char_level=False)
# Tokenizer is a class for vectorizing texts, or turning a text into a sequence (a list of word indices in the dictionary, counted from 1).
# num_words: None or int, the maximum number of words to handle. If set to an int, the tokenizer keeps only the num_words most frequent words in the dataset
# char_level: if True, every character is treated as a token
texts = ['some thing to eat', 'some thing to drink']
tmp_tokenizer = Tokenizer(num_words=None)  # num_words: None or int, maximum number of words to handle; words beyond this rank are dropped
tmp_tokenizer.fit_on_texts(texts)
# tmp_tokenizer.fit_on_texts(texts[0]); tmp_tokenizer.fit_on_texts(texts[1])  # do not do this: a single string is counted character by character
# attributes
print(tmp_tokenizer.word_counts)  # OrderedDict([('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)]); how often each word appeared during fitting
print(tmp_tokenizer.word_docs)  # {'thing': 2, 'eat': 1, 'to': 2, 'some': 2, 'drink': 1}; number of documents/texts each word appeared in
print(tmp_tokenizer.word_index)  # {'some': 1, 'thing': 2, 'to': 3, 'eat': 4, 'drink': 5}; rank / index of each word
print(len(tmp_tokenizer.word_index))  # 5; size of the vocabulary
print(tmp_tokenizer.index_docs)  # {2: 2, 4: 1, 3: 2, 1: 2, 5: 1}; word_index and word_docs combined (index -> document count)
print(tmp_tokenizer.document_count)  # 2; number of documents fitted on
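The notes above stop at the fitted attributes; the two calls below are added for illustration (not in the original notes) and show the usual next step of turning texts into index sequences or a bag-of-words matrix.

print(tmp_tokenizer.texts_to_sequences(texts))  # [[1, 2, 3, 4], [1, 2, 3, 5]]; words replaced by their word_index rank
print(tmp_tokenizer.texts_to_matrix(texts, mode='binary').shape)  # (2, 6); one row per text, one column per index (column 0 unused)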
Example 36
    file = open("txt_sentoken/pos/" + str(file_name), "r")
    docs_list.append(file.read())

for file_name in negative_files_names:
    file = open("txt_sentoken/neg/" + str(file_name), "r")
    docs_list.append(file.read())

labels_positive = [1] * len(positive_files_names)
labels_negative = [0] * len(negative_files_names)

labels = labels_positive + labels_negative
labels = np.array(labels)

docs_tokens = []
for doc in docs_list:
    docs_tokens.append(text_to_word_sequence(doc))

Article = collections.namedtuple('Article', 'words tags paragraph')

tuples_list = []
for i in range(len(docs_tokens)):
    tuples_list.append(
        Article(words=docs_tokens[i], tags=[str(i)], paragraph=docs_list[i]))

tuples_list = shuffle(tuples_list)

# model = Doc2Vec(size=50,
#                 alpha=0.025,
#                 min_alpha=0.00025,
#                 min_count=1,
#                 dm =1)
Example 37
def test_text_to_word_sequence_unicode_multichar_split():
    text = u'ali!stopveli?stopkırkstopdokuzstopelli'
    assert text_to_word_sequence(text, split='stop') == [u'ali', u'veli', u'kırk', u'dokuz', u'elli']
batchSize = 128  # how many samples to feed neural network
GRU_UNITS = 256  # Number of nodes in GRU Layer
numClasses = 2  #{Positive,Negative}
iterations = 100000  # How many iterations to train
nodes_on_FC = 64  # Number of nodes on FC layer
epsilon = 1e-4  # For batch normalization
val_loop_iter = 50  # in how many iters we record

#Reading csv's
train = pd.read_csv('dataset/train_amazon.csv')
test = pd.read_csv('dataset/test_amazon.csv')

#Removing punctuation
#Converting to lowercase and cleaning punctuation
train['text'] = train['text'].apply(
    lambda x: ' '.join(text_to_word_sequence(x)))
test['text'] = test['text'].apply(lambda x: ' '.join(text_to_word_sequence(x)))


def remove_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x


#Removing Numbers
train['text'] = train['text'].apply(lambda x: remove_numbers(x))
test['text'] = test['text'].apply(lambda x: remove_numbers(x))
Example 39
def preprocess_text(text):
    word_sequence = text_to_word_sequence(text)
    indices_sequence = [[word_index[word] if word in word_index else 0 for word in word_sequence]]
    x = tokenizer.sequences_to_matrix(indices_sequence, mode='binary')
    return x
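A hypothetical call (the names come from the snippet above and are assumed to exist already: a tokenizer fitted with num_words set, and word_index = tokenizer.word_index):

x = preprocess_text("this movie was great")
print(x.shape)   # (1, num_words): a single binary bag-of-words row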
    w2i, i2w = generate_w2i_i2w_dict(data_file)
    with open(data_file, 'r') as fr:
        for line in fr:
            query1, query2, label = line.split('\t')[:3]
            query = (sentence2sequence(query1, w2i), sentence2sequence(query2, w2i))
            X.append(query)
            y.append(int(label))
    X = np.array(X)
    y = np.array(y)
    return X, y

if __name__ == '__main__':
    # data_file = '../Files/yahoo.data.dat'
    data_file = '../Files/yahooAnswer.txt'
    sentences_file = '../Files/sentences.dat'
    words_file = '../Files/words.dat'

    # output = pd.read_csv('C:\Users\AC\PycharmProjects\NLPSimilarity\Files\yahoo.data.dat', names=['Query1', 'Query2', 'label', 'ID'], sep='\t')
    # print type(output)
    # print output
    # output = output.as_matrix()
    # print type(output)
    # print output

    words = generate_words_set(data_file)
    print 'vocabulary contains %d words' % len(words)     # 61681
    sentences = generate_sentences_set(data_file)
    print 'the longest sentence contains %d words' % max([len(text_to_word_sequence(sentence)) for sentence in sentences])        # 788
    X, y = get_data(data_file)
    print type(X), len(X), X.ndim, X[0], X[0][0], X[0][1]  #    <type 'numpy.ndarray'> 20563 2 [[5859, 5489, 502, 5805, 3429, 9129] [5580, 5923, 10200, 3429, 9132]] [5859, 5489, 502, 5805, 3429, 9129] [5580, 5923, 10200, 3429, 9132]
    print type(y), len(y), y.ndim, y[0]     #   <type 'numpy.ndarray'> 20563 1
Example 41
def test_text_to_word_sequence_multichar_split():
    text = 'hello!stop?world!'
    assert text_to_word_sequence(text, split='stop') == ['hello', 'world']
Example 42
total_texts.extend(word_not_in_texts)

fileRaw = open('./data/Trump_texts.txt', 'r')
raw_strings = list()
for line in fileRaw:
    raw_strings.append(text_cleaner(line))

tokenizer = Tokenizer()
tokenizer.fit_on_texts(total_texts)

word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}

xtr = list()
for q in X_train:
    xtr.append(list(word2id[w] for w in text_to_word_sequence(q)))

xts = list()
for q in X_test:
    xts.append(list(word2id[w] for w in text_to_word_sequence(q)))

predict = list()
# for q in raw_strings:
#     l = list()
#     for w in text_to_word_sequence(q):
#         if w in word2id.keys():
#             l.append(word2id[w])
#         else:
#             l.append(word2id[word_not_in_texts[0]])
#     predict.append(l)
predict = list.copy(xtr)
    
SEQ_LEN_TR = len(max(train_df['question_text'], key=len).split())
SEQ_LEN_TS = len(max(test_df['question_text'], key=len).split())
SEQ_LEN = max(SEQ_LEN_TR,SEQ_LEN_TS)
print("SEQ_LEN:",SEQ_LEN)
assert SEQ_LEN == 45 

##
train_cat_list, train_text_list, train_questions = [], [], [] 
test_text_list, test_questions = [], []

for i in range(len(train_df)):
    quest = train_df.loc[i,'question_text']
    train_questions.append(quest)
    train_cat_list.append(train_df.loc[i,'target'])
    train_text_list.append(text_to_word_sequence(process_text(quest),lower=LOWER_CASE))

for i in range(len(test_df)):
    quest = test_df.loc[i,'question_text']
    test_questions.append(quest)
    test_text_list.append(text_to_word_sequence(process_text(quest),lower=LOWER_CASE))
    
assert len(train_cat_list) == len(train_text_list)
assert len(train_cat_list) == len(train_questions)
assert len(test_questions) == len(test_text_list)

print(">> train_size:",len(train_cat_list))
print(">> train sample:",train_cat_list[44] , train_text_list[44], train_questions[44])
print(">> test_size:",len(test_questions))
print(">> test sample:", test_text_list[44] , test_questions[44])
Example 44
    def train_on_texts(self, texts, context_labels=None,
                       batch_size=128,
                       num_epochs=50,
                       verbose=1,
                       new_model=False,
                       gen_epochs=1,
                       train_size=1.0,
                       max_gen_length=300,
                       validation=True,
                       dropout=0.0,
                       via_new_model=False,
                       **kwargs):

        if new_model and not via_new_model:
            self.train_new_model(texts,
                                 context_labels=context_labels,
                                 num_epochs=num_epochs,
                                 gen_epochs=gen_epochs,
                                 batch_size=batch_size,
                                 dropout=dropout,
                                 validation=validation,
                                 **kwargs)
            return

        if context_labels:
            context_labels = LabelBinarizer().fit_transform(context_labels)

        if 'prop_keep' in kwargs:
            train_size = kwargs['prop_keep']

        if self.config['word_level']:
            texts = [text_to_word_sequence(text, filters='') for text in texts]

        # calculate all combinations of text indices + token indices
        indices_list = [np.meshgrid(np.array(i), np.arange(
            len(text) + 1)) for i, text in enumerate(texts)]
        indices_list = np.block(indices_list)

        # If a single text, there will be 2 extra indices, so remove them
        # Also remove first sequences which use padding
        if self.config['single_text']:
            indices_list = indices_list[self.config['max_length']:-2, :]

        indices_mask = np.random.rand(indices_list.shape[0]) < train_size

        gen_val = None
        val_steps = None
        if train_size < 1.0 and validation:
            indices_list_val = indices_list[~indices_mask, :]
            gen_val = generate_sequences_from_texts(
                texts, indices_list_val, self, context_labels, batch_size)
            val_steps = max(
                int(np.floor(indices_list_val.shape[0] / batch_size)), 1)

        indices_list = indices_list[indices_mask, :]

        num_tokens = indices_list.shape[0]
        assert num_tokens >= batch_size, "Fewer tokens than batch_size."

        level = 'word' if self.config['word_level'] else 'character'
        print("Training on {:,} {} sequences.".format(num_tokens, level))

        steps_per_epoch = max(int(np.floor(num_tokens / batch_size)), 1)

        gen = generate_sequences_from_texts(
            texts, indices_list, self, context_labels, batch_size)

        base_lr = 4e-3

        # scheduler function must be defined inline.
        def lr_linear_decay(epoch):
            return (base_lr * (1 - (epoch / num_epochs)))

        if context_labels is not None:
            if new_model:
                weights_path = None
            else:
                weights_path = "{}_weights.hdf5".format(self.config['name'])
                self.save(weights_path)

            self.model = textgenrnn_model(self.num_classes,
                                          dropout=dropout,
                                          cfg=self.config,
                                          context_size=context_labels.shape[1],
                                          weights_path=weights_path)

        self.model.fit_generator(gen, steps_per_epoch=steps_per_epoch,
                                 epochs=num_epochs,
                                 callbacks=[
                                     LearningRateScheduler(
                                         lr_linear_decay),
                                     generate_after_epoch(
                                         self, gen_epochs,
                                         max_gen_length),
                                     save_model_weights(
                                         self.config['name'])],
                                 verbose=verbose,
                                 max_queue_size=2,
                                 validation_data=gen_val,
                                 validation_steps=val_steps
                                 )

        # Keep the text-only version of the model if using context labels
        if context_labels is not None:
            self.model = Model(inputs=self.model.input[0],
                               outputs=self.model.output[1])
Example 45
df = pd.read_csv("test_test.csv")
df.columns = ['seq_label', 'sequence']
print(df['sequence'])
Ori = df['sequence'].tolist()
Orilabel = df["seq_label"].tolist()
from textwrap import wrap

# cut to kmers
kmer_size = 1
df['sequence'] = df.apply(lambda x: wrap(x['sequence'], kmer_size), axis=1)
df['sequence'] = [','.join(map(str, l)) for l in df['sequence']]
max_length = df.sequence.map(lambda x: len(x)).max()
max_length = max_length / kmer_size
df['sequence'] = df.apply(
    lambda x: text_to_word_sequence(x['sequence'], split=','), axis=1)
df['sequence'] = df['sequence'].astype(str)
vocab_max = 4**kmer_size
print(vocab_max)
# integer encode the document
df['sequence'] = df.apply(lambda x: one_hot(x['sequence'], vocab_max), axis=1)
print(df['sequence'])

from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
dataset = df.values
Y = dataset[:, 0]
encoder_label = LabelEncoder()
encoder_label.fit(Y)
encoded_Y = encoder_label.transform(Y)
dummy_y = np_utils.to_categorical(encoded_Y)
Example 46
def test_text_to_word_sequence():
    text = 'hello! ? world!'
    assert text_to_word_sequence(text) == ['hello', 'world']
Example 47
def main():
    input_filepath = './instagram.csv'
    output_filepath = './instagram.pickle'

    embeddings_index = {}
    with open('glove.twitter.27B.50d.txt', 'r', encoding='utf-8') as fin:
        for line in fin:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('total {0} word vectors'.format(len(embeddings_index)))

    df = pd.read_csv(input_filepath, encoding="ISO-8859-1", index_col=False)
    # fill nan cell
    df.fillna('', inplace=True)
    # shuffle the dataframe
    df = shuffle(df)
    print('data shape:', df.shape)

    labels = []
    comments_all = []  # all the comments
    social_content_all = []  # number of likes/shares/followed_by/follows for all the posts
    time_sequence_all = []  # time sequences for all the posts

    for _, session in df.iterrows():
        # number of likes/shares/followed_by/follows
        social_content = [int(re.findall(r'\d+', session['likes'])[0]), session['shared media'], session['followed_by'], session['follows']]
        social_content_all.append(social_content)

        label = session['question2']
        if label.startswith('n'):
            label = 0
        else:
            label = 1
        labels.append(label)

        post_time = ' '.join(session['cptn_time'].split()[-2:])  # datetime when the message is posted, %Y-%m-%d %H:%M:%S
        # handle corrupted time format -- some dates are missing the 2 at the head, length should be 19
        if len(post_time) == 18:
            post_time = '2' + post_time
        last_post_time = datetime.datetime.strptime(post_time, DTFormat)

        comments = []
        time_sequence = [0]  # time when the owner posts the picture

        for comment_idx in range(1, MAX_SENTS + 1):
            comment = session['clmn{0}'.format(comment_idx)]
            # skip empty cells (the CSV marks them with the literal string 'empety')
            if comment.strip() and 'empety' not in comment:
                comment = comment.strip()
                identifier = '(created_at:'
                ts_start_idx = comment.find(identifier)
                if ts_start_idx != -1:
                    # comment timestamp
                    len_comment = len(comment)
                    ts = comment[ts_start_idx + len(identifier): len_comment - 1]
                    ts = datetime.datetime.strptime(ts, DTFormat)
                    time_lag = ts - last_post_time
                    # total_seconds() also covers gaps longer than a day
                    # (timedelta.seconds alone wraps around at 24 h)
                    time_sequence.append(int(time_lag.total_seconds()))
                    last_post_time = ts

                    # comment text
                    comment = comment[: ts_start_idx]
                    comment = clean_str(comment)
                    comments.append(comment)

        comments_all.append(comments)
        time_sequence_all.append(time_sequence)

    pad_time_sequence_all = np.zeros((len(time_sequence_all), MAX_SENTS))
    for ts_idx, time_sequence in enumerate(time_sequence_all):
        # a sequence can hold MAX_SENTS + 1 entries (post time plus one per comment), so truncate
        time_sequence = time_sequence[:MAX_SENTS]
        pad_time_sequence_all[ts_idx][0: len(time_sequence)] = time_sequence
    # uniq_time_sequence_size = len(np.unique(pad_time_sequence_all))

    social_content_all = np.array(social_content_all)
    # uniq_social_content_size = len(np.unique(social_content_all))

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    words_all = [' '.join(comments) for comments in comments_all]
    tokenizer.fit_on_texts(words_all)
    word_index = tokenizer.word_index

    text_tensor = np.zeros((len(comments_all), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    print('total {0:,} unique tokens'.format(len(word_index)))

    for session_idx, comments in enumerate(comments_all):
        for comment_idx, comment in enumerate(comments):
            if comment_idx < MAX_SENTS:
                word_idx = 0
                for _, word in enumerate(text_to_word_sequence(comment)):
                    if word_idx < MAX_SENT_LENGTH and word_index[word] < MAX_NB_WORDS:
                        text_tensor[session_idx, comment_idx, word_idx] = word_index[word]
                        word_idx += 1

    # Hierarchical Attention Network for text and other info
    pad_time_sequence_all = np.delete(pad_time_sequence_all, range(MAX_SENTS, pad_time_sequence_all.shape[1]), axis=1)
    pad_time_sequence_all = preprocessing.StandardScaler().fit_transform(pad_time_sequence_all)
    social_content_all = preprocessing.StandardScaler().fit_transform(social_content_all)
    print('text_tensor shape:', text_tensor.shape)
    print('pad_time_sequence_all shape:', pad_time_sequence_all.shape)
    print('social_content_all shape:', social_content_all.shape)

    han_data = np.dstack((text_tensor, pad_time_sequence_all))
    print('Hierarchical Attention Network data shape (text + time):', han_data.shape)

    # random init; rows for words missing from the GloVe index keep their random values
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, idx in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

    store_data = {'embedding_matrix': embedding_matrix,
                  'data': han_data,
                  'labels': labels,
                  'postInfo': social_content_all,
                  'timeInfo': pad_time_sequence_all,
                  'word_index': word_index,
                  'df': df}

    pickle.dump(store_data, open(output_filepath, 'wb'))
    print('successfully write to output file {0}'.format(output_filepath))
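The pickled embedding_matrix is typically consumed as the frozen weights of an Embedding layer when the Hierarchical Attention Network is built. A minimal sketch of that consumer side, assuming only the pickle written by main() above:

import pickle
from keras.layers import Embedding
from keras.initializers import Constant

# load the artefacts written by main() above
store_data = pickle.load(open('./instagram.pickle', 'rb'))
embedding_matrix = store_data['embedding_matrix']

# GloVe-initialised, frozen embedding layer; sizes come from the matrix itself
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            embeddings_initializer=Constant(embedding_matrix),
                            trainable=False)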
Example no. 48
0
def test_text_to_word_sequence_unicode():
    text = u'ali! veli? kırk dokuz elli'
    assert text_to_word_sequence(text) == [u'ali', u'veli', u'kırk', u'dokuz', u'elli']
Example no. 49
0
    def tokenizer(self, train_all_doc, dev_all_doc, test_all_doc, maxsentence,
                  mylen):

        length_up_num = 0
        max_token = 0
        allfile = [train_all_doc, dev_all_doc, test_all_doc]

        wordvec = []
        all_document = []
        for file in allfile:
            documents = []
            for doc in file:
                temp = []
                word = []
                for sentence in doc:
                    token = text_to_word_sequence(
                        sentence,
                        filters='!"#$%&()*+,-.:;=?@[\]^`{|}/~',
                        lower=True,
                        split=" ")
                    max_token = max(max_token, len(token))
                    if len(token) > mylen:
                        length_up_num += 1
                    temp.append(token)
                    # re-join the cleaned tokens into a plain string
                    word.append(" ".join(token))
                wordvec.append(word)
                documents.append(temp)
            all_document.append(documents)

        # flatten each document into a single list of words for the tokenizer
        all_raw = []
        for file in all_document:
            for inst in file:
                instance = [word for sentence in inst for word in sentence]
                all_raw.append(instance)

        all_dict = list(all_raw)

        print('max token is ', max_token)
        print('up than ', mylen, ' num is', length_up_num)
        tokenizer = Tokenizer(filters='!"#$%&()*+,-.:;=/?@[\]^`{|}~\'>',
                              lower=True,
                              split=" ")
        tokenizer.fit_on_texts(all_dict)
        vocab_size = len(tokenizer.word_index) + 1
        print(tokenizer.word_docs)
        print('vocab', vocab_size)

        # an all-zero sentence used to pad short documents up to maxsentence
        pad_sentence = np.zeros(mylen, dtype='int32')

        # index, pad and reshape each split (train / dev / test) in the same way
        padded_splits = []
        for documents in all_document:
            split_x = []
            for doc in documents:
                doc_index = tokenizer.texts_to_sequences(doc)
                doc_index = list(sequence.pad_sequences(doc_index,
                                                        mylen,
                                                        padding='post',
                                                        truncating='post'))
                for _ in range(len(doc_index), maxsentence):
                    doc_index.append(pad_sentence)
                split_x.append(doc_index)
            padded_splits.append(
                np.reshape(split_x, (len(split_x), maxsentence, mylen)))
        train_x, dev_x, test_x = padded_splits

        return train_x, dev_x, test_x, tokenizer, wordvec
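The per-sentence padding above relies on pad_sequences with padding='post' and truncating='post': short sentences are zero-padded at the end and long ones are cut at the end. A small illustration of that behaviour (not from the original code):

import numpy as np
from keras.preprocessing import sequence

padded = sequence.pad_sequences([[1, 2, 3], [4]], maxlen=2,
                                padding='post', truncating='post')
assert np.array_equal(padded, np.array([[1, 2], [4, 0]]))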
Example no. 50
0
    # (inside a loop over data_train; the loop header and the HTML parse that produced `text` are not shown in this snippet)
    text = clean_str(text.get_text().encode('ascii', 'ignore'))
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)

    labels.append(data_train.sentiment[idx])

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)  # Keras 1 argument name; renamed to num_words in Keras 2
tokenizer.fit_on_texts(texts)

data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j < MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
                if k < MAX_SENT_LENGTH and tokenizer.word_index[word] < MAX_NB_WORDS:
                    data[i, j, k] = tokenizer.word_index[word]
                    k = k + 1

word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
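The shuffled indices are usually applied to data and labels and then used for a train/validation split; a hedged sketch of that next step (VALIDATION_SPLIT is an assumption, the constant is not shown in this snippet):

VALIDATION_SPLIT = 0.2  # illustrative value, not taken from the original snippet

data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]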
Example no. 51
0
def convert_text_to_index_array(text):
    # assumes every token already exists in `dictionary`; unseen words raise a KeyError
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]