import random
from collections import Counter

import numpy as np
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.similarities import WmdSimilarity

import aidrtokenize


def convert_to_vector(inst_list, tokenizer, MAX_SEQUENCE_LENGTH, delim):
    """Prepare the data."""
    data = []
    lab = []
    for inst in inst_list:
        txt = aidrtokenize.tokenize(inst.text)
        text = " ".join(txt)
        if len(txt) < 1:
            print("TEXT SIZE:" + str(txt))
            continue
        data.append(text)
        lab.append(inst.label)

    # One-hot encode the labels.
    le = preprocessing.LabelEncoder()
    yL = le.fit_transform(lab)
    labels = list(le.classes_)
    label = yL.tolist()
    yC = len(set(label))
    yR = len(label)
    y = np.zeros((yR, yC))
    y[np.arange(yR), yL] = 1
    y = np.array(y, dtype=np.int32)

    # Vectorize the text samples into a 2D integer tensor.
    sequences = tokenizer.texts_to_sequences(data)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Shape of data tensor:', data.shape)
    return data, y, le, labels

def read_train_data(dataFile, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH, delim):
    """Prepare the training data."""
    data = []
    labels = []
    with open(dataFile, 'rb') as f:
        next(f)  # skip the header line
        for line in f:
            line = line.decode(encoding='utf-8', errors='strict')
            line = line.strip()
            if line == "":
                continue
            row = line.split(delim)
            txt = row[3].strip().lower()
            txt = aidrtokenize.tokenize(txt)
            label = row[6]
            if len(txt) < 1:
                print(txt)
                continue
            data.append(txt)
            labels.append(label)

    # Shuffle texts and labels with the same permutation.
    data_shuf = []
    lab_shuf = []
    index_shuf = list(range(len(data)))
    random.shuffle(index_shuf)
    for i in index_shuf:
        data_shuf.append(data[i])
        lab_shuf.append(labels[i])

    # One-hot encode the labels.
    le = preprocessing.LabelEncoder()
    yL = le.fit_transform(lab_shuf)
    labels = list(le.classes_)
    label = yL.tolist()
    yC = len(set(label))
    yR = len(label)
    y = np.zeros((yR, yC))
    y[np.arange(yR), yL] = 1
    y = np.array(y, dtype=np.int32)

    # Finally, vectorize the text samples into a 2D integer tensor.
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token="OOV_TOK")
    tokenizer.fit_on_texts(data_shuf)
    sequences = tokenizer.texts_to_sequences(data_shuf)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Shape of data tensor:', data.shape)
    return data, y, le, labels, word_index, tokenizer

def read_dev_data(dataFile, tokenizer, MAX_SEQUENCE_LENGTH, delim, train_le):
    """Prepare the dev/test data using the tokenizer and label encoder fit on training data."""
    id_list = []
    data = []
    labels = []
    with open(dataFile, 'rb') as f:
        next(f)  # skip the header line
        for line in f:
            line = line.decode(encoding='utf-8', errors='strict')
            line = line.strip()
            if line == "":
                continue
            row = line.split(delim)
            t_id = row[2].strip().lower()
            txt = row[3].strip().lower()
            txt = aidrtokenize.tokenize(txt)
            label = row[6]
            if len(txt) < 1:
                print(txt)
                continue
            data.append(txt)
            labels.append(label)
            id_list.append(t_id)
    print(len(data))

    # Keep the input order so id_list stays aligned with the returned tensors,
    # and reuse the label encoder fit on the training data.
    le = train_le
    yL = le.transform(labels)
    labels = list(le.classes_)
    yC = len(le.classes_)  # use the training label space, not only the classes seen here
    yR = len(yL)
    y = np.zeros((yR, yC))
    y[np.arange(yR), yL] = 1
    y = np.array(y, dtype=np.int32)

    sequences = tokenizer.texts_to_sequences(data)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Shape of data tensor:', data.shape)
    return data, y, le, labels, word_index, id_list

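# Example usage (a minimal sketch, not part of the original pipeline): the dev
# reader must receive the tokenizer and label encoder returned by the training
# reader so both splits share the same vocabulary and label indices. The file
# names, delimiter, and sequence length below are illustrative assumptions.
def _example_text_pipeline():
    train_x, train_y, train_le, train_classes, word_index, tokenizer = read_train_data(
        "data/task_train.tsv", MAX_NB_WORDS=20000, MAX_SEQUENCE_LENGTH=25, delim="\t")
    dev_x, dev_y, _, _, _, dev_ids = read_dev_data(
        "data/task_dev.tsv", tokenizer, 25, "\t", train_le)
    return (train_x, train_y), (dev_x, dev_y, dev_ids)
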
def graph_dist(tweetlist, model, outFile):
    """Write a pairwise similarity matrix (1 - Word Mover's Distance) to outFile."""
    of = open(outFile, "w")
    model.init_sims(replace=True)  # Normalizes the vectors in the word2vec model.
    for index, tweetR in enumerate(tweetlist):
        rVec = aidrtokenize.tokenize(tweetR)
        rVec = [w for w in rVec if w not in stop_words]
        colVector = []
        for tweetC in tweetlist:
            cVec = aidrtokenize.tokenize(tweetC)
            cVec = [w for w in cVec if w not in stop_words]
            distance = model.wmdistance(rVec, cVec)
            colVector.append(distance)
        vector = str(index) + " "
        for val in colVector:
            vector = vector + str(1 - val) + " "
        of.write(vector + "\n")
    of.close()

def graph_sim(tweetlist, model, outFile):
    """Write an adjacency list connecting tweets whose WMD similarity is >= 0.3."""
    print("Number of tweets to generate the graph: " + str(len(tweetlist)))
    of = open(outFile, "w")
    model.init_sims(replace=True)  # Normalizes the vectors in the word2vec model.
    rowVector = []
    for tweetR in tweetlist:
        rVec = aidrtokenize.tokenize(tweetR)
        rVec = [w for w in rVec if w not in stop_words]
        rowVector.append(rVec)
    instance = WmdSimilarity(rowVector, model, num_best=None)
    print("Writing into the file....")
    for index, colVector in enumerate(instance):
        vector = ""
        for i, val in enumerate(colVector):
            if index != i and val >= 0.3:
                vector = vector + str(i) + " "
        of.write(str(index) + " " + vector.strip() + "\n")
    of.close()

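# Example usage (a minimal sketch): both graph helpers expect a gensim word2vec
# model that supports wmdistance / WmdSimilarity (gensim 3.x style API, matching
# the calls above). The embedding path, tweet list, and output file names below
# are illustrative assumptions.
def _example_similarity_graph():
    from gensim.models import KeyedVectors
    w2v = KeyedVectors.load_word2vec_format("embeddings/crisis_w2v.bin", binary=True)
    tweets = [
        "flood waters rising near the downtown bridge",
        "bridge closed due to rising flood waters",
        "volunteers needed at the evacuation shelter",
    ]
    graph_sim(tweets, w2v, "tweet_graph_edges.txt")
    graph_dist(tweets, w2v, "tweet_similarity_matrix.txt")
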
def text_proprecess(text):
    """Lowercase, strip emojis, and tokenize a piece of text."""
    txt = text.strip().lower()
    txt = give_emoji_free_text(txt)
    txt = aidrtokenize.tokenize(txt)
    return txt

def read_train_data_multimodal(data_file, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH, label_index, delim):
    """Prepare the multimodal (text + image) training data."""
    data = []
    image_list = []
    lab = []
    with open(data_file, 'rb') as f:
        next(f)  # skip the header line
        for line in f:
            line = line.decode(encoding='utf-8', errors='strict')
            line = line.strip()
            if line == "":
                continue
            row = line.split(delim)
            txt = row[3].strip()
            image_path = str(row[4].strip())
            label = str(row[int(label_index)])
            txt = aidrtokenize.tokenize(txt)
            text = " ".join(txt)
            if len(txt) < 1:
                print("TEXT SIZE:" + str(txt))
                continue
            data.append(text)
            lab.append(label)
            image_list.append(image_path)
    counts = Counter(lab)
    print(counts)
    print(len(data))

    # Shuffle texts, labels, and image paths with the same permutation.
    data_shuf = []
    lab_shuf = []
    image_list_shuf = []
    index_shuf = list(range(len(data)))
    random.shuffle(index_shuf)
    for i in index_shuf:
        data_shuf.append(data[i])
        lab_shuf.append(lab[i])
        image_list_shuf.append(image_list[i])

    # One-hot encode the labels.
    le = preprocessing.LabelEncoder()
    yL = le.fit_transform(lab_shuf)
    labels = list(le.classes_)
    print("training classes: " + " ".join(labels))
    label = yL.tolist()
    yC = len(set(label))
    yR = len(label)
    y = np.zeros((yR, yC))
    y[np.arange(yR), yL] = 1
    y = np.array(y, dtype=np.int32)

    # Finally, vectorize the text samples into a 2D integer tensor.
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token="OOV_TOK")
    tokenizer.fit_on_texts(data_shuf)
    sequences = tokenizer.texts_to_sequences(data_shuf)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    print('Shape of data tensor:', data.shape)
    return data, image_list_shuf, y, le, labels, word_index, tokenizer

def read_dev_data_multimodal(data_file, tokenizer, MAX_SEQUENCE_LENGTH, label_index, delim):
    """Prepare the multimodal (text + image) dev/test data."""
    ids = []
    data = []
    image_list = []
    lab = []
    with open(data_file, 'rb') as f:
        next(f)  # skip the header line
        for line in f:
            line = line.decode(encoding='utf-8', errors='strict')
            line = line.strip()
            if line == "":
                continue
            row = line.split(delim)
            image_id = row[2].strip()
            txt = row[3].strip()
            image_path = str(row[4].strip())
            label = str(row[int(label_index)])
            if len(txt) < 1:
                print("TEXT SIZE:" + txt)
                continue
            txt = aidrtokenize.tokenize(txt)
            text = " ".join(txt)
            if len(txt) < 1:
                print("TEXT SIZE:" + str(txt))
                continue
            data.append(text)
            lab.append(label)
            image_list.append(image_path)
            ids.append(image_id)
    counts = Counter(lab)
    print(counts)
    print(len(data))

    # Shuffle texts, labels, image paths, and ids with the same permutation.
    data_shuf = []
    lab_shuf = []
    image_list_shuf = []
    ids_shuf = []
    index_shuf = list(range(len(data)))
    random.shuffle(index_shuf)
    for i in index_shuf:
        data_shuf.append(data[i])
        lab_shuf.append(lab[i])
        image_list_shuf.append(image_list[i])
        ids_shuf.append(ids[i])

    # One-hot encode the labels.
    le = preprocessing.LabelEncoder()
    yL = le.fit_transform(lab_shuf)
    labels = list(le.classes_)
    print("dev classes: " + " ".join(labels))
    label = yL.tolist()
    yC = len(set(label))
    yR = len(label)
    y = np.zeros((yR, yC))
    y[np.arange(yR), yL] = 1
    y = np.array(y, dtype=np.int32)

    sequences = tokenizer.texts_to_sequences(data_shuf)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    print('Shape of data tensor:', data.shape)
    return data, image_list_shuf, y, le, labels, ids_shuf

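# Example usage (a minimal sketch): the multimodal readers return the image
# paths alongside the padded text tensors so a joint text+image model can be
# fed from the same row order. File names, the label column index, and the
# hyper-parameters below are illustrative assumptions.
def _example_multimodal_pipeline():
    train_x, train_images, train_y, le, classes, word_index, tokenizer = \
        read_train_data_multimodal("data/multimodal_train.tsv", 20000, 25, label_index=6, delim="\t")
    dev_x, dev_images, dev_y, _, _, dev_ids = \
        read_dev_data_multimodal("data/multimodal_dev.tsv", tokenizer, 25, label_index=6, delim="\t")
    return (train_x, train_images, train_y), (dev_x, dev_images, dev_y, dev_ids)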