def generate_batch(self):
    """Build a batch of raw texts and multi-hot label vectors from self.ids[start:end]."""
    start = self.start
    end = self.end
    self.texts_train = np.array([])
    self.labels_train = np.array([])
    data_split = self.ids[start:end]
    for i in range(len(data_split)):
        index = int(data_split[i])
        text = self.texts[index]
        labels = self.labels[index][0]
        split_labels = labels.split(" ")
        labels_temp = np.zeros(config.label_size)
        # The first token of the label string is skipped; only tokens 1..n are labels.
        for j in range(1, len(split_labels)):
            try:
                label_index = utils.find_label_index(split_labels[j])
                labels_temp[label_index] = 1.0
            except ValueError:
                print("Label not found: ", split_labels[j])
        # np.append flattens its inputs, so labels_train grows as a 1-D array
        # and must be reshaped to (batch, config.label_size) by the caller.
        self.labels_train = np.append(self.labels_train, labels_temp)
        self.texts_train = np.append(self.texts_train, text)

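# Hedged usage sketch (not part of the original file; `loader` and the batch
# size of 32 are hypothetical). Because np.append flattens its inputs, the
# labels must be reshaped back to (batch, label_size) before training:
#
#   loader.start, loader.end = 0, 32
#   loader.generate_batch()
#   y = loader.labels_train.reshape(-1, config.label_size)  # (32, label_size)
#   x = loader.texts_train                                  # (32,) array of raw texts
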
def generate_embedding(self):
    """Build a batch of texts and multi-hot label vectors from "<doc id> <index>" id pairs."""
    start = self.start
    end = self.end
    self.texts_train = []
    self.labels_train = []
    data_split = self.ids[start:end]
    for i in range(len(data_split)):
        # Each id entry is a string of the form "<document id> <row index>".
        ids_index = data_split[i][0].split(" ")
        doc_id = int(ids_index[0])  # parsed for parity with the other generators; unused here
        index = int(ids_index[1])
        labels = self.labels[index][0]
        split_labels = labels.split(" ")
        labels_temp = np.zeros(config.label_size)
        for j in range(1, len(split_labels)):
            try:
                label_index = utils.find_label_index(split_labels[j])
                labels_temp[label_index] = 1.0
            except ValueError:
                print("Label not found: ", split_labels[j])
        self.labels_train.append(labels_temp)
        self.texts_train.append(self.texts[index])

def generate_batch_hot(self):
    """Build a batch of one-hot character matrices by re-reading the raw RCV1 XML files."""
    start = self.start
    end = self.end
    self.texts_train = []
    self.labels_train = []
    data_split = self.ids[start:end]
    for i in range(len(data_split)):
        ids_index = data_split[i][0].split(" ")
        doc_id = int(ids_index[0])
        index = int(ids_index[1])
        labels = self.labels[index][0]
        split_labels = labels.split(" ")
        labels_temp = np.zeros(config.label_size)
        for j in range(1, len(split_labels)):
            try:
                label_index = utils.find_label_index(split_labels[j])
                labels_temp[label_index] = 1.0
            except ValueError:
                print("Label not found: ", split_labels[j])
        self.labels_train.append(labels_temp)
        text_name = str(doc_id) + "newsML.xml"
        #reuters = et.parse("data/rcv1-2/train-text/" + text_name, et.XMLParser(encoding='ISO-8859-1')).getroot()
        reuters = et.parse("data/rcv1-2/test-text0/" + text_name,
                           et.XMLParser(encoding='ISO-8859-1')).getroot()
        # Concatenate the title and every body paragraph, stripping spaces and
        # tabs so the encoder sees one continuous character stream.
        temp_text = ""
        for text in reuters.findall("title"):
            temp_text = temp_text + text.text.replace(" ", "")
        for text in reuters.findall("text"):
            for p in text.findall("p"):
                temp_text = temp_text + p.text.replace(" ", "").replace("\t", "")
        matrix = utils.one_hot_encoder(temp_text)
        self.texts_train.append(matrix)

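# `utils.one_hot_encoder` is not defined in this file. Below is a minimal
# sketch of a character-level encoder consistent with how it is called above
# (it receives a whitespace-stripped string and returns a matrix); the
# alphabet, name, and max_len default are assumptions, not taken from the project.
SKETCH_ALPHABET = "abcdefghijklmnopqrstuvwxyz0123456789"
SKETCH_CHAR_INDEX = {c: i for i, c in enumerate(SKETCH_ALPHABET)}

def sketch_one_hot_encoder(text, max_len=1014):
    """Encode `text` as a (max_len, alphabet_size) one-hot float matrix."""
    matrix = np.zeros((max_len, len(SKETCH_ALPHABET)))
    for pos, char in enumerate(text.lower()[:max_len]):
        idx = SKETCH_CHAR_INDEX.get(char)
        if idx is not None:  # characters outside the alphabet stay all-zero
            matrix[pos, idx] = 1.0
    return matrix
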
def generate_batch_stemm_test(self):
    """Build a test batch from pre-tokenized (stemmed) text files on disk."""
    start = self.start_test
    end = self.end_test
    self.texts_train = np.array([])
    self.labels_train = np.array([])
    data_split = self.ids[start:end]
    for i in range(len(data_split)):
        ids_index = data_split[i][0].split(" ")
        doc_id = int(ids_index[0])
        index = int(ids_index[1])
        labels = self.labels[index][0]
        split_labels = labels.split(" ")
        labels_temp = np.zeros(config.label_size)
        for j in range(1, len(split_labels)):
            try:
                label_index = utils.find_label_index(split_labels[j])
                labels_temp[label_index] = 1.0
            except ValueError:
                print("Label not found: ", split_labels[j])
        self.labels_train = np.append(self.labels_train, labels_temp)
        text_name = str(doc_id) + "token.txt"
        with open("data/rcv1-2/test-tokens0-0/" + text_name, 'r') as f:
            temp_text = f.read()
        self.texts_train = np.append(self.texts_train, temp_text)

def read_data(self, name, type=1):
    """Parse one Reuters-21578 file and append every labeled document to the training arrays."""
    # `type` selects the data split: 1=train, 2=test, 3=first3.
    ruta = ""
    if type == 1:
        ruta = self.path_data + "train/" + name
    elif type == 2:
        ruta = self.path_data + "test/" + name
    elif type == 3:
        ruta = self.path_data + "first3/" + name
    reuters = et.parse(ruta, et.XMLParser(encoding='ISO-8859-1')).getroot()
    # All SGML tags that can carry category labels in Reuters-21578.
    label_tags = ("TOPICS", "PLACES", "PEOPLE", "ORGS", "EXCHANGES")
    for text in reuters.findall("TEXT"):
        body = utils.extract_body(text)
        if body is None or body == "":
            continue
        labels_temp = np.zeros(config.label_size)
        all_labels = 0
        for tag in label_tags:
            for a_topic in reuters.findall(tag):
                for a_d in a_topic.findall("D"):
                    try:
                        label_index = utils.find_label_index(a_d.text)
                        labels_temp[label_index] = 1.0
                        self.label_examples[label_index] += 1
                        all_labels += 1
                    except ValueError:
                        pass  # label not in the tracked label set; skip it
        # Keep the document only if at least one known label was found.
        if all_labels != 0:
            self.labels_train = np.append(self.labels_train, labels_temp)
            self.texts_train = np.append(self.texts_train,
                                         utils.stop_characters(body.text))

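# `utils.find_label_index` is used throughout but not shown here. Its assumed
# contract, inferred from the call sites: map a label string to a column of
# the multi-hot vector and raise ValueError for unknown labels (list.index
# gives exactly that behavior). A minimal sketch; the label list below is
# hypothetical, while the real one has config.label_size entries.
SKETCH_LABELS = ["earn", "acq", "money-fx"]  # hypothetical subset

def sketch_find_label_index(label):
    """Return the position of `label`; list.index raises ValueError if absent."""
    return SKETCH_LABELS.index(label)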