Example #1
0
 def generate_batch(self):
     """Build a training batch from ``self.ids[self.start:self.end]``.

     Sets ``self.labels_train`` to a flat float array (one
     ``config.label_size`` one-hot block per example, concatenated) and
     ``self.texts_train`` to the matching texts — the same flat layout
     the repeated ``np.append`` calls used to produce.
     """
     start = self.start
     end = self.end
     data_split = self.ids[start:end]
     label_rows = []
     text_rows = []
     for raw_id in data_split:
         index = int(raw_id)
         text = self.texts[index]
         labels = self.labels[index][0]
         split_labels = labels.split(" ")
         labels_temp = np.zeros(config.label_size)
         # Token 0 of the label string is skipped — presumably an id,
         # not a label code; TODO confirm against the data format.
         for label_name in split_labels[1:]:
             try:
                 label_index = utils.find_label_index(label_name)
                 labels_temp[label_index] = 1.0
             except ValueError:
                 print("Not have label: ", label_name)
         label_rows.append(labels_temp)
         text_rows.append(text)
     # Append once at the end: np.append inside the loop copies the whole
     # array every iteration (quadratic). np.append ravels the accumulated
     # rows, reproducing the original flat result.
     self.labels_train = np.append(np.array([]), label_rows)
     self.texts_train = np.append(np.array([]), text_rows)
Example #2
0
    def generate_embedding(self):
        """Build a batch of (text, one-hot label vector) pairs as lists.

        Each entry of ``self.ids[self.start:self.end]`` holds a
        space-separated ``"id index"`` pair; only the index (field 1) is
        needed to look up the text and its labels.
        """
        start = self.start
        end = self.end
        self.texts_train = []
        self.labels_train = []
        data_split = self.ids[start:end]
        for row in data_split:
            ids_index = row[0].split(" ")
            # Field 0 (the document id) is unused here; the original parsed
            # it into a dead local that shadowed the builtin `id`.
            index = int(ids_index[1])
            labels = self.labels[index][0]
            split_labels = labels.split(" ")
            labels_temp = np.zeros(config.label_size)
            # Token 0 of the label string is skipped — presumably an id,
            # not a label code; TODO confirm against the data format.
            for label_name in split_labels[1:]:
                try:
                    label_index = utils.find_label_index(label_name)
                    labels_temp[label_index] = 1.0
                except ValueError:
                    print("Not have label: ", label_name)
            self.labels_train.append(labels_temp)
            self.texts_train.append(self.texts[index])
Example #3
0
 def generate_batch_hot(self):
     """Build a batch of one-hot encoded texts plus one-hot label vectors.

     For each ``"id index"`` pair in ``self.ids[self.start:self.end]``,
     labels come from ``self.labels[index]`` and the text is parsed out
     of the per-document XML file ``<id>newsML.xml`` (title + body
     paragraphs, whitespace stripped) before one-hot encoding.
     """
     start = self.start
     end = self.end
     self.texts_train = []
     self.labels_train = []
     data_split = self.ids[start:end]
     for row in data_split:
         ids_index = row[0].split(" ")
         doc_id = int(ids_index[0])
         index = int(ids_index[1])
         labels = self.labels[index][0]
         split_labels = labels.split(" ")
         labels_temp = np.zeros(config.label_size)
         # Token 0 of the label string is skipped — presumably an id,
         # not a label code; TODO confirm against the data format.
         for label_name in split_labels[1:]:
             try:
                 label_index = utils.find_label_index(label_name)
                 labels_temp[label_index] = 1.0
             except ValueError:
                 print("Not have label: ", label_name)
         self.labels_train.append(labels_temp)
         text_name = str(doc_id) + "newsML.xml"
         reuters = et.parse("data/rcv1-2/test-text0/" + text_name,
                            et.XMLParser(encoding='ISO-8859-1')).getroot()
         # Collect fragments and join once: `s = s + part` in a loop is
         # quadratic in the total text length.
         parts = []
         for title in reuters.findall("title"):
             parts.append(title.text.replace(" ", ""))
         for text in reuters.findall("text"):
             for p in text.findall("p"):
                 parts.append(p.text.replace(" ", "").replace("\t", ""))
         temp_text = "".join(parts)
         matrix = utils.one_hot_encoder(temp_text)
         self.texts_train.append(matrix)
 def generate_batch_stemm_test(self):
     """Build the *test* batch (``start_test``..``end_test``) from token files.

     Sets ``self.labels_train`` to a flat float array of one-hot blocks
     and ``self.texts_train`` to the raw contents of each per-document
     token file ``<id>token.txt``.
     """
     start = self.start_test
     end = self.end_test
     data_split = self.ids[start:end]
     label_rows = []
     text_rows = []
     for row in data_split:
         ids_index = row[0].split(" ")
         doc_id = int(ids_index[0])
         index = int(ids_index[1])
         labels = self.labels[index][0]
         split_labels = labels.split(" ")
         labels_temp = np.zeros(config.label_size)
         # Token 0 of the label string is skipped — presumably an id,
         # not a label code; TODO confirm against the data format.
         for label_name in split_labels[1:]:
             try:
                 label_index = utils.find_label_index(label_name)
                 labels_temp[label_index] = 1.0
             except ValueError:
                 print("Not have label: ", label_name)
         label_rows.append(labels_temp)
         text_name = str(doc_id) + "token.txt"
         with open("data/rcv1-2/test-tokens0-0/" + text_name, 'r') as f:
             text_rows.append(f.read())
     # Append once at the end: np.append inside the loop copies the whole
     # array every iteration (quadratic). The ravel matches the original
     # flat layout.
     self.labels_train = np.append(np.array([]), label_rows)
     self.texts_train = np.append(np.array([]), text_rows)
	def read_data(self, name, type = 1):
		"""Parse one Reuters SGML/XML file and append its labelled documents.

		For every TEXT element with a non-empty body, builds a one-hot
		label vector from the five category tags (TOPICS, PLACES, PEOPLE,
		ORGS, EXCHANGES) and, if at least one label was recognised,
		appends the vector to ``self.labels_train`` and the cleaned body
		text to ``self.texts_train``. Also increments
		``self.label_examples`` per recognised label.

		``type`` selects the sub-directory: 1 -> train/, 2 -> test/,
		3 -> first3/ (any other value leaves the path empty and the
		parse will fail, as in the original).
		"""
		ruta = ""
		if type == 1:
			ruta = self.path_data + "train/" + name
		elif type == 2:
			ruta = self.path_data + "test/" + name
		elif type == 3:
			ruta = self.path_data + "first3/" + name
		reuters = et.parse(ruta, et.XMLParser(encoding='ISO-8859-1')).getroot()
		for text in reuters.findall("TEXT"):
			body = utils.extract_body(text)
			# Explicit comparisons on purpose: `body` looks like an XML
			# element (``body.text`` is read below) and an element with no
			# children is falsy, so a bare `if body:` would change behavior.
			if body != "" and body is not None:
				labels_temp = np.zeros(config.label_size)
				all_labels = 0
				# The five category tags were handled by five identical
				# copy-pasted loops; they differ only in the tag name.
				for tag in ("TOPICS", "PLACES", "PEOPLE", "ORGS", "EXCHANGES"):
					for a_topic in reuters.findall(tag):
						for a_d in a_topic.findall("D"):
							try:
								label_index = utils.find_label_index(a_d.text)
								labels_temp[label_index] = 1.0
								self.label_examples[label_index] += 1
								all_labels += 1
							except ValueError:
								# Unknown label code: skip it. (The original
								# set a write-only flag here; it was never read.)
								pass
				# Keep the document only if at least one label was recognised.
				if all_labels != 0:
					self.labels_train = np.append(self.labels_train, labels_temp)
					self.texts_train = np.append(self.texts_train, utils.stop_characters(body.text))