def Load_Each_Data(self, path=None, shuffle=False):
    """
    Load three-column tagging data (word ... label), one token per line,
    with blank lines separating sentences.

    :param path: data file path (required).
    :param shuffle: unused here; kept for interface compatibility.
    :return: list of Instance objects, one per sentence.
    """
    assert path is not None, "The Data Path Is Not Allow Empty."
    insts = []
    with open(path, encoding="UTF-8") as f:
        inst = Instance()
        now_line = 0
        for line in f:
            now_line += 1
            sys.stdout.write("\rhandling with the {} line".format(now_line))
            line = line.strip()
            if line == "" and len(inst.words) != 0:
                # Blank line terminates the current sentence.
                inst.words_size = len(inst.words)
                insts.append(inst)
                inst = Instance()
            elif line == "":
                # Leading/consecutive blank lines: nothing to flush. The old
                # code fell through to the token branch, where "".split(" ")
                # -> [""] tripped the three-column assert below.
                continue
            else:
                line = line.split(" ")
                assert len(line) == 3, "Error Format"
                word = line[0]
                inst.words.append(word.lower())
                inst.labels.append(line[2])
        if len(inst.words) != 0:
            # Flush the last sentence if the file lacks a trailing blank line.
            inst.words_size = len(inst.words)
            insts.append(inst)
    print("\n")
    return insts
def _Load_Each_Data(self, path=None, shuffle=False):
    """
    Read CoNLL-style NER data: one token per line, blank lines between
    sentences. For each token keep its character list, a digit-normalized
    word form, and the last column as the label.

    :param path: data file path (required).
    :param shuffle: unused here; kept for interface compatibility.
    :return: list of Instance objects, capped at self.max_count.
    """
    assert path is not None, "The Data Path Is Not Allow Empty."
    instances = []
    with open(path, encoding="UTF-8") as data_file:
        current = Instance()
        for raw in data_file:
            stripped = raw.strip()
            if stripped == "" and current.words:
                # Sentence boundary: finalize and start a fresh instance.
                current.words_size = len(current.words)
                instances.append(current)
                current = Instance()
            else:
                columns = stripped.split(" ")  # e.g. ['EU', ..., 'S-ORG']
                token = columns[0]
                # Per-character list for the char-level features.
                current.chars.append(self._add_char(token))
                # Map digits to '0' (e.g. engl7sh -> engl0sh).
                current.words.append(self._normalize_word(token))
                current.labels.append(columns[-1])
            if len(instances) == self.max_count:
                # max_count limits how much is read; -1 reads everything.
                break
        if current.words:
            current.words_size = len(current.words)
            instances.append(current)
    return instances
def _Load_Each_Data(self, path=None, shuffle=False):
    """
    Read CoNLL-style NER data: one token per line, blank lines between
    sentences. For each token keep its character list, a digit-normalized
    word form, and the last column as the label.

    :param path: data file path (required).
    :param shuffle: unused here; kept for interface compatibility.
    :return: list of Instance objects, capped at self.max_count.
    """
    assert path is not None, "The Data Path Is Not Allow Empty."
    insts = []
    with open(path, encoding="UTF-8") as f:
        inst = Instance()
        for line in f:
            line = line.strip()
            if line == "" and len(inst.words) != 0:
                # Blank line terminates the current sentence.
                inst.words_size = len(inst.words)
                insts.append(inst)
                inst = Instance()
            elif line == "":
                # Leading/consecutive blank lines previously fell through to
                # the token branch and appended an empty word; skip instead.
                continue
            else:
                line = line.split(" ")
                word = line[0]
                char = self._add_char(word)
                # Normalize digits to '0' (e.g. engl7sh -> engl0sh).
                word = self._normalize_word(word)
                inst.chars.append(char)
                inst.words.append(word)
                inst.labels.append(line[-1])
            if len(insts) == self.max_count:
                # max_count limits how much is read; -1 reads everything.
                break
        if len(inst.words) != 0:
            # Flush the last sentence if the file lacks a trailing blank line.
            inst.words_size = len(inst.words)
            insts.append(inst)
    return insts
def Load_Each_Data(self, path=None, shuffle=False):
    """
    Load two-column data (word, label), one token per line; blank lines
    separate sentences and -DOCSTART- marker lines are skipped.

    :param path: data file path (required).
    :param shuffle: unused here; kept for interface compatibility.
    :return: list of Instance objects, one per sentence.
    """
    assert path is not None, "The Data Path Is Not Allow Empty."
    sentences = []
    with open(path, encoding="UTF-8") as data_file:
        current = Instance()
        for line_no, raw in enumerate(data_file, start=1):
            sys.stdout.write("\rhandling with the {} line".format(line_no))
            text = raw.strip()
            if text == "":
                # Blank line: flush the sentence in progress (if any).
                if current.words:
                    current.words_size = len(current.words)
                    sentences.append(current)
                    current = Instance()
                continue
            parts = text.split(" ")
            assert len(parts) == 2, "Error Format"
            token = parts[0]
            if token == "-DOCSTART-":
                # Document delimiter, not a real token.
                continue
            current.words.append(token.lower())
            current.labels.append(parts[1])
        if current.words:
            # Flush the last sentence if the file lacks a trailing blank line.
            current.words_size = len(current.words)
            sentences.append(current)
    print("\n")
    return sentences
def _Load_Each_JsonData(self, path=None, path_id=0, train=False):
    """
    Load one JSON object per line with a "fact" text field plus
    accusation / relevant-article labels under "meta".

    :param path: data file path (required).
    :param path_id: index into self.bert_path when self.use_bert is set.
    :param train: unused here; kept for interface compatibility.
    :return: list of Instance objects, capped at self.max_count.
    """
    assert path is not None, "The Data Path Is Not Allow Empty."
    insts = []
    now_lines = 0
    with open(path, encoding="UTF-8") as f:
        for line in f:
            now_lines += 1
            if now_lines % 2000 == 0:
                sys.stdout.write("\rreading the {} line\t".format(now_lines))
            if line == "\n":
                # The old code only printed a warning and fell through to
                # json.loads("\n"), which raises JSONDecodeError; skip it.
                print("empty line")
                continue
            inst = Instance()
            line_json = json.loads(line)
            # Truncate the fact text to the training length budget.
            fact = line_json["fact"].split()[:self.max_train_len]
            bert_line = "".join(fact)
            # Accusation and law-article labels.
            accu = line_json["meta"]["accusation"]
            law = line_json["meta"]["relevant_articles"]
            inst.words = fact
            inst.bert_line = bert_line[:self.bert_max_char_length]
            inst.accu_labels = accu
            inst.law_labels = law
            inst.words_size = len(inst.words)
            inst.accu_labels_size = len(inst.accu_labels)
            inst.law_labels_size = len(inst.law_labels)
            insts.append(inst)
            if len(insts) == self.max_count:
                # max_count limits how much is read; -1 reads everything.
                break
    sys.stdout.write("\rreading the {} line\t".format(now_lines))
    if self.use_bert:
        insts = self._read_bert_file(insts, path=self.bert_path[path_id])
    return insts
def _Load_Each_Data(self, path=None, shuffle=False):
    """
    Load classification data: column 0 is the label, columns 2 onward are
    the tokens (column 1 is skipped -- presumably an ID/LDA column per the
    original "Have LDA" note; confirm against the data format).

    :param path: data file path (required).
    :param shuffle: unused here; kept for interface compatibility.
    :return: list of Instance objects, capped at self.max_count.
    """
    assert path is not None, "The Data Path Is Not Allow Empty."
    insts = []
    with open(path, encoding="UTF-8") as f:
        for line in f:
            tokens = line.strip().split()
            if not tokens:
                # A blank line used to raise IndexError on tokens[0]; skip.
                continue
            inst = Instance()
            inst.labels.append(tokens[0])
            inst.words = tokens[2:]
            inst.words_size = len(inst.words)
            insts.append(inst)
            if len(insts) == self.max_count:
                # max_count limits how much is read; -1 reads everything.
                break
    return insts
def _Load_Each_Data(self, path=None, path_id=None):
    """
    Load binary-classification data: column 0 is the label ("0"/"1"),
    the rest of the line is the text; lines with other labels are logged
    and skipped.

    :param path: data file path (required).
    :param path_id: index into self.bert_path when self.use_bert is set.
    :return: list of Instance objects, capped at self.max_count.
    """
    assert path is not None, "The Data Path Is Not Allow Empty."
    insts = []
    now_lines = 0
    with open(path, encoding="UTF-8") as f:
        for line in f:
            line = line.strip()
            now_lines += 1
            if now_lines % 200 == 0:
                sys.stdout.write("\rreading the {} line\t".format(now_lines))
            if line == "":
                # The old check compared the *stripped* line to "\n", which
                # never matched, so blank lines crashed on line[0]; skip.
                print("empty line")
                continue
            inst = Instance()
            line = line.split()
            label = line[0]
            word = " ".join(line[1:])
            if label not in ["0", "1"]:
                print("Error line: ", " ".join(line))
                continue
            inst.words = self._clean_str(word).split()
            inst.labels.append(label)
            inst.words_size = len(inst.words)
            insts.append(inst)
            if len(insts) == self.max_count:
                # max_count limits how much is read; -1 reads everything.
                break
    if self.use_bert:
        insts = self._read_bert_file(insts, path=self.bert_path[path_id])
    return insts
def _Load_Each_Data(self, path=None, shuffle=False):
    """
    Load segmented+POS-tagged data: each line is a sentence of "word_TAG"
    items separated by spaces. Builds the char sequence, gold SEP/APP
    actions (SEP carries the POS tag), gold segment/POS spans, and
    left/right bichar features per instance.

    :param path: data file path (required).
    :param shuffle: unused here; kept for interface compatibility.
    :return: list of Instance objects, capped at self.max_count.
    """
    assert path is not None, "The Data Path Is Not Allow Empty."
    insts = []
    with open(path, encoding="UTF-8") as f:
        lines = f.readlines()
        for index, line in enumerate(lines):
            # Normalize full-width/compatibility characters and strip "\n".
            line = unicodedata.normalize('NFKC', line.strip())
            # init instance
            inst = Instance()
            line = line.split(" ")
            inst.line = " ".join(line)
            count = 0
            for word_pos in line:
                # Each item is "word_TAG"; split on the first "_" so the
                # word itself may contain later underscores.
                word, _, label = word_pos.partition("_")
                word_length = len(word)
                inst.words.append(word)
                # Gold character-span for segmentation, e.g. "[0,2]".
                inst.gold_seg.append("[" + str(count) + "," + str(count + word_length) + "]")
                # Same span with the POS tag appended, e.g. "[0,2]NN".
                inst.gold_pos.append("[" + str(count) + "," + str(count + word_length) + "]" + label)
                count += word_length
                for i in range(word_length):
                    char = word[i]
                    inst.chars.append(char)
                    if i == 0:
                        # First char of a word: SEP action carrying the tag.
                        inst.gold.append(sep + "#" + label)
                        inst.pos.append(label)
                    else:
                        # Remaining chars: APP (append to current word).
                        inst.gold.append(app)
            char_number = len(inst.chars)
            for i in range(char_number):
                # Left bichar: previous char + current (nullkey at start).
                # NOTE: original used "i is 0"; identity comparison with an
                # int literal is implementation-defined (SyntaxWarning since
                # Python 3.8), so compare with == instead.
                if i == 0:
                    inst.bichars_left.append(nullkey + inst.chars[i])
                else:
                    inst.bichars_left.append(inst.chars[i - 1] + inst.chars[i])
                # Right bichar: current char + next (nullkey at end).
                if i == char_number - 1:
                    inst.bichars_right.append(inst.chars[i] + nullkey)
                else:
                    inst.bichars_right.append(inst.chars[i] + inst.chars[i + 1])
            # char/word sizes cached on the instance
            inst.chars_size = len(inst.chars)
            inst.words_size = len(inst.words)
            inst.bichars_size = len(inst.bichars_left)
            inst.gold_size = len(inst.gold)
            # add one inst that represents one sentence into the list
            insts.append(inst)
            if len(insts) == self.max_count:
                # max_count limits how much is read; -1 reads everything.
                break
    return insts