def __read_datas__(fname, tokenizer, fname1):
    """Load a sentence-level dataset: one sentence per line, labels in a second file.

    Args:
        fname: Path of the text file (one sentence per line). The pickle
            files ``fname + '.graph'`` and ``fname + '.edgevocab'`` are
            also opened and loaded.
        tokenizer: Object whose ``text_to_sequence(text, True)`` returns a
            pair; the two items are stored as ``text_indices`` and
            ``tran_indices``.
        fname1: Path of the label file; line ``i`` holds the polarity
            string for sentence ``i``.

    Returns:
        A list with one dict per sentence. Aspect-related keys (``aspect``,
        ``context_indices``, ``span_indices``, ``aspect_indices``,
        ``left_indices``) are always ``None`` here.

    Raises:
        AssertionError: If the first element of a sentence's graph entry
            does not have one item per token of the sentence.
        IndexError: If the label file has fewer lines than the text file.
    """
    open_kwargs = dict(encoding='utf-8', newline='\n', errors='ignore')
    with open(fname, 'r', **open_kwargs) as text_file:
        lines = text_file.readlines()
    with open(fname1, 'r', **open_kwargs) as label_file:
        lines1 = label_file.readlines()

    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at graph/vocab files that were produced by a trusted preprocessing run.
    with open(fname + '.graph', 'rb') as graph_file, \
            open(fname + '.edgevocab', 'rb') as edge_file:
        idx2gragh = pickle.load(graph_file)
        # The edge vocabulary is not used below; it is still loaded so that a
        # missing or corrupt file fails here exactly as it did before.
        pickle.load(edge_file)

    # Any label other than 'negative' / 'neutral' maps to 2.
    polarity_ids = {'negative': 0, 'neutral': 1}

    all_data = []
    for i in tqdm.tqdm(range(len(lines))):
        pola = lines1[i].strip()
        # Lower-case, trim, and collapse runs of spaces into a single space.
        concats = re.sub(r' {2,}', ' ', lines[i].lower().strip())
        text_indices, tran_indices = tokenizer.text_to_sequence(concats, True)
        polarity = polarity_ids.get(pola, 2)

        dependency_graph = idx2gragh[i]
        # `concats` is already lower-cased and stripped, so one tokenization
        # serves both the consistency check and the stored 'text' field.
        tokens = tokenize(concats)
        assert len(dependency_graph[0]) == len(tokens), (
            'graph/token length mismatch at line %d' % i)

        all_data.append({
            'text': tokens,
            'aspect': None,
            'text_indices': text_indices,
            'tran_indices': tran_indices,
            'context_indices': None,
            'span_indices': None,
            'aspect_indices': None,
            'left_indices': None,
            'polarity': polarity,
            'dependency_graph': dependency_graph,
        })
    return all_data
def span(texts, aspect):
    """Return the token spans occupied by ``aspect`` between text segments.

    ``texts`` holds the segments that surround each occurrence of the
    aspect. Walking the segments in order, every segment except the last is
    followed by one aspect occurrence, recorded as ``[start, end]`` token
    offsets (``end`` exclusive) counted with the file's ``tokenize`` helper.

    Args:
        texts: Sequence of text segments.
        aspect: Aspect string that sits between consecutive segments.

    Returns:
        A list of ``[start, end]`` pairs, one per gap between segments.
    """
    aspect_width = len(tokenize(aspect))
    found = []
    cursor = 0
    for position, segment in enumerate(texts):
        cursor += len(tokenize(segment))
        # The last segment has no aspect after it.
        if position >= len(texts) - 1:
            continue
        begin = cursor
        cursor = begin + aspect_width
        found.append([begin, cursor])
    return found
def __read_data__(fname, tokenizer, fname1):
    """Load an aspect-level dataset stored as three-line records.

    Each record in ``fname`` is three consecutive lines: a sentence in which
    the aspect is replaced by ``$T$``, the aspect text, and an integer
    polarity. The stored polarity is that integer plus one.

    Args:
        fname: Path of the record file. ``fname + '.graph'`` is loaded as a
            pickle and indexed by the line number of each record's first
            line (0, 3, 6, ...).
        tokenizer: Object providing ``text_to_sequence(text)`` and
            ``text_to_sequence(text, True)``; the latter returns a pair.
        fname1: Path prefix; ``fname1 + '.edgevocab'`` is loaded as a pickle.

    Returns:
        A list with one dict per record.

    Raises:
        AssertionError: If no aspect span is found, or if the first element
            of the record's graph entry does not have one item per token.
        IndexError: If the file ends in the middle of a record.
        ValueError: If a polarity line is not an integer.
    """
    with open(fname, 'r', encoding='utf-8', newline='\n',
              errors='ignore') as text_file:
        lines = text_file.readlines()

    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at graph/vocab files that were produced by a trusted preprocessing run.
    with open(fname + '.graph', 'rb') as graph_file, \
            open(fname1 + '.edgevocab', 'rb') as edge_file:
        idx2gragh = pickle.load(graph_file)
        # The edge vocabulary is not used below; it is still loaded so that a
        # missing or corrupt file fails here exactly as it did before.
        pickle.load(edge_file)

    all_data = []
    for i in tqdm.tqdm(range(0, len(lines), 3)):
        text_left = [s.lower().strip() for s in lines[i].split("$T$")]
        aspect = lines[i + 1].lower().strip()
        polarity = int(lines[i + 2].strip()) + 1

        span_indices = span(text_left, aspect)
        assert len(span_indices) >= 1, (
            'no $T$ placeholder in record starting at line %d' % i)

        concats = concat(text_left, aspect)
        text_indices, tran_indices = tokenizer.text_to_sequence(concats, True)
        # NOTE(review): the next two calls tokenize the same string again;
        # kept as separate calls so each field owns an independent result.
        context_indices = tokenizer.text_to_sequence(concats)
        aspect_indices = tokenizer.text_to_sequence(aspect)
        left_indices = tokenizer.text_to_sequence(concats)

        dependency_graph = idx2gragh[i]
        assert len(dependency_graph[0]) == len(tokenize(concats)), (
            'graph/token length mismatch in record starting at line %d' % i)

        all_data.append({
            'text': tokenize(concats.lower().strip()),
            'aspect': tokenize(aspect),
            'text_indices': text_indices,
            'tran_indices': tran_indices,
            'context_indices': context_indices,
            'span_indices': span_indices,
            'aspect_indices': aspect_indices,
            'left_indices': left_indices,
            'polarity': polarity,
            'dependency_graph': dependency_graph,
        })
    return all_data
def fit_on_text(self, text):
    """Add every unseen token of ``text`` to the vocabulary.

    The text is lower-cased and stripped, then split with the file's
    ``tokenize`` helper. Each token not yet in ``self.word2idx`` receives the
    current ``self.idx`` as its id (recorded in both ``self.word2idx`` and
    ``self.idx2word``), after which ``self.idx`` is advanced by one.

    Args:
        text: Raw text whose tokens should be registered.
    """
    for token in tokenize(text.lower().strip()):
        if token in self.word2idx:
            continue
        next_id = self.idx
        self.word2idx[token] = next_id
        self.idx2word[next_id] = token
        self.idx = next_id + 1
def text_to_sequence(self, text, tran=False):
    """Convert ``text`` to word-piece ids wrapped in ``[CLS]`` ... ``[SEP]``.

    The text is lower-cased, stripped and split into words with the file's
    ``tokenize`` helper; each word is then split into pieces by
    ``self.tokenizer._tokenize`` and every piece is mapped to an id with
    ``self.tokenizer._convert_token_to_id``.

    Args:
        text: Raw text to encode.
        tran: When true, also return the word-to-piece alignment.

    Returns:
        The id list, or ``(ids, spans)`` when ``tran`` is true. ``spans`` has
        one ``[start, end]`` pair (``end`` exclusive) per word, indexing the
        flat piece list *before* ``[CLS]`` is prepended, so the pieces of
        word ``k`` sit at ``ids[start + 1:end + 1]``.
    """
    words = tokenize(text.lower().strip())

    trans = []
    realwords = []
    for word in words:
        first_piece = len(realwords)
        realwords.extend(self.tokenizer._tokenize(word))
        trans.append([first_piece, len(realwords)])

    to_id = self.tokenizer._convert_token_to_id
    # The result always holds at least [CLS] and [SEP], so the former
    # empty-sequence fallback could never trigger and has been dropped.
    sequence = (
        [to_id('[CLS]')]
        + [to_id(piece) for piece in realwords]
        + [to_id('[SEP]')]
    )

    if tran:
        return sequence, trans
    return sequence