Example #1
    # Needs pickle, re and tqdm imported at module level; tokenize() is a helper defined elsewhere.
    def __read_datas__(fname, tokenizer, fname1):
        # Sentences, one per line.
        fin = open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
        lines = fin.readlines()
        fin.close()
        # Polarity labels, one per line, aligned with the sentences.
        fin = open(fname1, 'r', encoding='utf-8', newline='\n', errors='ignore')
        lines1 = fin.readlines()
        fin.close()
        # Pre-built dependency graphs and edge vocabulary (pickled).
        fin = open(fname + '.graph', 'rb')
        fin1 = open(fname + '.edgevocab', 'rb')
        idx2gragh = pickle.load(fin)
        edgevocab = pickle.load(fin1)  # loaded but not used in this reader
        fin1.close()
        fin.close()

        all_data = []
        for i in tqdm.tqdm(range(0, len(lines), 1)):
            pola = lines1[i].strip()
            # Collapse runs of spaces before tokenizing.
            concats = re.sub(r' {2,}', ' ', lines[i].lower().strip())
            text_indices, tran_indices = tokenizer.text_to_sequence(concats, True)
            if pola == 'negative':
                polarity = 0
            elif pola == 'neutral':
                polarity = 1
            else:
                polarity = 2

            dependency_graph = idx2gragh[i]
            # The graph must have one row/column per token of the sentence.
            assert len(idx2gragh[i][0]) == len(tokenize(concats))
            data = {
                'text': tokenize(concats.lower().strip()),
                'aspect': None,
                'text_indices': text_indices,
                'tran_indices': tran_indices,
                'context_indices': None,
                'span_indices': None,
                'aspect_indices': None,
                'left_indices': None,
                'polarity': polarity,
                'dependency_graph': dependency_graph,
            }

            all_data.append(data)
        return all_data
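
The reader above walks two parallel plain-text files plus a pickled graph file. A minimal sketch of the inputs it expects, inferred from the indexing in the loop; the names and contents below are made up for illustration and are not taken from any real dataset:

# Illustrative only: file names and contents are assumptions.
sample_sentences = "the battery life is great\nthe service was slow\n"   # contents of fname
sample_labels = "positive\nnegative\n"                                    # contents of fname1
# fname + '.graph' holds a pickled mapping from line index to a dependency
# adjacency matrix with one row/column per whitespace token of that line.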
Example #2
def span(texts, aspect):
    # Return the [start, end) token spans that the aspect occupies once it is
    # re-inserted between the pieces of `texts` (the sentence split on "$T$").
    startid = 0
    aslen = len(tokenize(aspect))
    spans = []
    for idx, text in enumerate(texts):
        startid += len(tokenize(text))
        tmp = startid
        if idx < len(texts) - 1:
            startid += aslen
            spans.append([tmp, startid])
    return spans
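
A small self-contained check of span(), assuming tokenize() is a plain whitespace splitter; the helper itself is not shown in these examples and the sentence is made up:

def tokenize(text):   # assumption: whitespace tokenizer
    return text.split()

text_left = ["the", "was great but the service was slow"]   # "the $T$ was great ..." split on "$T$"
aspect = "battery life"
print(span(text_left, aspect))   # [[1, 3]]: the aspect occupies token positions 1 and 2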
Example #3
    # Needs pickle and tqdm imported at module level; tokenize(), span() and concat() are helpers defined elsewhere.
    def __read_data__(fname, tokenizer, fname1):
        fin = open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
        lines = fin.readlines()
        fin.close()
        # Pre-built dependency graphs and edge vocabulary (pickled).
        fin = open(fname + '.graph', 'rb')
        fin1 = open(fname1 + '.edgevocab', 'rb')
        idx2gragh = pickle.load(fin)
        edgevocab = pickle.load(fin1)  # loaded but not used in this reader
        fin1.close()
        fin.close()

        all_data = []
        # Each record spans three lines: sentence with "$T$", aspect term, polarity.
        for i in tqdm.tqdm(range(0, len(lines), 3)):
            text_left = [s.lower().strip() for s in lines[i].split("$T$")]
            aspect = lines[i + 1].lower().strip()
            polarity = lines[i + 2].strip()
            span_indices = span(text_left, aspect)
            assert len(span_indices) >= 1
            # Re-insert the aspect term where "$T$" was.
            concats = concat(text_left, aspect)
            text_indices, tran_indices = tokenizer.text_to_sequence(concats, True)
            context_indices = tokenizer.text_to_sequence(concats)
            aspect_indices = tokenizer.text_to_sequence(aspect)
            left_indices = tokenizer.text_to_sequence(concats)
            # Shift the {-1, 0, 1} label to {0, 1, 2}.
            polarity = int(polarity) + 1
            dependency_graph = idx2gragh[i]
            # The graph must have one row/column per token of the sentence.
            assert len(idx2gragh[i][0]) == len(tokenize(concats))
            data = {
                'text': tokenize(concats.lower().strip()),
                'aspect': tokenize(aspect),
                'text_indices': text_indices,
                'tran_indices': tran_indices,
                'context_indices': context_indices,
                'span_indices': span_indices,
                'aspect_indices': aspect_indices,
                'left_indices': left_indices,
                'polarity': polarity,
                'dependency_graph': dependency_graph,
            }

            all_data.append(data)
        return all_data
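
Example #3 reads a three-line record layout; a hedged sketch of one record, reconstructed from the loop stride of 3 and the int(polarity) + 1 shift (the sentence itself is illustrative):

# Illustrative record, not taken from any real dataset file:
record = (
    "the $T$ was great but the service was slow\n"   # lines[3*i]: sentence with the aspect masked as $T$
    "battery life\n"                                  # lines[3*i + 1]: aspect term
    "1\n"                                             # lines[3*i + 2]: polarity in {-1, 0, 1}
)
# concat(text_left, aspect) re-inserts the aspect where "$T$" stood, and
# int(polarity) + 1 maps the label onto {0, 1, 2}.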
Example #4
    def fit_on_text(self, text):
        # Add every unseen token of `text` to the word <-> index maps.
        text = text.lower().strip()
        words = tokenize(text)
        for word in words:
            if word not in self.word2idx:
                self.word2idx[word] = self.idx
                self.idx2word[self.idx] = word
                self.idx += 1
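
A self-contained sketch of the vocabulary object fit_on_text appears to belong to, assuming only the word2idx / idx2word dicts and the running idx counter used above, with a whitespace tokenize(); the real class is not shown in these examples:

class Vocab:                                   # hypothetical container, for illustration only
    def __init__(self):
        self.word2idx, self.idx2word, self.idx = {}, {}, 0

    def fit_on_text(self, text):               # same logic as Example #4
        for word in text.lower().strip().split():
            if word not in self.word2idx:
                self.word2idx[word] = self.idx
                self.idx2word[self.idx] = word
                self.idx += 1

v = Vocab()
v.fit_on_text("The battery life is great")
print(v.word2idx)   # {'the': 0, 'battery': 1, 'life': 2, 'is': 3, 'great': 4}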
Example #5
    def text_to_sequence(self, text, tran=False):
        text = text.lower().strip()
        words = tokenize(text)
        trans = []
        realwords = []
        for word in words:
            # Split each whitespace token into wordpieces and record the
            # [start, end) wordpiece span it maps to.
            wordpieces = self.tokenizer._tokenize(word)
            tmplen = len(realwords)
            realwords.extend(wordpieces)
            trans.append([tmplen, len(realwords)])
        # Wrap the wordpiece ids in [CLS] ... [SEP].
        sequence = ([self.tokenizer._convert_token_to_id('[CLS]')]
                    + [self.tokenizer._convert_token_to_id(w) for w in realwords]
                    + [self.tokenizer._convert_token_to_id('[SEP]')])
        if len(sequence) == 0:  # defensive fallback kept from the original
            sequence = [0]
        if tran:
            return sequence, trans
        return sequence
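
The trans list is a word-to-wordpiece alignment. A hedged sketch of the same bookkeeping using the public Hugging Face transformers API, assuming the wrapped tokenizer is BERT-style; the exact pieces depend on the vocabulary, so the printed values are only indicative:

from transformers import BertTokenizer   # assumption: a BERT wordpiece tokenizer is wrapped

bert = BertTokenizer.from_pretrained('bert-base-uncased')
words = "the battery life is unbeatable".split()
trans, realwords = [], []
for word in words:
    pieces = bert.tokenize(word)          # public counterpart of _tokenize
    trans.append([len(realwords), len(realwords) + len(pieces)])
    realwords.extend(pieces)
print(realwords)   # e.g. ['the', 'battery', 'life', 'is', 'un', '##beat', '##able']
print(trans)       # e.g. [[0, 1], [1, 2], [2, 3], [3, 4], [4, 7]]
ids = bert.convert_tokens_to_ids(['[CLS]'] + realwords + ['[SEP]'])   # mirrors `sequence` above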