from typing import List, Tuple

from corpy.udpipe import Model


class UDPipeTokenizer:
    def __init__(self, udpipe_model_path):
        self.udpipe_model = Model(udpipe_model_path)

    def tokenize(self, sentence: str) -> List[Tuple[str, str]]:
        """
        :return: list of (POS, DEP_REL) tag pairs for each token in the sentence
        """
        s = list(self.udpipe_model.process(sentence))
        # filter out the artificial <root> word that UDPipe prepends to each sentence
        lst = [(item.upostag, item.deprel) for item in s[0].words if item.upostag != '<root>']
        return lst
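A minimal usage sketch for UDPipeTokenizer; the model file name below is a placeholder for any downloaded .udpipe model, not a path taken from the original code:

tokenizer = UDPipeTokenizer('english-lines-ud-2.5-191206.udpipe')   # placeholder model path
for upostag, deprel in tokenizer.tokenize('The quick brown fox jumps over the lazy dog.'):
    print(upostag, deprel)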
from corpy.udpipe import Model


class UdpipeTagger:
    def __init__(self, file=None, **kwargs):
        if file:
            self.model = Model(file)
        else:
            raise Exception("You should pass the model")

    def get_pos_tag(self, word):
        sent = list(self.model.process(word))[0]
        # a single word should parse to exactly two entries:
        # the artificial <root> plus the word itself
        if len(sent.words) != 2:
            print(word, sent.words)
        return sent.words[1].xpostag
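A usage sketch for UdpipeTagger, again with a placeholder model path; it returns the language-specific (xpostag) tag of the single parsed word:

tagger = UdpipeTagger(file='english-lines-ud-2.5-191206.udpipe')   # placeholder model path
print(tagger.get_pos_tag('running'))   # xpostag of the word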
def do_train(self) -> List[TResult]:
    """
    Tag our corpus with a pre-trained UDPipe model.
    The UDPipe models can be downloaded here:
    https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131
    :return:
    """
    model = Model(self.pre_model_name)
    # run the model over our corpus to get the POS of each word
    line_no = 1
    for sen in self.load_data():
        sen_clean = self.clean_data(sen)
        if not sen_clean:
            continue
        word_pos = list(model.process(sen_clean))
        for i, one_sentence in enumerate(word_pos):
            sentence_text = self.extract_one_sentence(one_sentence)
            results = self.extract_one_word(one_sentence, sentence_text)
            self.store_data.insert_data(self.cursor, results, self.language_name)
            print('line %d, batch %d for %s written successfully' % (line_no, i, self.language_name))
        line_no += 1
    print('all written successfully for corpus of %s' % self.our_corpus_name)
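TResult is used but never defined in these snippets. A minimal stand-in, consistent with how it is constructed in UdpipeTrain.extract_one_word below (TResult(word.lemma, word.upostag, sentence_text)) and read back via res.word; the field names are an assumption, not the original definition:

from collections import namedtuple

# hypothetical reconstruction of TResult; only the first field name ('word')
# is confirmed by usage (res.word), the other two are assumptions
TResult = namedtuple('TResult', ['word', 'pos', 'sentence'])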
from corpy.udpipe import Model
from conllu import parse


def corpy_udpipe(text, sent_level=True, model='english-lines-ud-2.5-191206.udpipe'):
    m = Model('../udpipe_model/' + model)
    print(model, "loaded successfully!")
    if sent_level:
        # text is a list of sentences
        all_pos, all_head, all_dep, all_tok = [], [], [], []
        for line in text:
            sent_pos, sent_head, sent_dep, sent_tok = [], [], [], []
            sents = list(m.process(line, out_format="conllu"))
            conllu = "".join(sents)
            parse_con = parse(conllu)
            # iterate over each word and append its POS/HEAD/DEPREL/FORM to the lists
            for i in range(len(parse_con)):
                for word in parse_con[i]:
                    sent_pos.append(word['upostag'])
                    sent_head.append(word['head'])
                    sent_dep.append(word['deprel'])
                    sent_tok.append(word['form'])
            # append the sentence-level lists to the doc
            all_pos.append(sent_pos)
            all_head.append(sent_head)
            all_dep.append(sent_dep)
            all_tok.append(sent_tok)
    # for doc-level: text is a list of documents, each a list of sentences
    else:
        all_pos, all_head, all_dep, all_tok = [], [], [], []
        for doc in text:
            pos_per_doc, head_per_doc, dep_per_doc, tok_per_doc = [], [], [], []
            for line in doc:
                sent_pos, sent_head, sent_dep, sent_tok = [], [], [], []
                sents = list(m.process(line, out_format="conllu"))
                conllu = "".join(sents)
                parse_con = parse(conllu)
                # iterate over each word and append its POS/HEAD/DEPREL/FORM to the lists
                for i in range(len(parse_con)):
                    for word in parse_con[i]:
                        sent_pos.append(word['upostag'])
                        sent_head.append(word['head'])
                        sent_dep.append(word['deprel'])
                        sent_tok.append(word['form'])
                # append the sentence-level lists to this document's lists
                pos_per_doc.append(sent_pos)
                head_per_doc.append(sent_head)
                dep_per_doc.append(sent_dep)
                tok_per_doc.append(sent_tok)
            all_pos.append(pos_per_doc)
            all_head.append(head_per_doc)
            all_dep.append(dep_per_doc)
            all_tok.append(tok_per_doc)
    return all_pos, all_head, all_dep, all_tok
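A sentence-level call sketch for corpy_udpipe, assuming the default English model file actually exists under ../udpipe_model/:

pos, head, dep, tok = corpy_udpipe(['The cat sat on the mat.', 'It purred.'])
print(tok[0])   # tokens of the first sentence
print(pos[0])   # their universal POS tags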
class UdpipeTrain(ITrain):
    def __init__(self, language_name, pre_model_name, our_corpus_name):
        """
        The languages of pre_model_name and our_corpus_name must be identical!
        :param language_name:
        :param pre_model_name: a pre-trained model from UDPipe
        :param our_corpus_name: the corpus we collected ourselves
        """
        self.language_name = language_name
        self.pre_model_name = pre_model_name
        self.our_corpus_name = our_corpus_name
        try:
            self.store_data = StoreData(db_config['user'], db_config['password'],
                                        db_host=db_config['db_host'],
                                        db_name=db_config['db_name'])
            self.cursor = self.store_data.db_connect().cursor()
            # then load the pre-trained UDPipe model
            self.model = Model(self.pre_model_name)
        except Exception as ex:
            print('database connection error: %s' % ex)

    def load_data(self) -> str:
        with open(self.our_corpus_name, 'r') as f:
            for sen in f:
                print('loading one sentence: %s' % (sen,))
                yield sen
        print('loading done for our corpus')

    def clean_data(self, data: str) -> str:
        """
        data is one or several sentence(s); if data contains \n, \t,
        empty strings, etc., strip them out
        :param data: raw data
        :return: data after cleaning
        """
        cleaned_data = re.sub('[\n\t]+', '', data)
        return cleaned_data

    def do_train(self) -> List[TResult]:
        """
        Tag our corpus with a pre-trained UDPipe model.
        The UDPipe models can be downloaded here:
        https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131
        :return:
        """
        # run the model over our corpus to get the POS of each word
        line_no = 1
        for sen in self.load_data():
            sen_clean = self.clean_data(sen)
            if not sen_clean:
                continue
            word_pos = list(self.model.process(sen_clean))
            for i, one_sentence in enumerate(word_pos):
                sentence_text = self.extract_one_sentence(one_sentence)
                results = self.extract_one_word(one_sentence, sentence_text)
                self.store_data.insert_data(self.cursor, results, self.language_name)
                print('line %d, batch %d for %s written successfully' % (line_no, i, self.language_name))
            line_no += 1
        print('all written successfully for corpus of %s' % self.our_corpus_name)

    def extract_one_sentence(self, sentence) -> str:
        """
        This private method is mainly used to extract the sentence text from
        an instance of a UDPipe Sentence:
        Sentence(
          comments=[
            '# sent_id = 3',
            '# text = 黄土高原严寒而漫长的冬天看来就要过去,但那真正温暖的春天还远远地没有到来。'],
          words=[
            Word(id=0, <root>),
            Word(id=1, form='黄土', lemma='黄土', xpostag='NNP', upostag='PROPN', head=3, deprel='nmod', misc='SpaceAfter=No'),
            Word(id=2, form='高原', lemma='高原', xpostag='NN', upostag='NOUN', head=3, deprel='nmod', misc='SpaceAfter=No'),
            Word(id=3, form='严寒', lemma='严寒', xpostag='NN', upostag='NOUN', head=22, deprel='nsubj', misc='SpaceAfter=No'),
            remaining words omitted
          ])
        :param sentence: UDPipe Sentence
        :return: str, e.g. 黄土高原严寒而漫长的冬天看来就要过去,但那真正温暖的春天还远远地没有到来。
                 ("The bitter and long winter of the Loess Plateau seems about to
                 pass, but the truly warm spring is still far from arriving.")
        """
        comment = ''.join(sentence.comments)
        try:
            cs = re.findall(r'text = (.*)', comment)[0]
            return cs
        except Exception as e:
            # TODO: need to write a warning log
            print('error: sentence text not found', e)
            return ''

    def extract_one_word(self, sentence, sentence_text: str) -> [TResult]:
        """
        This private method is mainly used to extract one word and its POS
        :param sentence_text:
        :param sentence:
        :return: [TResult]
        """
        r = []
        for word in sentence.words:
            if word.lemma and word.lemma not in ITrain.FILTER_WORD:
                if word.lemma and word.upostag and sentence_text:
                    r.append(TResult(word.lemma, word.upostag, sentence_text))
        return r

    def word_segmentation(self, sentence) -> List[str]:
        """
        :param sentence:
        :return: word list
        """
        sen_clean = self.clean_data(sentence)
        if not sen_clean:
            return []
        word_pos = list(self.model.process(sen_clean))
        words = []
        for i, one_sentence in enumerate(word_pos):
            sentence_text = self.extract_one_sentence(one_sentence)
            results = self.extract_one_word(one_sentence, sentence_text)
            words.extend([res.word for res in results])
        return words
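A driver sketch for UdpipeTrain; every name and path below is a placeholder, and do_train additionally requires the StoreData database configured in __init__ (word_segmentation only needs the loaded model):

trainer = UdpipeTrain('chinese',                            # placeholder language name
                      'chinese-gsd-ud-2.5-191206.udpipe',   # placeholder pre-trained model
                      'our_corpus.txt')                     # placeholder corpus file
print(trainer.word_segmentation('黄土高原严寒而漫长的冬天看来就要过去。'))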
# (snippet begins mid-statement: a corpus file is opened for binary reading)
          'rb') as f:
    for line in f.readlines():
        data.append(str(line, 'utf-8'))

model = 'german-hdt-ud-2.5-191206.udpipe'
m = Model('../udpipe_model/' + model)
print(model, "loaded successfully!")

all_pos = []
for line in data:
    sent_pos = []
    sents = list(m.process(line, out_format="conllu"))
    conllu = "".join(sents)
    parse_con = parse(conllu)
    # iterate over each word and append its POS to the sentence list
    for i in parse_con[0]:
        sent_pos.append(i['upostag'])
    # append the sentence POS list to the doc
    all_pos.append(sent_pos)

# write the POS list into a file (the write itself is cut off in the source)
with open('de_pos', 'wb') as f:
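One plausible way to finish the truncated write above, assuming pickle serialization; the serializer is an assumption, since the original call is not shown:

import pickle

with open('de_pos', 'wb') as f:   # 'wb' suggests a binary serializer such as pickle
    pickle.dump(all_pos, f)       # assumption: the original may have used another format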
from corpy.udpipe import Model
from corpy.udpipe import pprint

m = Model("/home/zglg/SLU/psd/pre-model/classical_chinese-kyoto-ud-2.5-191206.udpipe")
# "我爱北京天安门. 天安门上好风景" ~ "I love Beijing's Tiananmen. Fine scenery on Tiananmen."
sents = list(m.process("我爱北京天安门. 天安门上好风景"))
pprint(sents)
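Because Model.process with no out_format yields UDPipe Sentence objects like the one illustrated in the docstring further up, the parse can also be consumed directly instead of pretty-printed; words[0] is the artificial <root>:

for sent in sents:
    for word in sent.words[1:]:   # skip the artificial <root> word
        print(word.form, word.upostag, word.head, word.deprel)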