def load_model(self):
    extension = "-ud-{0}.udpipe".format(UD_VERSION)
    udpipe_model = udpipe_models[self.language] + extension
    model_file = os.path.join("models", udpipe_model)
    if not os.path.exists(model_file):
        # download the model from the LINDAT repository if it is not cached locally
        url = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3131/{0}".format(udpipe_model)
        print(url)
        wget.download(url, out="models")
    return Model(model_file)
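# A minimal standalone sketch of the same download-then-load pattern, assuming
# the `wget` package and a hypothetical English model file name; the URL scheme
# is the LINDAT repository path used in load_model() above.
import os

import wget
from corpy.udpipe import Model

model_file = os.path.join("models", "english-ewt-ud-2.5-191206.udpipe")  # assumed file name
if not os.path.exists(model_file):
    url = ("https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/"
           "11234/1-3131/" + os.path.basename(model_file))
    wget.download(url, out="models")
model = Model(model_file)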
from typing import List, Tuple

from corpy.udpipe import Model


class UDPipeTokenizer:
    def __init__(self, udpipe_model_path):
        self.udpipe_model = Model(udpipe_model_path)

    def tokenize(self, sentence: str) -> List[Tuple[str, str]]:
        """
        :return: list of (POS, DEP_REL) tag pairs for each token in the sentence
        """
        # only the first parsed sentence is used
        s = list(self.udpipe_model.process(sentence))
        lst = [(item.upostag, item.deprel)
               for item in s[0].words
               if item.upostag != '<root>']
        return lst
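# A possible usage sketch for UDPipeTokenizer above; the model path is an assumption:
tokenizer = UDPipeTokenizer("models/english-ewt-ud-2.5-191206.udpipe")
pairs = tokenizer.tokenize("The cat sat on the mat.")
# pairs is a list of (upostag, deprel) tuples, one per token of the first sentence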
def __init__(self, language_name, pre_model_name, our_corpus_name):
    """
    The languages of pre_model_name and our_corpus_name must be identical!
    :param language_name:
    :param pre_model_name: path to a pre-trained udpipe model
    :param our_corpus_name: path to our own corpus file
    """
    self.language_name = language_name
    self.pre_model_name = pre_model_name
    self.our_corpus_name = our_corpus_name
    try:
        self.store_data = StoreData(db_config['user'],
                                    db_config['password'],
                                    db_host=db_config['db_host'],
                                    db_name=db_config['db_name'])
        self.cursor = self.store_data.db_connect().cursor()
        # load the pre-trained udpipe model
        self.model = Model(self.pre_model_name)
        self._word_count, self.MAX_WORD_COUNT = 0, 500000
    except Exception as ex:
        print('database connection error: %s' % ex)
def do_train(self) -> List[TResult]:
    """
    Run a pre-trained udpipe model over our corpus to get the results.
    The udpipe models can be downloaded here:
    https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131
    :return:
    """
    model = Model(self.pre_model_name)
    # tag our corpus to get the POS of each word
    line_no = 1
    for sen in self.load_data():
        sen_clean = self.clean_data(sen)
        if not sen_clean:
            continue
        word_pos = list(model.process(sen_clean))
        for i, one_sentence in enumerate(word_pos):
            sentence_text = self.extract_one_sentence(one_sentence)
            results = self.extract_one_word(one_sentence, sentence_text)
            self.store_data.insert_data(self.cursor, results, self.language_name)
            print('line %d, batch %d for %s written successfully' % (line_no, i, self.language_name))
        line_no += 1
    print('all written successfully for corpus %s' % self.our_corpus_name)
from corpy.udpipe import Model


class UdpipeTagger:
    def __init__(self, file=None, **kwargs):
        if file:
            self.model = Model(file)
        else:
            raise Exception("You should pass the model")

    def get_pos_tag(self, word):
        sent = list(self.model.process(word))[0]
        if len(sent.words) != 2:
            # sent.words[0] is the <root> pseudo-word, so a single token yields
            # exactly two words; warn if the input was split differently
            print(word, sent.words)
        return sent.words[1].xpostag
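# A possible usage sketch for UdpipeTagger above; the model path is an assumption:
tagger = UdpipeTagger(file="models/english-ewt-ud-2.5-191206.udpipe")
tag = tagger.get_pos_tag("cats")  # language-specific xpostag of the single token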
def _check_tokenizer(self, lang):
    if lang not in self.models:
        model_path = os.path.join(FILE_PATH, "udpipe", "models",
                                  self._lang2tokenizer_modelname(lang))
        self.models[lang] = Model(model_path)
def _check_model(self, lang):
    """Check whether the model for `lang` is loaded, and load it if needed."""
    if lang not in self.models:
        model_path = os.path.join(FILE_PATH, 'udpipe', 'models',
                                  self._lang2modelname(lang))
        self.models[lang] = Model(model_path)
from text_to_data import Doc2Data
from text_to_data import CalculatePair
from model import train_model, get_data
from progress.bar import Bar
import os
from corpy.udpipe import Model
import stopwords
from random import randint
import numpy as np

m = Model("russian-syntagrus-ud-2.5-191206.udpipe")
stop = stopwords.get_stopwords('ru')

# postfix can be _sm for the demo corpus, _med for the second part, _all for the whole corpus
postfix = '_all'
text_folder_name = 'texts' + postfix + '/'
data_folder_name = 'data' + postfix + '/'


def make_data_from_texts():
    # walk the folder and process every text into json
    all_texts = os.listdir(text_folder_name)
    for text in Bar(' text parsing...').iter(all_texts):
        Doc2Data(text_folder_name + text, m, stop, data_folder_name)


def make_pairs(authors):
    all_texts = os.listdir(text_folder_name)
    texts = open('db' + postfix + '.csv', 'r').read().split('\n')[:authors]
    text = []
    for i in texts:
def corpy_udpipe(text, sent_level=True, model='english-lines-ud-2.5-191206.udpipe'):
    m = Model('../udpipe_model/' + model)
    print(model, "loaded successfully!")
    if sent_level:
        all_pos = []
        all_head = []
        all_dep = []
        all_tok = []
        for line in text:
            sent_pos = []
            sent_head = []
            sent_dep = []
            sent_tok = []
            sents = list(m.process(line, out_format="conllu"))
            conllu = "".join(sents)
            parse_con = parse(conllu)
            # iterate over each word and append its POS/HEAD/UD to the sentence lists
            for i in range(len(parse_con)):
                for word in parse_con[i]:
                    sent_pos.append(word['upostag'])
                    sent_head.append(word['head'])
                    sent_dep.append(word['deprel'])
                    sent_tok.append(word['form'])
            # append the sentence lists to the doc
            all_pos.append(sent_pos)
            all_head.append(sent_head)
            all_dep.append(sent_dep)
            all_tok.append(sent_tok)
    else:
        # doc-level
        all_pos = []
        all_head = []
        all_dep = []
        all_tok = []
        for doc in text:
            pos_per_doc = []
            head_per_doc = []
            dep_per_doc = []
            tok_per_doc = []
            for line in doc:
                sent_pos = []
                sent_head = []
                sent_dep = []
                sent_tok = []
                sents = list(m.process(line, out_format="conllu"))
                conllu = "".join(sents)
                parse_con = parse(conllu)
                # iterate over each word and append its POS/HEAD/UD to the sentence lists
                for i in range(len(parse_con)):
                    for word in parse_con[i]:
                        sent_pos.append(word['upostag'])
                        sent_head.append(word['head'])
                        sent_dep.append(word['deprel'])
                        sent_tok.append(word['form'])
                # append the sentence lists to the current doc
                pos_per_doc.append(sent_pos)
                head_per_doc.append(sent_head)
                dep_per_doc.append(sent_dep)
                tok_per_doc.append(sent_tok)
            all_pos.append(pos_per_doc)
            all_head.append(head_per_doc)
            all_dep.append(dep_per_doc)
            all_tok.append(tok_per_doc)
    return all_pos, all_head, all_dep, all_tok
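# A possible sentence-level call of corpy_udpipe() above, assuming the default
# English model file exists under ../udpipe_model/:
sentences = ["The cat sat on the mat.", "It was asleep."]
pos, heads, deps, toks = corpy_udpipe(sentences, sent_level=True)
# pos[0] holds the UPOS tags of the first sentence, toks[0] its surface tokens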
class UdpipeTrain(ITrain):
    def __init__(self, language_name, pre_model_name, our_corpus_name):
        """
        The languages of pre_model_name and our_corpus_name must be identical!
        :param language_name:
        :param pre_model_name: path to a pre-trained udpipe model
        :param our_corpus_name: path to our own corpus file
        """
        self.language_name = language_name
        self.pre_model_name = pre_model_name
        self.our_corpus_name = our_corpus_name
        try:
            self.store_data = StoreData(db_config['user'],
                                        db_config['password'],
                                        db_host=db_config['db_host'],
                                        db_name=db_config['db_name'])
            self.cursor = self.store_data.db_connect().cursor()
            # load the pre-trained udpipe model
            self.model = Model(self.pre_model_name)
        except Exception as ex:
            print('database connection error: %s' % ex)

    def load_data(self) -> str:
        with open(self.our_corpus_name, 'r') as f:
            for sen in f:
                print('loading one sentence: %s' % (sen,))
                yield sen
        print('loading done for our corpus')

    def clean_data(self, data: str) -> str:
        """
        data is one or several sentence(s); if it contains \n, \t, etc.,
        remove them
        :param data: raw data
        :return: data after cleaning
        """
        cleaned_data = re.sub('[\n\t]+', '', data)
        return cleaned_data

    def do_train(self) -> List[TResult]:
        """
        Run the pre-trained udpipe model over our corpus to get the results.
        The udpipe models can be downloaded here:
        https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131
        :return:
        """
        # tag our corpus to get the POS of each word
        line_no = 1
        for sen in self.load_data():
            sen_clean = self.clean_data(sen)
            if not sen_clean:
                continue
            word_pos = list(self.model.process(sen_clean))
            for i, one_sentence in enumerate(word_pos):
                sentence_text = self.extract_one_sentence(one_sentence)
                results = self.extract_one_word(one_sentence, sentence_text)
                self.store_data.insert_data(self.cursor, results, self.language_name)
                print('line %d, batch %d for %s written successfully' % (line_no, i, self.language_name))
            line_no += 1
        print('all written successfully for corpus %s' % self.our_corpus_name)

    def extract_one_sentence(self, sentence) -> str:
        """
        This private method is mainly used to extract the sentence text from
        an instance of a udpipe Sentence:
        Sentence(
          comments=[
            '# sent_id = 3',
            '# text = 黄土高原严寒而漫长的冬天看来就要过去,但那真正温暖的春天还远远地没有到来。'],
          words=[
            Word(id=0, <root>),
            Word(id=1, form='黄土', lemma='黄土', xpostag='NNP', upostag='PROPN', head=3, deprel='nmod', misc='SpaceAfter=No'),
            Word(id=2, form='高原', lemma='高原', xpostag='NN', upostag='NOUN', head=3, deprel='nmod', misc='SpaceAfter=No'),
            Word(id=3, form='严寒', lemma='严寒', xpostag='NN', upostag='NOUN', head=22, deprel='nsubj', misc='SpaceAfter=No'),
            ... (omitted)
          ])
        :param sentence: udpipe Sentence
        :return: str, e.g. 黄土高原严寒而漫长的冬天看来就要过去,但那真正温暖的春天还远远地没有到来。
        """
        comment = ''.join(sentence.comments)
        try:
            cs = re.findall(r'text = (.*)', comment)[0]
            return cs
        except Exception as e:
            # TODO: need to write a warning log
            print('error: sentence text not found', e)
            return ''

    def extract_one_word(self, sentence, sentence_text: str) -> [TResult]:
        """
        This private method is mainly used to extract each word and its POS.
        :param sentence_text:
        :param sentence:
        :return: [TResult]
        """
        r = []
        for word in sentence.words:
            if word.lemma and word.lemma not in ITrain.FILTER_WORD:
                if word.lemma and word.upostag and sentence_text:
                    r.append(TResult(word.lemma, word.upostag, sentence_text))
        return r

    def word_segmentation(self, sentence) -> List[str]:
        """
        :param sentence:
        :return: word list
        """
        sen_clean = self.clean_data(sentence)
        if not sen_clean:
            return []
        word_pos = list(self.model.process(sen_clean))
        words = []
        for i, one_sentence in enumerate(word_pos):
            sentence_text = self.extract_one_sentence(one_sentence)
            results = self.extract_one_word(one_sentence, sentence_text)
            words.extend([res.word for res in results])
        return words
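# A minimal sketch of the extraction step UdpipeTrain relies on: corpy's
# Model.process() yields Sentence objects whose comments carry the
# "# text = ..." line that extract_one_sentence() recovers. The model path
# and input sentence are assumptions.
import re

from corpy.udpipe import Model

m = Model("pre-model/classical_chinese-kyoto-ud-2.5-191206.udpipe")  # assumed path
for sent in m.process("我爱北京天安门。"):
    comment = ''.join(sent.comments)
    text = re.findall(r'text = (.*)', comment)[0]
    print(text)  # the original sentence text
    print([(w.lemma, w.upostag) for w in sent.words if w.id > 0])  # skip the <root> word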
# for DE, EN, RU, LT
from corpy.udpipe import Model
from conllu import parse

data = []
# turn sentences into a list
with open('/Users/chenfish/Desktop/Thesis/Project/data/news_crawl/2m_de', 'rb') as f:
    for line in f.readlines():
        data.append(str(line, 'utf-8'))

model = 'german-hdt-ud-2.5-191206.udpipe'
m = Model('../udpipe_model/' + model)
print(model, "loaded successfully!")

all_pos = []
for line in data:
    sent_pos = []
    sents = list(m.process(line, out_format="conllu"))
    conllu = "".join(sents)
    parse_con = parse(conllu)
    # iterate over each word and append the POS into a list,
def __init__(self, udpipe_model_path):
    self.udpipe_model = Model(udpipe_model_path)
from corpy.udpipe import Model
from corpy.udpipe import pprint

m = Model("/home/zglg/SLU/psd/pre-model/classical_chinese-kyoto-ud-2.5-191206.udpipe")
sents = list(m.process("我爱北京天安门. 天安门上好风景"))
pprint(sents)