Example #1
def load_model(self):
    # Model file name pattern: <base name>-ud-<UD_VERSION>.udpipe
    extension = "-ud-{0}.udpipe".format(UD_VERSION)
    udpipe_model = udpipe_models[self.language] + extension
    model_file = os.path.join("models", udpipe_model)
    # Download the model from the LINDAT repository if it is not cached locally.
    if not os.path.exists(model_file):
        url = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3131/{0}".format(udpipe_model)
        print(url)
        wget.download(url, out="models")
    return Model(model_file)
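The method assumes a module-level udpipe_models mapping, a UD_VERSION constant, and the os, wget, and corpy.udpipe imports. A minimal self-contained sketch of that context; the mapping and version values below are assumptions, not taken from the original repo:

import os

import wget
from corpy.udpipe import Model

UD_VERSION = "2.5-191206"              # assumed UD release suffix
udpipe_models = {"en": "english-ewt"}  # hypothetical language -> model base name


class ModelLoader:
    def __init__(self, language):
        self.language = language

    def load_model(self):
        extension = "-ud-{0}.udpipe".format(UD_VERSION)
        udpipe_model = udpipe_models[self.language] + extension
        model_file = os.path.join("models", udpipe_model)
        if not os.path.exists(model_file):
            os.makedirs("models", exist_ok=True)  # ensure the cache dir exists
            url = ("https://lindat.mff.cuni.cz/repository/xmlui/bitstream/"
                   "handle/11234/1-3131/{0}".format(udpipe_model))
            wget.download(url, out="models")
        return Model(model_file)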
Example #2
from typing import List, Tuple

from corpy.udpipe import Model


class UDPipeTokenizer:
    def __init__(self, udpipe_model_path):
        self.udpipe_model = Model(udpipe_model_path)

    def tokenize(self, sentence: str) -> List[Tuple[str, str]]:
        """
        Return a list of (POS, DEP_REL) tag pairs, one per token in the sentence.
        """
        sentences = list(self.udpipe_model.process(sentence))
        # Skip the artificial <root> word that UDPipe prepends to each sentence.
        return [(word.upostag, word.deprel) for word in sentences[0].words
                if word.upostag != '<root>']
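Assuming a model file is available locally (the path below is illustrative), usage might look like:

tokenizer = UDPipeTokenizer("models/english-ewt-ud-2.5-191206.udpipe")
for pos, deprel in tokenizer.tokenize("UDPipe tags every token."):
    print(pos, deprel)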
Example #3
def __init__(self, language_name, pre_model_name, our_corpus_name):
    """
    The languages of pre_model_name and our_corpus_name must be identical!
    :param language_name:
    :param pre_model_name: a pre-trained model from UDPipe
    :param our_corpus_name: the corpus we collected ourselves
    """
    self.language_name = language_name
    self.pre_model_name = pre_model_name
    self.our_corpus_name = our_corpus_name
    try:
        self.store_data = StoreData(db_config['user'],
                                    db_config['password'],
                                    db_host=db_config['db_host'],
                                    db_name=db_config['db_name'])
        self.cursor = self.store_data.db_connect().cursor()
        # Load the pre-trained UDPipe model.
        self.model = Model(self.pre_model_name)
        self._word_count, self.MAX_WORD_COUNT = 0, 500000
    except Exception as ex:
        print('database connection error: %s' % ex)
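The constructor reads a db_config mapping that is not shown in the snippet; only its keys can be inferred from the calls above. A hypothetical shape with placeholder values:

# Hypothetical db_config assumed by the constructor above; only the keys
# (user, password, db_host, db_name) are taken from the snippet.
db_config = {
    'user': 'root',
    'password': 'secret',
    'db_host': 'localhost',
    'db_name': 'udpipe_pos',
}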
Example #4
def do_train(self) -> List[TResult]:
    """
    Run the pre-trained UDPipe model over our corpus and store the results.
    The UDPipe models can be downloaded here:
    https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131
    :return:
    """
    model = Model(self.pre_model_name)
    # Tag our corpus to get the POS of each word.
    line_no = 1
    for sen in self.load_data():
        sen_clean = self.clean_data(sen)
        if not sen_clean:
            continue
        word_pos = list(model.process(sen_clean))
        for i, one_sentence in enumerate(word_pos):
            sentence_text = self.extract_one_sentence(one_sentence)
            results = self.extract_one_word(one_sentence, sentence_text)
            self.store_data.insert_data(self.cursor, results, self.language_name)
            print('line %d, batch %d for %s written successfully' % (line_no, i, self.language_name))
        line_no += 1
    print('all lines written successfully for corpus %s' % self.our_corpus_name)
Example #5
from corpy.udpipe import Model


class UdpipeTagger:
    def __init__(self, file=None, **kwargs):
        if file:
            self.model = Model(file)
        else:
            raise ValueError("You must pass a model file")

    def get_pos_tag(self, word):
        sent = list(self.model.process(word))[0]

        # words[0] is the artificial <root> token, so a single-word input
        # should yield exactly two entries.
        if len(sent.words) != 2:
            print(word, sent.words)

        return sent.words[1].xpostag
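A quick usage sketch for the tagger (the model path is an assumption):

tagger = UdpipeTagger(file="models/russian-syntagrus-ud-2.5-191206.udpipe")
print(tagger.get_pos_tag("дом"))  # prints the language-specific (XPOS) tag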
Example #6
def _check_tokenizer(self, lang):
    """ Check if the tokenizer model is loaded, and load it if needed. """
    if lang not in self.models:
        model_path = os.path.join(FILE_PATH, "udpipe", "models",
                                  self._lang2tokenizer_modelname(lang))
        self.models[lang] = Model(model_path)
Example #7
def _check_model(self, lang):
    """ Check if model exists, is loaded, and load it if needed. """
    if lang not in self.models:
        model_path = os.path.join(FILE_PATH, 'udpipe', 'models',
                                  self._lang2modelname(lang))
        self.models[lang] = Model(model_path)
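Examples 6 and 7 share the same lazy-loading pattern: a dict of loaded models keyed by language, with Model constructed only on first use. A minimal self-contained sketch of the pattern; the class and naming scheme are illustrative, not from the original source:

import os

from corpy.udpipe import Model

FILE_PATH = os.path.dirname(os.path.abspath(__file__))


class LazyModelStore:
    def __init__(self):
        self.models = {}  # language code -> loaded Model

    def _lang2modelname(self, lang):
        # Hypothetical naming scheme; the real code maps codes to file names.
        return "%s-ud-2.5-191206.udpipe" % lang

    def get(self, lang):
        if lang not in self.models:  # construct the Model at most once per language
            model_path = os.path.join(FILE_PATH, "udpipe", "models",
                                      self._lang2modelname(lang))
            self.models[lang] = Model(model_path)
        return self.models[lang]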
Example #8
from text_to_data import Doc2Data
from text_to_data import CalculatePair
from model import train_model, get_data
from progress.bar import Bar
import os
from corpy.udpipe import Model
import stopwords
from random import randint
import numpy as np

m = Model("russian-syntagrus-ud-2.5-191206.udpipe")
stop = stopwords.get_stopwords('ru')
# postfix can be _sm for the demo corpus, _med for the second part, _all for the full corpus
postfix = '_all'

text_folder_name = 'texts' + postfix + '/'
data_folder_name = 'data' + postfix + '/'


def make_data_from_texts():
    # Walk the text folder and process every text into JSON.
    all_texts = os.listdir(text_folder_name)
    for text in Bar(' text parsing...').iter(all_texts):
        Doc2Data(text_folder_name + text, m, stop, data_folder_name)


def make_pairs(authors):
    all_texts = os.listdir(text_folder_name)
    texts = open('db' + postfix + '.csv', 'r').read().split('\n')[:authors]
    text = []
    for i in texts:
Example #9
from conllu import parse
from corpy.udpipe import Model


def _parse_line(m, line):
    """Run UDPipe on one line and return per-token (pos, head, deprel, form) lists."""
    sent_pos, sent_head, sent_dep, sent_tok = [], [], [], []

    # Process the line and re-parse the CoNLL-U output with the conllu library.
    conllu = "".join(m.process(line, out_format="conllu"))

    # Iterate over each word and collect the POS/HEAD/UD relation/form.
    for tokenlist in parse(conllu):
        for word in tokenlist:
            sent_pos.append(word['upostag'])
            sent_head.append(word['head'])
            sent_dep.append(word['deprel'])
            sent_tok.append(word['form'])

    return sent_pos, sent_head, sent_dep, sent_tok


def corpy_udpipe(text,
                 sent_level=True,
                 model='english-lines-ud-2.5-191206.udpipe'):

    m = Model('../udpipe_model/' + model)
    print(model, "loaded successfully!")

    all_pos, all_head, all_dep, all_tok = [], [], [], []

    if sent_level:
        # Sentence level: text is a flat list of sentences.
        for line in text:
            sent_pos, sent_head, sent_dep, sent_tok = _parse_line(m, line)
            all_pos.append(sent_pos)
            all_head.append(sent_head)
            all_dep.append(sent_dep)
            all_tok.append(sent_tok)

    else:
        # Document level: text is a list of documents, each a list of sentences.
        for doc in text:
            pos_per_doc, head_per_doc, dep_per_doc, tok_per_doc = [], [], [], []

            for line in doc:
                sent_pos, sent_head, sent_dep, sent_tok = _parse_line(m, line)
                pos_per_doc.append(sent_pos)
                head_per_doc.append(sent_head)
                dep_per_doc.append(sent_dep)
                tok_per_doc.append(sent_tok)

            all_pos.append(pos_per_doc)
            all_head.append(head_per_doc)
            all_dep.append(dep_per_doc)
            all_tok.append(tok_per_doc)

    return all_pos, all_head, all_dep, all_tok
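A hedged usage sketch, assuming the default model file exists under ../udpipe_model/ as the function's default argument expects:

sentences = ["This is the first sentence.", "And here is a second one."]
all_pos, all_head, all_dep, all_tok = corpy_udpipe(sentences)
print(all_tok[0])  # tokens of the first sentence
print(all_pos[0])  # their universal POS tags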
Example #11
class UdpipeTrain(ITrain):
    def __init__(self, language_name, pre_model_name, our_corpus_name):
        """
        The languages of pre_model_name and our_corpus_name must be identical!
        :param language_name:
        :param pre_model_name: a pre-trained model from UDPipe
        :param our_corpus_name: the corpus we collected ourselves
        """
        self.language_name = language_name
        self.pre_model_name = pre_model_name
        self.our_corpus_name = our_corpus_name
        try:
            self.store_data = StoreData(db_config['user'],
                                        db_config['password'],
                                        db_host=db_config['db_host'],
                                        db_name=db_config['db_name'])
            self.cursor = self.store_data.db_connect().cursor()

            # Load the pre-trained UDPipe model.
            self.model = Model(self.pre_model_name)

        except Exception as ex:
            print('database connection error: %s' % ex)

    def load_data(self) -> Iterator[str]:
        """Yield the corpus one sentence (line) at a time."""
        with open(self.our_corpus_name, 'r') as f:
            for sen in f:
                print('loading one sentence: %s' % (sen,))
                yield sen

        print('loading done for our corpus')

    def clean_data(self, data: str) -> str:
        """
        data is expected to be one or more sentences.

        Strip out newline and tab characters.

        :param data: raw data
        :return: data after cleaning
        """
        cleaned_data = re.sub('[\n\t]+', '', data)
        return cleaned_data

    def do_train(self) -> List[TResult]:
        """
        Run the pre-trained UDPipe model over our corpus and store the results.
        The UDPipe models can be downloaded here:
        https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131
        :return:
        """
        # Tag our corpus to get the POS of each word.
        line_no = 1
        for sen in self.load_data():
            sen_clean = self.clean_data(sen)
            if not sen_clean:
                continue
            word_pos = list(self.model.process(sen_clean))
            for i, one_sentence in enumerate(word_pos):
                sentence_text = self.extract_one_sentence(one_sentence)
                results = self.extract_one_word(one_sentence, sentence_text)
                self.store_data.insert_data(self.cursor, results, self.language_name)
                print('line %d, batch %d for %s written successfully' % (line_no, i, self.language_name))
            line_no += 1
        print('all lines written successfully for corpus %s' % self.our_corpus_name)

    def extract_one_sentence(self, sentence) -> str:
        """
        Extract the raw sentence text from a UDPipe Sentence's comments.

        An instance of a UDPipe Sentence looks like:
        Sentence(
            comments=[
              '# sent_id = 3',
              '# text = 黄土高原严寒而漫长的冬天看来就要过去,但那真正温暖的春天还远远地没有到来。'],
            words=[
              Word(id=0, <root>),
              Word(id=1,
                   form='黄土',
                   lemma='黄土',
                   xpostag='NNP',
                   upostag='PROPN',
                   head=3,
                   deprel='nmod',
                   misc='SpaceAfter=No'),
              Word(id=2,
                   form='高原',
                   lemma='高原',
                   xpostag='NN',
                   upostag='NOUN',
                   head=3,
                   deprel='nmod',
                   misc='SpaceAfter=No'),
              Word(id=3,
                   form='严寒',
                   lemma='严寒',
                   xpostag='NN',
                   upostag='NOUN',
                   head=22,
                   deprel='nsubj',
                   misc='SpaceAfter=No'),

              remaining words omitted ])

        :param sentence: a UDPipe Sentence
        :return: str, e.g. 黄土高原严寒而漫长的冬天看来就要过去,但那真正温暖的春天还远远地没有到来。
        """
        comment = ''.join(sentence.comments)
        try:
            return re.findall(r'text = (.*)', comment)[0]
        except Exception as e:
            # TODO: need to write a warning log
            print('error: could not find the sentence text', e)
            return ''

    def extract_one_word(self, sentence, sentence_text: str) -> List[TResult]:
        """
        Extract each word and its POS tag from a sentence.

        :param sentence_text:
        :param sentence:
        :return: List[TResult]
        """
        r = []
        for word in sentence.words:
            if (word.lemma and word.lemma not in ITrain.FILTER_WORD
                    and word.upostag and sentence_text):
                r.append(TResult(word.lemma, word.upostag, sentence_text))
        return r

    def word_segmentation(self, sentence) -> List[str]:
        """
        :param sentence:
        :return: word list
        """
        sen_clean = self.clean_data(sentence)
        if not sen_clean:
            return []
        word_pos = list(self.model.process(sen_clean))
        words = []
        for i, one_sentence in enumerate(word_pos):
            sentence_text = self.extract_one_sentence(one_sentence)
            results = self.extract_one_word(one_sentence, sentence_text)
            words.extend([res.word for res in results])
        return words
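A hedged usage sketch for the class, assuming db_config is set up and the model and corpus files exist (both paths here are illustrative):

trainer = UdpipeTrain('chinese',
                      'pre-model/classical_chinese-kyoto-ud-2.5-191206.udpipe',
                      'corpus/our_corpus.txt')
trainer.do_train()  # tag the whole corpus and write the results to the database
print(trainer.word_segmentation('黄土高原严寒而漫长的冬天看来就要过去。'))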
Example #12
# for DE, EN, RU, LT

from corpy.udpipe import Model
from conllu import parse

data = []

# read the sentences into a list
with open('/Users/chenfish/Desktop/Thesis/Project/data/news_crawl/2m_de',
          'rb') as f:
    for line in f:
        data.append(str(line, 'utf-8'))

model = 'german-hdt-ud-2.5-191206.udpipe'

m = Model('../udpipe_model/' + model)
print(model, "loaded successfully!")

all_pos = []

for line in data:
    #print(line)
    sent_pos = []

    sents = list(m.process(line, out_format="conllu"))

    conllu = "".join(sents)
    parse_con = parse(conllu)

    # iterate over each word and append the POS into a list,
Example #14
from corpy.udpipe import Model
from corpy.udpipe import pprint


m = Model("/home/zglg/SLU/psd/pre-model/classical_chinese-kyoto-ud-2.5-191206.udpipe")

# process() yields UDPipe Sentence objects; corpy's pprint renders them readably.
sents = list(m.process("我爱北京天安门. 天安门上好风景"))
pprint(sents)
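As Examples 9 and 12 show, process() can also emit CoNLL-U directly via out_format; a minimal sketch reusing the model loaded above:

# Ask UDPipe for a CoNLL-U string instead of Sentence objects.
conllu = "".join(m.process("我爱北京天安门。", out_format="conllu"))
print(conllu)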