Example no. 1
    def parse(self, text):
        # Lazy load model file to speed up startup
        if not self.model:
            self.model = self.load_model(self.language)

        text = text.strip()

        # Adding a period improves detection on especially short sentences
        period_added = False
        last_character = text.strip()[-1]
        if re.match(r"\w", last_character, flags=re.UNICODE):
            text += "."
            period_added = True

        pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT,
                            Pipeline.DEFAULT, "conllu")
        error = ProcessingError()

        processed = pipeline.process(text, error)
        if error.occurred():
            raise ParserException(error.message)

        # Remove the period to make sure input corresponds to output
        if period_added:
            processed = "\n".join(processed.rstrip().split("\n")[:-1]) + "\n\n"

        return processed
Example no. 2
    def process(self, text: 'str') -> 'Scene':
        """
        Processes the description and builds a scene based on it.

        Parameters
        ----------
        text : str
            The description of the scene.

        Returns
        -------
        Scene
            The scene described by the text.
        """
        text_preprocessed = self._preprocess(text)

        pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT,
                            Pipeline.DEFAULT, "conllu")
        error = ProcessingError()
        processed = pipeline.process(text_preprocessed, error)
        parsed = conllu.parse(processed)

        scene = self._traverse_tree(parsed)

        return scene
Example no. 3
    def parse(self, text):
        # Lazy load model file to speed up startup
        if not self.model:
            self.model = self.load_model()

        text = text.strip()

        # Adding a period improves detection on especially short sentences
        period_added = False
        last_character = text.strip()[-1]
        if re.match(r"\w", last_character, flags=re.UNICODE):
            text += "."
            period_added = True

        pipeline = Pipeline(
            self.model,
            "tokenize",
            Pipeline.DEFAULT,
            Pipeline.DEFAULT,
            "conllu"
        )
        error = ProcessingError()

        processed = pipeline.process(text, error)
        if error.occurred():
            raise ParserException(error.message)

        # Remove the period to make sure input corresponds to output
        if period_added:
            processed = "\n".join(processed.rstrip().split("\n")[:-1]) + "\n\n"

        return processed
Example no. 4
def udpipeS(pathmodel, sourcepath, pathdestination):
    model = Model.load(pathmodel)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % pathmodel)
        sys.exit(1)
    sys.stderr.write('done\n')

    pipeline = Pipeline(model, "horizontal", Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    error = ProcessingError()

    i = 1
    for filename in os.listdir(sourcepath):
        f = open(pathdestination + filename[:-3] + "conllu", "w", encoding="utf-8")

        text = io.open(sourcepath + filename, "r", encoding="utf-8")
        string = "".join(text.readlines())
        # Process data
        processed = pipeline.process(string, error)
        if error.occurred():
            sys.stderr.write("An error occurred when running run_udpipe: ")
            sys.stderr.write(error.message)
            sys.stderr.write("\n")
            sys.exit(1)
        f.write(processed)
        f.close()

        print("File n ", i, " processed of ", len(os.listdir(sourcepath)))
        i += 1
Example no. 5
def tag_ud(text='Текст нужно передать функции в виде строки!',
           modelfile='udpipe_syntagrus.model'):
    model = Model.load(modelfile)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT,
                        'conllu')
    # Process the text; the result is returned in CoNLL-U format
    processed = pipeline.process(text)
    # Skip lines with service information (comments)
    output = [l for l in processed.split('\n') if not l.startswith('#')]
    # Extract the lemma and tag from the processed text
    tagged = [
        w.split('\t')[2].lower() + '_' + w.split('\t')[3] for w in output if w
    ]
    # tagged_propn = []
    # propn  = []
    # for t in tagged:
    #	if t.endswith('PROPN'):
    #		if propn:
    #			propn.append(t)
    #		else:
    #			propn = [t]
    #	else:
    #		if len(propn) > 1:
    #			for x in propn:
    #				#name = '::'.join([x.split('_')[0] for x in propn]) + '_PROPN'
    #				tagged_propn.append(x)
    #		elif len(propn) == 1:
    #			tagged_propn.append(propn[0])
    #		tagged_propn.append(t)
    #		propn = []
    return tagged
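
A minimal usage sketch of the function above; the model path and the sample sentence are illustrative assumptions, and the exact lemmas depend on the model:

# Hypothetical call: 'udpipe_syntagrus.model' must exist locally
lemmas = tag_ud(text='Мама мыла раму.', modelfile='udpipe_syntagrus.model')
print(lemmas)  # each element has the form 'lemma_UPOS', e.g. 'мама_NOUN'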
Example no. 6
def udpipeG(pathmodel):
    path = "/home/guido/Progetto Unitexto/textdata/cleanedTxt/"
    model = Model.load(pathmodel)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % pathmodel)
        sys.exit(1)
    sys.stderr.write('done\n')

    pipeline = Pipeline(model, "horizontal", Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    error = ProcessingError()
    #   corp = io.open("/home/guido/Progetto Unitexto/textdata/corpus.txt","r",encoding= "utf-8")
    # Read whole input
    #  string="".join(corp.readlines())

    # Process data
    # processed = pipeline.process(string, error)

    f = open("/home/guido/Progetto Unitexto/textdata/corpus.conllu", "a")
    f.truncate(0)
    i = 1
    for filename in os.listdir(path):
        text = io.open(path + filename, "r", encoding="utf-8")
        string = "".join(text.readlines())
        # Process data
        processed = pipeline.process(string, error)
        if error.occurred():
            sys.stderr.write("An error occurred when running run_udpipe: ")
            sys.stderr.write(error.message)
            sys.stderr.write("\n")
            sys.exit(1)
        f.write(processed)

        print("File n ", i, " processed of ", len(os.listdir(path)))
        i += 1
Example no. 7
    def load_file(self, name, filename, lang):

        if filename not in tronco_special_files:
            filename_dir = os.path.join(root_path, "corpora", name, filename)
            if lang not in self.models:
                self.models[lang] = Model.load(
                    os.path.join(root_path, "udpipe",
                                 udpipe_models[lang]['path']))
            pipeline = Pipeline(self.models[lang], "tokenize",
                                Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
            with open(filename_dir) as f:
                try:
                    text = f.read().splitlines()
                except:
                    return False

            raw_text = []
            metadata = {'filename': filename}
            for x in text:
                if x.strip().startswith("# ") and " = " in x:
                    key, value = x.split(" = ", 1)
                    metadata[key.split("# ", 1)[1]] = value
                else:
                    raw_text.append(x)

            if name not in self.files:
                self.files[name] = {}
            self.files[name][filename] = pipeline.process(
                "\n".join(raw_text)).replace("# newdoc\n",
                                             "").replace("# newpar\n", "")
            if name not in self.metadata:
                self.metadata[name] = {}
            self.metadata[name][filename] = metadata
Example no. 8
def udpipe(sentences, model_name, verbose=False):
    """
    Parse text to Universal Dependencies using UDPipe.
    :param sentences: iterable of iterables of strings (one string per line)
    :param model_name: filename containing UDPipe model to load
    :param verbose: print extra information
    :return: iterable of lines containing parsed output
    """
    from ufal.udpipe import Model, Pipeline, ProcessingError
    model = Model.load(model_name)
    if not model:
        raise ValueError("Invalid model: '%s'" % model_name)
    pipeline = Pipeline(model, "conllu", Pipeline.DEFAULT, Pipeline.DEFAULT,
                        "conllu")
    lines1, lines2 = tee(l for s in sentences for l in s)
    text = "\n".join(lines1)
    error = ProcessingError()
    num_tokens = sum(1 for l in lines2 if l)
    with ioutil.external_write_mode():
        print("Running %s on %d tokens... " % (model_name, num_tokens),
              end="",
              flush=True)
    start = time()
    processed = pipeline.process(text, error)
    duration = time() - start
    with ioutil.external_write_mode():
        print("Done (%.3fs, %.0f tokens/s)" %
              (duration, num_tokens / duration if duration else 0))
        if verbose:
            print(processed)
    if error.occurred():
        raise RuntimeError(error.message)
    return processed.splitlines()
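
A usage sketch for the helper above. The file and model names are illustrative assumptions; since the pipeline's input format is "conllu", each sentence is passed as its CoNLL-U lines:

# Hypothetical input: split an existing CoNLL-U file into sentence blocks,
# keeping one trailing empty line per sentence so boundaries survive the join.
with open("input.conllu", encoding="utf-8") as f:
    sentences = [block.splitlines() + [""]
                 for block in f.read().split("\n\n") if block.strip()]
parsed_lines = udpipe(sentences, "english-ud-2.0-170801.udpipe", verbose=True)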
Example no. 9
def run(model_file, text_file):
    print('Loading model...')
    model = Model.load(model_file)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT,
                        'conllu')
    print('Reading corpus...')
    with open(text_file) as f:
        text = f.read()

    print('Analyzing text...')
    processed = pipeline.process(text)

    print('Extracting phrases...')
    phrases = []
    sent = []
    for line in tqdm((processed + '#').splitlines()):
        if line.startswith('#') and len(sent):
            preps = get_preps(sent)
            for prep, dep_id in preps.values():
                pphrase = get_phrase(prep, dep_id, sent)
                phrases.append(pphrase)
            sent.clear()
        elif len(line) > 1:
            try:
                sent.append(Token(line.split('\t')))
            except ValueError:
                continue
    print('Done!')
    return phrases
Example no. 10
def tokenize_and_tag_texts(dict_texts):
    eng_model = Model.load('english-partut-ud-2.5-191206.udpipe')
    fr_model = Model.load('french-partut-ud-2.5-191206.udpipe')
    eng_pipeline = Pipeline(eng_model, 'generic_tokenizer', '', '', '')
    fr_pipeline = Pipeline(fr_model, 'generic_tokenizer', '', '', '')
    for language_key, primal_texts in dict_texts.items():
        tokenized_tagged_eng_text = eng_pipeline.process(primal_texts[1])
        tokenized_tagged_fr_text = fr_pipeline.process(primal_texts[2])
    dict_tokenized_tagged_texts = {
        'eng': tokenized_tagged_eng_text,
        'fr': tokenized_tagged_fr_text
    }
    # print(tokenized_tagged_eng_text)
    # print(tokenized_tagged_fr_text)
    # print(dict_tokenized_tagged_texts)
    return dict_tokenized_tagged_texts
Example no. 11
def make_conll_with_udpipe(text):
    # Specify the path to the model here
    model_path = os.path.join(os.getcwd(), 'udparsers',
                              'russian-syntagrus-ud-2.5-191206.udpipe')
    model = Model.load(model_path)
    pipeline = Pipeline(model, 'tokenizer=ranges', Pipeline.DEFAULT,
                        Pipeline.DEFAULT, 'conllu')
    return pipeline.process(text)
Example no. 12
def make_conll_with_udpipe(text, language='german'):
    if language == 'german':
        model_path = path.join('..', '..', 'udpipe',
                               'german-ud-2.0-170801.udpipe')
    model = Model.load(model_path)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT,
                        'conllu')
    return pipeline.process(text)
Example no. 13
    def wordToInf(self, text):
        process_pipeline = Pipeline(self.modelForInfinitive, 'tokenize',
                                    Pipeline.DEFAULT, Pipeline.DEFAULT,
                                    'conllu')
        # Line index 4 is the first token line of the CoNLL-U output
        # (after the # newdoc, # newpar, # sent_id and # text comments)
        wordInfo = process_pipeline.process(text).split('\n')[4].split('\t')
        if wordInfo[3] == 'NUM':
            return '_NUM_' + ('x' * len(wordInfo[2]))
        else:
            return wordInfo[2]
Example no. 14
class SyntaxParser(PreProcesser):
    
    def __init__(self, model_path):
        
        self.parser_model = Model.load(model_path)
        self.parser_pipeline = Pipeline(self.parser_model, 'conllu', Pipeline.NONE, Pipeline.DEFAULT, 'conllu')
    
    def transform_item(self, x):
        return self.parser_pipeline.process(x, ProcessingError())
Example no. 15
    def get(modelAdd, text):

        from ufal.udpipe import Model, Pipeline, ProcessingError

        error = ProcessingError()
        model = Model.load(modelAdd)
        pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
        parsedArticle = pipeline.process(text, error)

        return parsedArticle
Example no. 16
    def run_udpipe(self, path_to_model, sents=None):
        if sents is None:
            sents = self.sents
        verticals = self._to_vertical(sents)
        model = Model.load(path_to_model)
        pipeline = Pipeline(model, "vertical", Pipeline.DEFAULT,
                            Pipeline.DEFAULT, "conllu")
        error = ProcessingError()
        conllu = pipeline.process(verticals, error)

        return conllu
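
The method above relies on a _to_vertical helper that is not shown. A possible sketch, assuming self.sents is a list of token lists; UDPipe's "vertical" input expects one token per line with a blank line between sentences:

    def _to_vertical(self, sents):
        # One token per line, blank line between sentences ("vertical" format)
        return "\n\n".join("\n".join(tokens) for tokens in sents) + "\n"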
Example no. 17
def extract_sentences(input_file: str, output_file: str, logger) -> None:

    logger.info(
        "==== Now performing sentence extraction from paragraphs file ====")
    # UDPipe initialization
    lang_model = 'lang_models/czech-ud-2.0-170801.udpipe'
    model = Model.load(lang_model)
    if not model:
        logger.error('Could not load UDPipe language model: ' + lang_model)
    ud_pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT,
                           Pipeline.DEFAULT, '')
    ud_error = ProcessingError()

    sentences_file = open(output_file, "w")
    # reopen paragraphs for reading
    paragraphs_file = open(input_file, "r")

    sentences_count = 0

    for p_line in paragraphs_file:
        page_first_sentence = ""
        page_first_paragraph = p_line.split(
            '\t', 1)  # use the variable as temporary list

        # If there is a paragraph content
        if len(page_first_paragraph) == 2:
            page_uri = page_first_paragraph[0]
            page_first_paragraph = page_first_paragraph[1]
            # Extract first sentence from paragraph using UDPipe:
            ud_output = ud_pipeline.process(page_first_paragraph, ud_error)
            if ud_error.occurred():
                logger.error(
                    'Error occurred while extracting sentence using UDPipe: ' +
                    ud_error.message)
                page_first_sentence = ""
            else:
                ud_output = ud_output.split('\n')
                if len(ud_output) >= 4:
                    page_first_sentence = ud_output[3][
                        9:]  # assumption about the output format
                else:
                    page_first_sentence = ""

            # Write sentence to the file
            sentences_file.write(page_uri + '\t' + page_first_sentence + '\n')

            sentences_count += 1
            if sentences_count % 2000 == 0:
                logger.info("Extracted {} sentences.".format(sentences_count))

    logger.info("Finished extraction of {} sentences.".format(sentences_count))

    paragraphs_file.close()
    sentences_file.close()
Example no. 18
    def process_sentence(self, sen, field_names=None):
        pipeline = Pipeline(self._model, self._inp_format, self._pos_settings, self._parse_settings, 'conllu')
        error = ProcessingError()  # For catching errors...

        inp_sen = ''.join(self._encode_sentence(sen, field_names))
        # Do the processing... + Write the output in CoNLL-U
        processed = pipeline.process(inp_sen, error)

        if error.occurred():
            raise UDPipeError(error.message)

        ret_sen = self._decode_sentence(processed, sen, field_names)
        return ret_sen
Example no. 19
def parse(sentence):
    sys.argv.append('tokenize')
    sys.argv.append('conllu')
    sys.argv.append('russian-syntagrus-ud-2.4-190531.udpipe')
    model = Model.load(sys.argv[3])
    pipeline = Pipeline(model, sys.argv[1], Pipeline.DEFAULT, Pipeline.DEFAULT,
                        sys.argv[2])
    error = ProcessingError()
    # small preprocessing step
    sentence = re.sub('«', '« ', sentence)
    sentence = re.sub('»', '» ', sentence)
    parsed = pipeline.process(sentence, error)
    print(parsed)
    return parsed
Example no. 20
    def tag_parse_tree(self, root):
        """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized)."""
        pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
        in_data = " ".join([n.form for n in root.descendants])
        out_data = pipeline.process(in_data, self.error)
        if self.error.occurred():
            raise IOError("UDPipe error " + self.error.message)
        self.conllu_reader.files.filehandle = io.StringIO(out_data)
        parsed_root = self.conllu_reader.read_tree()
        nodes = [root] + root.descendants
        for parsed_node in parsed_root.descendants:
            node = nodes[parsed_node.ord]
            node.parent = nodes[parsed_node.parent.ord]
            for attr in 'upos xpos lemma feats'.split():
                setattr(node, attr, getattr(parsed_node, attr))
Example no. 21
class HabrPostagging:
    def __init__(self):
        # https://github.com/jwijffels/udpipe.models.ud.2.0/blob/master/inst/udpipe-ud-2.0-170801/russian-ud-2.0-170801.udpipe
        self.model = Model.load("russian-ud-2.0-170801.udpipe")
        self.pipeline = Pipeline(self.model, 'generic_tokenizer', '', '', '')
        self.reset_counter()

    def reset_counter(self):
        self.__pos_couter = {}

    def get_counter(self):
        return self.__pos_couter

    def __update_counter(self, pos):
        if pos in self.__pos_couter:
            self.__pos_couter[pos] += 1
        else:
            self.__pos_couter[pos] = 1

    def tag_file(self, input_file, output_file):
        text = pickle.load(open(input_file, 'rb'))['text']
        # The Python wrapper for UDPipe is actually rather buggy, but it handled all of our data correctly.
        # As far as I know, there are no other parsers capable of handling Russian.
        parsed = self.pipeline.process(text)
        parsed = parse(parsed)
        with open(output_file, 'w', encoding='utf-8') as f:
            for sentence in parsed:
                for word in sentence:
                    self.__update_counter(word['upos'])
                    f.write('\t'.join([
                        word['form'], word['lemma'], word['upos'],
                        str(word['feats'])
                    ]) + '\n')
                f.write('\n')

    def tag_files(self, files, input_dir, output_dir, log=False):
        for filename in files:
            input_file = os.path.join(input_dir, filename)
            if os.path.isfile(input_file):
                if log:
                    print(input_file)
                output_file = os.path.join(output_dir, filename)
                output_file = os.path.splitext(output_file)[0] + '.tsv'
                self.tag_file(input_file, output_file)

    def tag_dir(self, input_dir, output_dir, log=False):
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        self.tag_files(os.listdir(input_dir), input_dir, output_dir, log)
Example no. 22
def process_text(path_to_file, input_format, model, out_file):
    """ Apply NLP processing to text (tokenize, tag, parse) and 
    save output in CONLLU format.
    """
    pipeline = Pipeline(model, input_format, Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')  
    error = ProcessingError()
    with codecs.open(path_to_file) as f:
        text = f.read()
        processed = pipeline.process(text, error)
        if error.occurred():
            print("Error when running UDPipe: ")
            print(error.message)
            print("\n")
            sys.exit(1)
        with codecs.open(out_file, 'w', 'utf-8') as of:
            of.write(processed)
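
A usage sketch for process_text; the model and file paths are illustrative assumptions:

from ufal.udpipe import Model

model = Model.load('english-ud-2.0-170801.udpipe')
process_text('input.txt', 'tokenize', model, 'output.conllu')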
Example no. 23
class PredPattArgumentExtractor(ArgumentExtractor):
    def __init__(
        self,
        path_to_udpipe: str,
        resolve_relcl: bool = True,
        resolve_appos: bool = True,
        resolve_amod: bool = True,
        resolve_conj: bool = True,
        resolve_poss: bool = True,
        ud=dep_v2.VERSION,
    ):
        super().__init__()
        self.model = Model.load(path_to_udpipe)
        self.pipeline = Pipeline(
            self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu"
        )
        self._error = ProcessingError()
        self._opts = PredPattOpts(
            resolve_relcl=resolve_relcl,
            resolve_appos=resolve_appos,
            resolve_amod=resolve_amod,
            resolve_conj=resolve_conj,
            resolve_poss=resolve_poss,
            ud=ud,
        )

    @lru_cache(maxsize=100000)
    def extract(self, sentence: str) -> List[Dict[str, Any]]:
        processed = self.pipeline.process(sentence, self._error)
        if self._error.occurred():
            print(f"=== Error occurred: {self._error.message}")
            self._error = ProcessingError()
            return None
        else:
            conll_example = [ud_parse for sent_id, ud_parse in load_conllu(processed)][
                0
            ]
            ppatt = PredPatt(conll_example, opts=self._opts)
            result = []
            for predicate in ppatt.instances:
                structure = {
                    "predicate": predicate.tokens,
                    "arguments": [x.tokens for x in predicate.arguments],
                }
                result.append(structure)

            return result
Example no. 24
def parse_udpipe(passages, model_name, verbose=False):
    from ufal.udpipe import Model, Pipeline, ProcessingError
    model = Model.load(model_name)
    if not model:
        raise ValueError("Invalid model: '%s'" % model_name)
    pipeline = Pipeline(model, "conllu", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    passages1, passages2 = tee(passages)
    text = "\n".join(l for p in passages1 for l in to_conllu(p, tree=True))
    error = ProcessingError()
    print("Running UDPipe on %d characters... " % len(text), end="", flush=True)
    start = time()
    processed = pipeline.process(text, error)
    print("Done (%.3fs)" % (time() - start))
    if verbose:
        print(processed)
    if error.occurred():
        raise RuntimeError(error.message)
    return zip(passages2, from_conllu(processed.splitlines(), passage_id=None))
Example no. 25
    def tag_parse_tree(self, root):
        """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized)."""
        descendants = root.descendants
        if not descendants:
            return
        pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
        in_data = " ".join([n.form for n in descendants])
        out_data = pipeline.process(in_data, self.error)
        if self.error.occurred():
            raise IOError("UDPipe error " + self.error.message)
        self.conllu_reader.files.filehandle = io.StringIO(out_data)
        parsed_root = self.conllu_reader.read_tree()
        nodes = [root] + descendants
        for parsed_node in parsed_root.descendants:
            node = nodes[parsed_node.ord]
            node.parent = nodes[parsed_node.parent.ord]
            for attr in 'upos xpos lemma feats'.split():
                setattr(node, attr, getattr(parsed_node, attr))
Example no. 26
class UDPipe:
    def __init__(self):
        print('Loading model: ')
        model_path = r"D:\py_projects\IWonnaBook\udpipe_syntagrus.model"
        self.model = Model.load(model_path)
        if not self.model:
            print('Модель не загружена :(')
            sys.exit(1)
        self.pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
        print('done\n')

    def get_sintax(self, text):
        error = ProcessingError()
        processed = self.pipeline.process(text, error)
        if error.occurred():
            print("An error occurred when running run_udpipe: ")
            print(error.message)
            print("\n")
        return processed
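
A usage sketch; the sample sentence is an illustrative assumption (the model path is hard-coded in __init__):

udp = UDPipe()
conllu_output = udp.get_sintax('Мама мыла раму.')
print(conllu_output)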
Example no. 27
def process_user_text_task(input_text=''):
    if input_text:
        from conllu import parse
        from ufal.udpipe import Model, Pipeline
        from error_search.process_text import process_text
        import os
        if not os.path.exists(
                'error_search/russian-syntagrus-ud-2.0-170801.udpipe'):
            boilerplate.fget_file(
                'upload/russian-syntagrus-ud-2.0-170801.udpipe',
                'error_search/russian-syntagrus-ud-2.0-170801.udpipe')
        ud_model = Model.load(
            'error_search/russian-syntagrus-ud-2.0-170801.udpipe')
        pipeline = Pipeline(ud_model, 'tokenize', Pipeline.DEFAULT,
                            Pipeline.DEFAULT, 'conllu')
        out = pipeline.process(input_text)
        tree = parse(out)

        return process_text(tree)
Example no. 28
def udpipe(conllu_in, model_path):

    model = Model.load(model_path)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % sys.argv[3])
        sys.exit(1)

    pipeline = Pipeline(model, "conllu", Pipeline.DEFAULT, Pipeline.DEFAULT,
                        "conllu")
    error = ProcessingError()

    # Process data
    processed = pipeline.process(conllu_in, error)
    if error.occurred():
        sys.stderr.write("An error occurred when running run_udpipe: ")
        sys.stderr.write(error.message)
        sys.stderr.write("\n")
        sys.exit(1)

    return processed
Example no. 29
def main(args):
    model = Model.load(args.model)
    if not model:
        raise ValueError("Invalid model: '%s'" % args.model)
    os.makedirs(args.out_dir, exist_ok=True)
    pipeline = Pipeline(model, "tokenize" if args.txt else "conllu",
                        Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    for pattern in args.filenames:
        for in_file in glob(pattern) or [pattern]:
            basename = os.path.basename(in_file)
            out_file = os.path.join(args.out_dir,
                                    os.path.splitext(basename)[0] + ".conllu")
            error = ProcessingError()
            with open(in_file, encoding="utf-8") as f:
                processed = pipeline.process(f.read(), error)
            if error.occurred():
                raise RuntimeError(error.message)
            with open(out_file, "w", encoding="utf-8") as f:
                f.write(processed)
            if not args.quiet:
                print("Wrote '%s'" % out_file)
Example no. 30
class SyntaxParser:
    def __init__(self, speller: Optional[Speller] = None):
        self.udpipe_model = Model.load(
            sber_encode(sber_path('/var/sberbank/udpipe_syntagrus.model')))
        self.process_pipeline = Pipeline(self.udpipe_model,
                                         sber_encode('tokenize'),
                                         Pipeline.DEFAULT, Pipeline.DEFAULT,
                                         sber_encode('conllu'))

        if speller is None:
            speller = Speller()
        self.speller: Speller = speller

    def get_syntax(self, text):
        processed = self.process_pipeline.process(sber_encode(text))
        content = [
            l for l in sber_decode(processed).split('\n')
            if not l.startswith('#')
        ]
        tagged = [w.split('\t') for w in content if w]
        return SyntaxTree(tagged, self.speller)
Example no. 31
class UDParser(object):

    models = {
        "en":
        "UniversalPetrarch/preprocessing/udpipe-1.2.0/model/english-ud-2.0-170801.udpipe",
        "es":
        "UniversalPetrarch/preprocessing/udpipe-1.2.0/model/spanish-ancora-ud-2.0-170801.udpipe",
        "ar": ""
    }

    pipeline = None
    error = ProcessingError()
    model = None

    def __init__(self, lang="en"):
        model_file = "/Users/sxs149331/PycharmProjects/UniversalPetrarch-master/" + self.models[
            lang]
        print(model_file)
        self.model = Model.load(model_file)
        if not self.model:
            sys.stderr.write("Model Loading Failed")
            sys.exit(1)
        sys.stderr.write('done\n')
        self.pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT,
                                 Pipeline.DEFAULT, "conllu")

    def parse(self, text):
        #print self.pipeline
        processed = self.pipeline.process(text.strip(), self.error)
        if self.error.occurred():
            raise ValueError(self.error.message)

        lines = processed.split("\n")
        result = []
        for line in lines:
            if line.startswith("#"):
                continue
            result.append(line)

        return ("\n").join(result)
Example no. 32
class Solver(object):

    def __init__(self, seed=42):

        self.morph = pymorphy2.MorphAnalyzer()
        self.model = Model.load("data/udpipe_syntagrus.model".encode())
        self.process_pipeline = Pipeline(self.model, 'tokenize'.encode(), Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu'.encode())
        self.seed = seed
        self.init_seed()
        self.paronyms = self.get_paronyms()
        self.freq_bigrams = self.open_freq_grams()

    def init_seed(self):
        return random.seed(self.seed)

    def open_freq_grams(self):
        with open('data/bigrams_lemmas.pickle', 'rb') as inputfile:
            counts = pickle.load(inputfile)
        return counts

    def get_paronyms(self):
        paronyms = []
        with open('data/paronyms.csv', 'r', encoding='utf-8') as in_file:
            for line in in_file.readlines():
                pair = line.strip(punctuation).split('\t')
                paronyms.append(pair)
        return paronyms

    def lemmatize(self, token):
        token_all = self.morph.parse(token.lower().rstrip('.,/;!:?'))[0]
        lemma = token_all.normal_form
        return lemma

    def find_closest_paronym(self, par):
        paronyms = set()
        for par1, par2 in self.paronyms:
            paronyms.add(par1)
            paronyms.add(par2)
        try:
            closest = get_close_matches(par, list(paronyms))[0]
        except IndexError:
            closest = None
        return closest

    def check_pair(self, token_norm):
        paronym = None
        for p1, p2 in self.paronyms:
            if token_norm == p1:
                paronym = p2
                break
            if token_norm == p2:
                paronym = p1
                break
        return paronym

    def find_paronyms(self, token):
        token_all = self.morph.parse(token.lower().rstrip('.,/;!:?'))[0]
        token_norm = token_all.normal_form
        paronym = self.check_pair(token_norm)

        if paronym is None:
            paronym_close = self.find_closest_paronym(token_norm)
            paronym = self.check_pair(paronym_close)

        if paronym is not None:
            paronym_parse = self.morph.parse(paronym)[0]
            try:
                str_grammar = str(token_all.tag).split()[1]
            except IndexError:
                str_grammar = str(token_all.tag)

            gr = set(str_grammar.replace("Qual ", "").replace(' ', ',').split(','))
            try:
                final_paronym = paronym_parse.inflect(gr).word
            except AttributeError:
                final_paronym = paronym
        else:
            final_paronym = ''
        return final_paronym

    def syntax_parse(self, some_text, token):
        processed = self.process_pipeline.process(some_text.lower().encode())
        content = [l for l in processed.decode().split('\n') if not l.startswith('#')]
        tagged = [w.split('\t') for w in content if w]

        linked_word = ''
        for analysis in tagged:
            if analysis[1] == token:
                head = analysis[6]
                if head == '0':
                    root_id = analysis[0]
                    for analysis in tagged:
                        if analysis[6] == root_id:
                            linked_word = analysis[1]
                            break
                else:
                    for analysis in tagged:
                        if analysis[0] == head:
                            linked_word = analysis[1]
                            break
        return linked_word

    def check_frequencies(self, sentences):
        examples = []
        for token, second_tok, line in sentences:
            token = token.lower().rstrip('.,;:!?')
            token_lemma = self.lemmatize(token)
            second_lemma = self.lemmatize(second_tok)

            collocation_word = self.syntax_parse(line, token)
            collocation_lemma = self.lemmatize(collocation_word)

            first = (collocation_lemma, token_lemma)
            second = (collocation_lemma, second_lemma)

            freq1 = self.freq_bigrams[first]
            freq2 = self.freq_bigrams[second]

            first = (token_lemma, collocation_lemma)
            second = (second_lemma, collocation_lemma)
            freq3 = self.freq_bigrams[first]
            freq4 = self.freq_bigrams[second]

            freq_first = freq1 + freq3
            freq_second = freq2 + freq4

            if freq_second > freq_first:
                return second_tok
            if freq_first == freq_second:
                examples.append((0, freq_first, freq_second, token, second_tok))

        good_paronym = ''
        if examples:
            good_paronym = examples[0][4]
        return good_paronym

    def predict(self, task):
        return self.predict_from_model(task)

    def fit(self, tasks):
        pass

    def load(self, path="data/models/solver5.pkl"):
        pass

    def save(self, path="data/models/solver5.pkl"):
        pass

    def predict_from_model(self, task):
        description = task["text"].replace('НЕВЕРНО ', "неверно ")
        sents = []
        for line in description.split("\n"):
            for token in line.split():
                if token.isupper() and len(token) > 2:  # get CAPS paronyms
                    second_pair = self.find_paronyms(token)
                    sents.append((token, second_pair, line))
        result = self.check_frequencies(sents)
        return result.strip(punctuation + '\n')