Example #1
1
    def parse(self, text):
        # Lazy load model file to speed up startup
        if not self.model:
            self.model = self.load_model(self.language)

        text = text.strip()

        # Appending a period improves sentence detection on very short inputs
        period_added = False
        if text and re.match(r"\w", text[-1], flags=re.UNICODE):
            text += "."
            period_added = True

        pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT,
                            Pipeline.DEFAULT, "conllu")
        error = ProcessingError()

        processed = pipeline.process(text, error)
        if error.occurred():
            raise ParserException(error.message)

        # Remove the period to make sure input corresponds to output
        if period_added:
            processed = "\n".join(processed.rstrip().split("\n")[:-1]) + "\n\n"

        return processed
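
A minimal standalone sketch of the same tokenize-to-CoNLL-U round trip, assuming a local UDPipe model file (the path and sample text here are hypothetical):

from ufal.udpipe import Model, Pipeline, ProcessingError

model = Model.load("english-ud.udpipe")  # hypothetical model path
pipeline = Pipeline(model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT,
                    "conllu")
error = ProcessingError()
processed = pipeline.process("A short sentence", error)
if error.occurred():
    raise RuntimeError(error.message)
print(processed)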
Example #2
0
def run(model_file, text_file):
    print('Loading model...')
    model = Model.load(model_file)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT,
                        'conllu')
    print('Reading corpus...')
    with open(text_file) as f:
        text = f.read()

    print('Analyzing text...')
    processed = pipeline.process(text)

    print('Extracting phrases...')
    phrases = []
    sent = []
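    # a trailing '#' sentinel triggers the comment branch below one last time,
    # so the final sentence is flushed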
    for line in tqdm((processed + '#').splitlines()):
        if line.startswith('#') and len(sent):
            preps = get_preps(sent)
            for prep, dep_id in preps.values():
                pphrase = get_phrase(prep, dep_id, sent)
                phrases.append(pphrase)
            sent.clear()
        elif len(line) > 1:
            try:
                sent.append(Token(line.split('\t')))
            except ValueError:
                continue
    print('Done!')
    return phrases
Example #3
0
def tag_ud(text='The text must be passed to the function as a string!',
           modelfile='udpipe_syntagrus.model'):
    model = Model.load(modelfile)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT,
                        'conllu')
    # process the text; the result comes back in CoNLL-U format
    processed = pipeline.process(text)
    # skip comment lines carrying metadata
    output = [l for l in processed.split('\n') if not l.startswith('#')]
    # extract the lemma and the tag from the processed text
    tagged = [
        w.split('\t')[2].lower() + '_' + w.split('\t')[3] for w in output if w
    ]
    # tagged_propn = []
    # propn  = []
    # for t in tagged:
    #	if t.endswith('PROPN'):
    #		if propn:
    #			propn.append(t)
    #		else:
    #			propn = [t]
    #	else:
    #		if len(propn) > 1:
    #			for x in propn:
    #				#name = '::'.join([x.split('_')[0] for x in propn]) + '_PROPN'
    #				tagged_propn.append(x)
    #		elif len(propn) == 1:
    #			tagged_propn.append(propn[0])
    #		tagged_propn.append(t)
    #		propn = []
    return tagged
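
A hedged usage note: with the SynTagRus model next to the script, a call like the following returns lemma_POS strings (the output shown is illustrative, not captured from a real run):

tagged = tag_ud('Мама мыла раму')
# hypothetical output, roughly: ['мама_NOUN', 'мыть_VERB', 'рама_NOUN', '._PUNCT']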
Example #4
0
def udpipeS(pathmodel, sourcepath, pathdestination):
    model = Model.load(pathmodel)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % pathmodel)
        sys.exit(1)
    sys.stderr.write('done\n')

    pipeline = Pipeline(model, "horizontal", Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    error = ProcessingError()

    filenames = os.listdir(sourcepath)
    for i, filename in enumerate(filenames, start=1):
        with io.open(sourcepath + filename, "r", encoding="utf-8") as text:
            string = text.read()
        # Process data
        processed = pipeline.process(string, error)
        if error.occurred():
            sys.stderr.write("An error occurred when running run_udpipe: ")
            sys.stderr.write(error.message)
            sys.stderr.write("\n")
            sys.exit(1)
        # "w" truncates any existing output file; the input name is assumed to
        # end in a three-character extension, which is replaced by "conllu"
        with open(pathdestination + filename[:-3] + "conllu", "w") as f:
            f.write(processed)

        print("File n ", i, " processed of ", len(filenames))
Example #5
0
def udpipeG(pathmodel):
    path = "/home/guido/Progetto Unitexto/textdata/cleanedTxt/"
    model = Model.load(pathmodel)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % pathmodel)
        sys.exit(1)
    sys.stderr.write('done\n')

    pipeline = Pipeline(model, "horizontal", Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    error = ProcessingError()

    # "w" truncates the combined corpus file on open
    f = open("/home/guido/Progetto Unitexto/textdata/corpus.conllu", "w")
    filenames = os.listdir(path)
    for i, filename in enumerate(filenames, start=1):
        with io.open(path + filename, "r", encoding="utf-8") as text:
            string = text.read()
        # Process data
        processed = pipeline.process(string, error)
        if error.occurred():
            sys.stderr.write("An error occurred when running run_udpipe: ")
            sys.stderr.write(error.message)
            sys.stderr.write("\n")
            sys.exit(1)
        f.write(processed)

        print("File n ", i, " processed of ", len(filenames))
    f.close()
Example #6
0
    def __call__(self, *argv):
        """Performs tokenization, tagging, lemmatizing and parsing.

        Args:
            text(str): text. OR
            tokens(list): List of Token objects.
            sentences(list): List of Sentence objects.

        Returns:
            Dictionary that contains:
            1. tokens - list of objects Token.
            2. sentences - list of objects Sentence.
            3. lemma - list of lists of strings that represent lemmas of words.
            4. postag - list of lists of strings that represent postags of words.
            5. morph - list of lists of strings that represent morphological features.
            6. syntax_dep_tree - list of lists of objects WordSynt that represent a dependency tree.
        """
        assert self.model
        if isinstance(argv[0], str):
            self.TOKENIZER = 'generic_tokenizer'
            self.pipeline = Pipeline(self.model, self.TOKENIZER, self.tagger,
                                     self.parser, 'conllu')

            return self.process_text(argv[0])

        self.TOKENIZER = 'horizontal'
        self.pipeline = Pipeline(self.model, self.TOKENIZER, self.tagger,
                                 self.parser, 'conllu')
        return self.process_tokenized(argv[0], argv[1])
Example #7
0
def udpipe(sentences, model_name, verbose=False):
    """
    Parse text to Universal Dependencies using UDPipe.
    :param sentences: iterable of iterables of strings (one string per line)
    :param model_name: filename containing UDPipe model to load
    :param verbose: print extra information
    :return: iterable of lines containing parsed output
    """
    from ufal.udpipe import Model, Pipeline, ProcessingError
    model = Model.load(model_name)
    if not model:
        raise ValueError("Invalid model: '%s'" % model_name)
    pipeline = Pipeline(model, "conllu", Pipeline.DEFAULT, Pipeline.DEFAULT,
                        "conllu")
    lines1, lines2 = tee(l for s in sentences for l in s)
    text = "\n".join(lines1)
    error = ProcessingError()
    num_tokens = sum(1 for l in lines2 if l)
    with ioutil.external_write_mode():
        print("Running %s on %d tokens... " % (model_name, num_tokens),
              end="",
              flush=True)
    start = time()
    processed = pipeline.process(text, error)
    duration = time() - start
    if error.occurred():
        raise RuntimeError(error.message)
    with ioutil.external_write_mode():
        print("Done (%.3fs, %.0f tokens/s)" %
              (duration, num_tokens / duration if duration else 0))
        if verbose:
            print(processed)
    return processed.splitlines()
Example #8
0
    def load_file(self, name, filename, lang):

        if filename not in tronco_special_files:
            filename_dir = os.path.join(root_path, "corpora", name, filename)
            if lang not in self.models:
                self.models[lang] = Model.load(
                    os.path.join(root_path, "udpipe",
                                 udpipe_models[lang]['path']))
            pipeline = Pipeline(self.models[lang], "tokenize",
                                Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
            with open(filename_dir) as f:
                try:
                    text = f.read().splitlines()
                except Exception:
                    return False

            raw_text = []
            metadata = {'filename': filename}
            for x in text:
                # lines like "# key = value" become metadata; everything else is text
                if x.strip().startswith("# ") and " = " in x:
                    key, value = x.split(" = ", 1)
                    metadata[key.split("# ", 1)[1]] = value
                else:
                    raw_text.append(x)

            if name not in self.files:
                self.files[name] = {}
            self.files[name][filename] = pipeline.process(
                "\n".join(raw_text)).replace("# newdoc\n",
                                             "").replace("# newpar\n", "")
            if name not in self.metadata:
                self.metadata[name] = {}
            self.metadata[name][filename] = metadata
Example #9
0
    def parse(self, text):
        # Lazy load model file to speed up startup
        if not self.model:
            self.model = self.load_model()

        text = text.strip()

        # Appending a period improves sentence detection on very short inputs
        period_added = False
        if text and re.match(r"\w", text[-1], flags=re.UNICODE):
            text += "."
            period_added = True

        pipeline = Pipeline(
            self.model,
            "tokenize",
            Pipeline.DEFAULT,
            Pipeline.DEFAULT,
            "conllu"
        )
        error = ProcessingError()

        processed = pipeline.process(text, error)
        if error.occurred():
            raise ParserException(error.message)

        # Remove the period to make sure input corresponds to output
        if period_added:
            processed = "\n".join(processed.rstrip().split("\n")[:-1]) + "\n\n"

        return processed
Example #10
0
    def process(self, text: 'str') -> 'Scene':
        """
        Processes the description and builds a scene based on it.

        Parameters
        ----------
        text : str
            The description of the scene.

        Returns
        -------
        Scene
            The scene described by the text.
        """
        text_preprocessed = self._preprocess(text)

        pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT,
                            Pipeline.DEFAULT, "conllu")
        error = ProcessingError()
        processed = pipeline.process(text_preprocessed, error)
        if error.occurred():
            raise RuntimeError(error.message)
        parsed = conllu.parse(processed)

        scene = self._traverse_tree(parsed)

        return scene
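
For reference, a minimal sketch of feeding UDPipe output to the third-party conllu package (the model path and sentence are hypothetical; "form" and "lemma" are standard CoNLL-U field names):

from ufal.udpipe import Model, Pipeline
import conllu

model = Model.load("english-ud.udpipe")  # hypothetical model path
pipeline = Pipeline(model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT,
                    "conllu")
for sentence in conllu.parse(pipeline.process("The cat sat on the mat.")):
    for token in sentence:
        print(token["form"], token["lemma"])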
Example #11
0
def make_conll_with_udpipe(text):
    model_path = os.path.join(os.getcwd(), 'udparsers',
                              'russian-syntagrus-ud-2.5-191206.udpipe'
                              )  # specify the path to the model here
    model = Model.load(model_path)
    pipeline = Pipeline(model, 'tokenizer=ranges', Pipeline.DEFAULT,
                        Pipeline.DEFAULT, 'conllu')
    return pipeline.process(text)
Example #12
0
def make_conll_with_udpipe(text, language='german'):
    if language == 'german':
        model_path = path.join('..', '..', 'udpipe',
                               'german-ud-2.0-170801.udpipe')
    else:
        raise ValueError('Unsupported language: %r' % language)
    model = Model.load(model_path)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT,
                        'conllu')
    return pipeline.process(text)
Example #13
0
 def __init__(self):
     print('Loading model: ')
     model_path = r"D:\py_projects\IWonnaBook\udpipe_syntagrus.model"
     self.model = Model.load(model_path)
     if not self.model:
         print('Model not loaded :(')
         sys.exit(1)
     self.pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
     print('done\n')
 def wordToInf(self, text):
     # reuse the model loaded in __init__
     process_pipeline = Pipeline(self.model, 'tokenize',
                                 Pipeline.DEFAULT, Pipeline.DEFAULT,
                                 'conllu')
     # index 4 is the first token row, after the # newdoc, # newpar,
     # # sent_id and # text comment lines of the default output
     wordInfo = process_pipeline.process(text).split('\n')[4].split('\t')
     if wordInfo[3] == 'NUM':
         return '_NUM_' + ('x' * len(wordInfo[2]))
     else:
         return wordInfo[2]
Example #15
0
    def __init__(self, seed=42):

        self.morph = pymorphy2.MorphAnalyzer()
        self.model = Model.load("data/udpipe_syntagrus.model".encode())
        self.process_pipeline = Pipeline(self.model, 'tokenize'.encode(), Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu'.encode())
        self.seed = seed
        self.init_seed()
        self.paronyms = self.get_paronyms()
        self.freq_bigrams = self.open_freq_grams()
    @staticmethod
    def get(modelAdd, text):

        from ufal.udpipe import Model, Pipeline, ProcessingError

        error = ProcessingError()
        model = Model.load(modelAdd)
        pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
        parsedArticle = pipeline.process(text, error)

        return parsedArticle
Example #17
0
 def __init__(self, lang="en"):
     model_file = "/Users/sxs149331/PycharmProjects/UniversalPetrarch-master/" + self.models[
         lang]
     print(model_file)
     self.model = Model.load(model_file)
     if not self.model:
         sys.stderr.write("Model Loading Failed\n")
         sys.exit(1)
     sys.stderr.write('done\n')
     self.pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT,
                              Pipeline.DEFAULT, "conllu")
Example #18
0
def extract_sentences(input_file: str, output_file: str, logger) -> None:

    logger.info(
        "==== Now performing sentence extraction from paragraphs file ====")
    # UDPipe initialization
    lang_model = 'lang_models/czech-ud-2.0-170801.udpipe'
    model = Model.load(lang_model)
    if not model:
        logger.error('Could not load UDPipe language model: ' + lang_model)
        return
    ud_pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT,
                           Pipeline.DEFAULT, '')
    ud_error = ProcessingError()

    sentences_file = open(output_file, "w")
    # reopen paragraphs for reading
    paragraphs_file = open(input_file, "r")

    sentences_count = 0

    for p_line in paragraphs_file:
        page_first_sentence = ""
        page_first_paragraph = p_line.split(
            '\t', 1)  # use the variable as temporary list

        # If there is a paragraph content
        if len(page_first_paragraph) == 2:
            page_uri = page_first_paragraph[0]
            page_first_paragraph = page_first_paragraph[1]
            # Extract first sentence from paragraph using UDPipe:
            ud_output = ud_pipeline.process(page_first_paragraph, ud_error)
            if ud_error.occurred():
                logger.error(
                    'Error occurred while extracting sentence using UDPipe: ' +
                    ud_error.message)
                page_first_sentence = ""
            else:
                ud_output = ud_output.split('\n')
                if len(ud_output) >= 4:
                    page_first_sentence = ud_output[3][
                        9:]  # assumption about the output format
                else:
                    page_first_sentence = ""

            # Write sentence to the file
            sentences_file.write(page_uri + '\t' + page_first_sentence + '\n')

            sentences_count += 1
            if sentences_count % 2000 == 0:
                logger.info("Extracted {} sentences.".format(sentences_count))

    logger.info("Finished extraction of {} sentences.".format(sentences_count))

    paragraphs_file.close()
    sentences_file.close()
Example #19
0
 def load_udpipe(self, filename):
     if not can_udpipe:
         print("importing udpipe failed, cannot load udpipe xxx")
         return
     self.udpiper = Model.load(filename)
     # use pipeline for now, ugly but workable
     self.udpipeline = Pipeline(self.udpiper, 'horizontal',
                                Pipeline.DEFAULT, Pipeline.DEFAULT,
                                'conllu')
     self.uderror = ProcessingError()
     self.can_udpipe = True
Example #20
0
 def __init__(self, udpipe_path, keep_pos=1, keep_punct=0, keep_stops=1):
     self.stop_pos = {
         'AUX', 'NUM', 'DET', 'PRON', 'ADP', 'SCONJ', 'CCONJ', 'INTJ',
         'PART', 'X'
     }
     self.udpipe_model = Model.load(udpipe_path)
     self.pipeline = Pipeline(self.udpipe_model, 'tokenize',
                              Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
     self.keep_pos = keep_pos
     self.keep_punct = keep_punct
     self.keep_stops = keep_stops
Example #21
0
    def __init__(self, speller: Optional[Speller] = None):
        self.udpipe_model = Model.load(
            sber_encode(sber_path('/var/sberbank/udpipe_syntagrus.model')))
        self.process_pipeline = Pipeline(self.udpipe_model,
                                         sber_encode('tokenize'),
                                         Pipeline.DEFAULT, Pipeline.DEFAULT,
                                         sber_encode('conllu'))

        if speller is None:
            speller = Speller()
        self.speller: Speller = speller
Example #22
0
    def run_udpipe(self, path_to_model, sents=None):
        if sents is None:
            sents = self.sents
        verticals = self._to_vertical(sents)
        model = Model.load(path_to_model)
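        # "vertical" input format: one token per line, blank line between sentences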
        pipeline = Pipeline(model, "vertical", Pipeline.DEFAULT,
                            Pipeline.DEFAULT, "conllu")
        error = ProcessingError()
        conllu = pipeline.process(verticals, error)
        if error.occurred():
            raise RuntimeError(error.message)

        return conllu
Example #23
0
    def __init__(self, seed=42):

        self.morph = pymorphy2.MorphAnalyzer()
        self.model = Model.load(
            sber_encode(sber_path('/var/sberbank/udpipe_syntagrus.model')))
        self.process_pipeline = Pipeline(self.model, sber_encode('tokenize'),
                                         Pipeline.DEFAULT, Pipeline.DEFAULT,
                                         sber_encode('conllu'))
        self.seed = seed
        self.init_seed()
        self.paronyms = self.get_paronyms()
        self.freq_bigrams = self.open_freq_grams()
Example #24
0
    def __init__(self):

        ud_model_path = '../models/udpipe_syntagrus.model'
        self.ud_model = Model.load(ud_model_path)

        w2v_model_path = '../models/ruscorpora_upos_skipgram_300_5_2018.vec.gz'
        self.w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
            w2v_model_path)

        self.process_pipeline = Pipeline(self.ud_model, 'tokenize',
                                         Pipeline.DEFAULT, Pipeline.DEFAULT,
                                         'conllu')
    def __init__(self, modeldir):

        self.modeldir = os.path.abspath(modeldir)
        self.model = Model.load(self.modeldir)
        self.error = ProcessingError()

        if not self.model:
            sys.stderr.write("Udpipe language model loading failed:" +
                             self.modeldir)
            sys.exit(1)

        self.pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT,
                                 Pipeline.DEFAULT, "conllu")
Example #26
0
 def __init__(self, udmodel, wordModel):
     self.__udmodel__ = Model.load(udmodel)
     if self.__udmodel__ is None:
         raise ValueError('Unknown UDPipe model')
     self.__pipeline__ = Pipeline(
         self.__udmodel__,
         'horizontal',
         Pipeline.DEFAULT,
         Pipeline.DEFAULT,
         'conllu')
     self.__uderror__ = ProcessingError()
     self.__srem__ = WordModel(wordModel)
     self.__result__ = Result()
Example #27
0
    def process_sentence(self, sen, field_names=None):
        pipeline = Pipeline(self._model, self._inp_format, self._pos_settings, self._parse_settings, 'conllu')
        error = ProcessingError()  # For catching errors...

        inp_sen = ''.join(self._encode_sentence(sen, field_names))
        # Do the processing... + Write the output in CoNLL-U
        processed = pipeline.process(inp_sen, error)

        if error.occurred():
            raise UDPipeError(error.message)

        ret_sen = self._decode_sentence(processed, sen, field_names)
        return ret_sen
Example #28
0
def parse(sentence):
    model = Model.load('russian-syntagrus-ud-2.4-190531.udpipe')
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT,
                        'conllu')
    error = ProcessingError()
    # small preprocessing step: add a space after guillemets so they tokenize cleanly
    sentence = re.sub('«', '« ', sentence)
    sentence = re.sub('»', '» ', sentence)
    parsed = pipeline.process(sentence, error)
    print(parsed)
    return parsed
Example #29
0
 def tag_parse_tree(self, root):
     """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized)."""
     pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
     in_data = " ".join([n.form for n in root.descendants])
     out_data = pipeline.process(in_data, self.error)
     if self.error.occurred():
         raise IOError("UDPipe error " + self.error.message)
     self.conllu_reader.files.filehandle = io.StringIO(out_data)
     parsed_root = self.conllu_reader.read_tree()
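     # copy annotations back onto the original nodes, aligned by word order (ord)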
     nodes = [root] + root.descendants
     for parsed_node in parsed_root.descendants:
         node = nodes[parsed_node.ord]
         node.parent = nodes[parsed_node.parent.ord]
         for attr in 'upos xpos lemma feats'.split():
             setattr(node, attr, getattr(parsed_node, attr))
def load_nli_data(path, snli=False, udpipe_path=None, seq_length=50, r=10, cache_file=''):
    """
    Load MultiNLI or SNLI data.
    If the 'snli' parameter is set to True, a genre label of snli will be assigned to the data. 
    """
    global is_snli, pipeline, error, pr_seq_length, pr_r
    is_snli = snli
    pr_r = r
    pr_seq_length = seq_length
    pipeline = None
    print(path)
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            data = [w for w in pickle.load(f) if w is not None]
    else:
        if udpipe_path:
            model = Model.load(udpipe_path)
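            # "horizontal" input: one sentence per line, tokens separated by spaces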
            pipeline = Pipeline(model, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
            error = ProcessingError()

        with open(path) as f:
            pool = Pool(32)
            data = pool.map_async(process_line, list(f), chunksize=1)
            while not data.ready():
                print('{} lines left'.format(data._number_left))
                time.sleep(10)
            data = [w for w in data.get() if w is not None]
            pool.close()
            random.seed(1)
            random.shuffle(data)

        with open(cache_file, 'wb') as f:
            pickle.dump(data, f)

    return data
Example #31
0
 def tag_parse_tree(self, root):
     """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized)."""
     descendants = root.descendants
     if not descendants:
         return
     pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
     in_data = " ".join([n.form for n in descendants])
     out_data = pipeline.process(in_data, self.error)
     if self.error.occurred():
         raise IOError("UDPipe error " + self.error.message)
     self.conllu_reader.files.filehandle = io.StringIO(out_data)
     parsed_root = self.conllu_reader.read_tree()
     nodes = [root] + descendants
     for parsed_node in parsed_root.descendants:
         node = nodes[parsed_node.ord]
         node.parent = nodes[parsed_node.parent.ord]
         for attr in 'upos xpos lemma feats'.split():
             setattr(node, attr, getattr(parsed_node, attr))
Example #32
-1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--from_dir_prefix', type=str)
    parser.add_argument('-t', '--to_dir_prefix', default='/tmp/workers', type=str)
    parser.add_argument('-v', '--to_vocab_file', default='/tmp/workers/vocab.txt', type=str)
    parser.add_argument('-u', '--path2udp_model', default='./russian-syntagrus-ud-2.0-170801.udpipe', type=str)
    parser.add_argument('-n', '--cpu_n', default=5, type=int)
    parser.add_argument('-T', '--timeout_duration', default=40 * 60, type=int)  # 40 minutes
    args = parser.parse_args()

    os.makedirs(args.to_dir_prefix, exist_ok=True)

    model = Model.load(args.path2udp_model)
    udpipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'horizontal')

    files = list(glob(args.from_dir_prefix))

    word_counts = multipd.timeouted_run_pool(files, udpipeline, args.to_dir_prefix,
                                             cpu_n=args.cpu_n,
                                             timeout_duration=args.timeout_duration)
    word_count = sum(word_counts, collections.Counter())

    most_common = word_count.most_common()
    # one word per line; the last word is written without a trailing newline
    vocab = ['%s\n' % word for word, _ in most_common[:-1]]
    vocab.append(most_common[-1][0])
    with open(args.to_vocab_file, 'wt') as f:
        f.writelines(vocab)

    freqs = ['{}\t{}\n'.format(word, freq) for word, freq in most_common[:-1]]
    with open(args.to_vocab_file + '.freqs', 'wt') as f:
        f.writelines(freqs)