Example #1
    def get_udpipe_model(lang):
        """
            Static ref to a UDPipe model.
            The actual models are in the data directory.
        """
        from ufal.udpipe import Model, Pipeline
        res_path = Resources.get_resources_dir()

        if lang == "en":
            if Resources.en_model is None:
                Resources.en_model = Model.load(os.path.join(res_path, "english-ud-2.1-20180111.udpipe"))
                if Resources.en_model is None:
                    raise Exception("Failed to load the English UDPipe model.")
            return Resources.en_model
        elif lang == "de":
            if Resources.en_model is None:
                Resources.en_model = Model.load(os.path.join(res_path, "german-ud-2.0-170801.udpipe"))
                if Resources.en_model is None:
                    raise Exception("Failed to load the German UDPipe model.")
            return Resources.en_model
        elif lang == "fr":
            if Resources.fr_model is None:
                Resources.fr_model = Model.load(os.path.join(res_path, "french-sequoia-ud-2.1-20180111.udpipe"))
                if Resources.fr_model is None:
                    raise Exception("Failed to load the French UDPipe model.")
            return Resources.fr_model

        elif lang == "nl":
            if Resources.nl_model is None:
                Resources.nl_model = Model.load(os.path.join(res_path, "dutch-ud-2.1-20180111.udpipe"))
                if Resources.nl_model is None:
                    raise Exception("Failed to load the Dutch UDPipe model.")
            return Resources.nl_model
        else:
            raise Exception(f"Language '{lang}' is not supported.")
Example #2
    def __init__(self, task='parse', model=os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                                        'hungarian-szeged-ud-2.5-191206.udpipe'),
                 source_fields=None, target_fields=None):
        # Download model:
        #   - UDv1.2 https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-1659
        #   - UDv2.4 https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2898
        #   - UDv2.5 https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131
        self._model = Model.load(model)  # Load the UDPipe model file
        if self._model is None:
            raise UDPipeError('ERROR: Could not load model file {0}'.format(model))
        available_tasks = {'tok': self._setup_tok, 'pos': self._setup_pos, 'parse': self._setup_parse,
                           'tok-pos': self._setup_tok_pos, 'tok-parse': self._setup_tok_parse,
                           'pos-parse': self._setup_pos_parse}

        for keyword, key_fun in available_tasks.items():
            if task == keyword:
                key_fun()  # Do setup!
                self._task = task  # Store for later
                break
        else:
            raise ValueError('No proper task is specified. The available tasks are {0}'.
                             format(' or '.join(available_tasks.keys())))

        # Field names for xtsv (the code below is mandatory for an xtsv module)
        if source_fields is None:
            source_fields = set()

        if target_fields is None:
            target_fields = []

        self.source_fields = source_fields
        self.target_fields = target_fields
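The `for ... else` construct above is easy to misread: the `else` arm runs only when the loop finishes without hitting `break`, i.e. when no task matched. A stand-alone sketch of the same dispatch pattern, with hypothetical names:

    handlers = {'tok': lambda: 'tokenizer ready', 'parse': lambda: 'parser ready'}
    task = 'parse'
    for keyword, setup in handlers.items():
        if task == keyword:
            print(setup())  # run the matching setup
            break
    else:  # no break: the task was never matched
        raise ValueError('No proper task is specified. The available tasks are {0}'
                         .format(' or '.join(handlers)))

Since the keys are exact matches, a plain dict lookup (`if task in available_tasks: available_tasks[task]()`) would express the same dispatch more directly.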
Example #3
def tokenize_and_tag_texts(dict_texts):
    from ufal.udpipe import Model, Pipeline

    eng_model = Model.load('english-partut-ud-2.5-191206.udpipe')
    fr_model = Model.load('french-partut-ud-2.5-191206.udpipe')
    eng_pipeline = Pipeline(eng_model, 'generic_tokenizer', '', '', '')
    fr_pipeline = Pipeline(fr_model, 'generic_tokenizer', '', '', '')
    # Note: each pass overwrites the previous results, so only the last entry
    # of dict_texts survives into the returned dictionary.
    for language_key, primal_texts in dict_texts.items():
        tokenized_tagged_eng_text = eng_pipeline.process(primal_texts[1])
        tokenized_tagged_fr_text = fr_pipeline.process(primal_texts[2])
    dict_tokenized_tagged_texts = {
        'eng': tokenized_tagged_eng_text,
        'fr': tokenized_tagged_fr_text
    }
    # print(tokenized_tagged_eng_text)
    # print(tokenized_tagged_fr_text)
    # print(dict_tokenized_tagged_texts)
    return dict_tokenized_tagged_texts
Example #4
def run(model_file, text_file):
    from ufal.udpipe import Model, Pipeline
    from tqdm import tqdm

    print('Loading model...')
    model = Model.load(model_file)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT,
                        'conllu')
    print('Reading corpus...')
    with open(text_file) as f:
        text = f.read()

    print('Analyzing text...')
    processed = pipeline.process(text)

    print('Extracting phrases...')
    phrases = []
    sent = []
    # A trailing '#' is appended as a sentinel so the final sentence is
    # flushed by the comment-line branch below.
    for line in tqdm((processed + '#').splitlines()):
        if line.startswith('#') and len(sent):
            preps = get_preps(sent)
            for prep, dep_id in preps.values():
                pphrase = get_phrase(prep, dep_id, sent)
                phrases.append(pphrase)
            sent.clear()
        elif len(line) > 1:
            try:
                sent.append(Token(line.split('\t')))
            except ValueError:
                continue
    print('Done!')
    return phrases
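`Token`, `get_preps`, and `get_phrase` are project helpers not shown here. A hypothetical minimal `Token`, consistent with how the loop uses it: it consumes one tab-split CoNLL-U row and raises `ValueError` on multiword ranges such as `1-2`, which the `except ValueError` above silently skips:

    class Token:
        """Hypothetical sketch: one CoNLL-U row split on tabs."""
        def __init__(self, columns):
            self.id = int(columns[0])   # raises ValueError for ranges like '1-2'
            self.form = columns[1]
            self.lemma = columns[2]
            self.upos = columns[3]
            self.head = int(columns[6])
            self.deprel = columns[7]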
Example #5
    def _tokenize(self, text='Текст нужно передать функции в виде строки!'):
        from utils import lemmatize

        if not self.udpipe_model:
            udpipe_model_path = os.path.join(BASE_DIR, 'model',
                                             'udpipe_syntagrus.model')

            if not os.path.isfile(udpipe_model_path):
                msg = 'UDPipe model not found!'
                logging.critical(msg)
                raise IOError(msg)

            self.udpipe_model = Model.load(udpipe_model_path)

        t = time()
        process_pipeline = Pipeline(self.udpipe_model, 'tokenize',
                                    Pipeline.DEFAULT, Pipeline.DEFAULT,
                                    'conllu')

        result = []
        for line in nltk.sent_tokenize(text):
            # line = unify_sym(line.strip())  # your own text-cleaning function could go here
            output = lemmatize(process_pipeline, text=line)
            result.extend(output)

        self.tagged_counter += 1
        log(f'{self.tagged_counter} of {self.tagged_max} created, in {round(time() - t, 2)}s')

        return result
Example #6
    def __init__(self, lang):
        """Load UDPipe model for given language.

        lang (unicode): ISO 639-1 language code or shorthand UDPipe model name.
        RETURNS (UDPipeModel): Language specific UDPipeModel.
        """
        path = get_path(lang)
        self.model = Model.load(path)
        if not self.model:
            msg = "Cannot load UDPipe model from " \
                  "file '{}'".format(path)
            raise Exception(msg)
        self._lang = lang.split('-')[0]
        self._meta = {
            'authors': ("Milan Straka, "
                        "Jana Straková"),
            'description': "UDPipe pretrained model.",
            'email': '*****@*****.**',
            'lang': 'udpipe_' + self._lang,
            'license': 'CC BY-NC-SA 4.0',
            'name': path.split('/')[-1],
            'parent_package': 'spacy_udpipe',
            'pipeline': 'Tokenizer, POS Tagger, Lemmatizer, Parser',
            'source': 'Universal Dependencies 2.4',
            'url': 'http://ufal.mff.cuni.cz/udpipe',
            'version': '1.2.0'
        }
Example #7
def udpipeG(pathmodel):
    path = "/home/guido/Progetto Unitexto/textdata/cleanedTxt/"
    model = Model.load(pathmodel)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % pathmodel)
        sys.exit(1)
    sys.stderr.write('done\n')

    pipeline = Pipeline(model, "horizontal", Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    error = ProcessingError()
    #   corp = io.open("/home/guido/Progetto Unitexto/textdata/corpus.txt","r",encoding= "utf-8")
    # Read whole input
    #  string="".join(corp.readlines())

    # Process data
    # processed = pipeline.process(string, error)

    f = open("/home/guido/Progetto Unitexto/textdata/corpus.conllu", "a")
    f.truncate(0)
    i = 1
    for filename in os.listdir(path):
        text = io.open(path + filename, "r", encoding="utf-8")
        string = "".join(text.readlines())
        # Process data
        processed = pipeline.process(string, error)
        if error.occurred():
            sys.stderr.write("An error occurred when running run_udpipe: ")
            sys.stderr.write(error.message)
            sys.stderr.write("\n")
            sys.exit(1)
        f.write(processed)

        print("File n ", i, " processed of ", len(os.listdir(path)))
        i += 1
Example #8
def parse(text, sentence_id):
    """Take a sentence in raw text and produce
    its CoNLL-U annotation by invoking UDPipe.

    Parameters: text - the sentence to be parsed
                sentence_id - the ID of the sentence

    Output: a UD graph
    """
    model = Model.load('./models/udpipe/english-ewt-ud-2.3-181115.udpipe')

    tokenizer = model.newTokenizer(model.TOKENIZER_PRESEGMENTED)
    # tokenizer = model.TOKENIZER_PRESEGMENTED(model.DEFAULT)

    conlluOutput = OutputFormat.newOutputFormat("conllu")

    sentence = Sentence()

    error = ProcessingError()

    tokenizer.setText(text)

    tokenizer.nextSentence(sentence, error)

    model.tag(sentence, model.DEFAULT)

    model.parse(sentence, model.DEFAULT)

    return conlluOutput.writeSentence(sentence).replace(
        '# sent_id = 1', '# sent_id = ' + sentence_id)
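A usage sketch, assuming the model file above exists (hypothetical call). Note that `parse` reloads the model on every invocation, which is expensive; batch callers may want to hoist `Model.load` out of the function:

    print(parse("The cat sat on the mat.", "doc1-s3"))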
Example #9
def udpipe(sentences, model_name, verbose=False):
    """
    Parse text to Universal Dependencies using UDPipe.
    :param sentences: iterable of iterables of strings (one string per line)
    :param model_name: filename containing UDPipe model to load
    :param verbose: print extra information
    :return: iterable of lines containing parsed output
    """
    from ufal.udpipe import Model, Pipeline, ProcessingError
    model = Model.load(model_name)
    if not model:
        raise ValueError("Invalid model: '%s'" % model_name)
    pipeline = Pipeline(model, "conllu", Pipeline.DEFAULT, Pipeline.DEFAULT,
                        "conllu")
    lines1, lines2 = tee(l for s in sentences for l in s)
    text = "\n".join(lines1)
    error = ProcessingError()
    num_tokens = sum(1 for l in lines2 if l)
    with ioutil.external_write_mode():
        print("Running %s on %d tokens... " % (model_name, num_tokens),
              end="",
              flush=True)
    start = time()
    processed = pipeline.process(text, error)
    duration = time() - start
    with ioutil.external_write_mode():
        print("Done (%.3fs, %.0f tokens/s)" %
              (duration, num_tokens / duration if duration else 0))
        if verbose:
            print(processed)
    if error.occurred():
        raise RuntimeError(error.message)
    return processed.splitlines()
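Since the pipeline's input format is "conllu", each inner iterable must already contain CoNLL-U lines; the model then re-tags and re-parses them. A hypothetical call (the model filename is an assumption):

    conllu_sent = [
        "1\tHello\t_\t_\t_\t_\t_\t_\t_\t_",
        "2\tworld\t_\t_\t_\t_\t_\t_\t_\t_",
        "",  # blank line terminates the sentence
    ]
    for line in udpipe([conllu_sent], "english-ewt-ud-2.5-191206.udpipe", verbose=True):
        print(line)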
Example #10
def parse_file(text):
    from ufal.udpipe import Model, Pipeline
    model_path = MODELS_DIR + MODEL_NAMES['russian']
    model = Model.load(model_path)
    if model is None:
        raise ValueError("Cannot load UDPipe model from '%s'" % model_path)
    pipeline = Pipeline(model, 'tokenizer=ranges', Pipeline.DEFAULT,
                        Pipeline.DEFAULT, 'conllu')
    return process_udpipe(text, pipeline)
Example #11
def udpipeS(pathmodel, sourcepath, pathdestination):
    model = Model.load(pathmodel)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % pathmodel)
        sys.exit(1)
    sys.stderr.write('done\n')

    pipeline = Pipeline(model, "horizontal", Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    error = ProcessingError()

    i = 1
    for filename in os.listdir(sourcepath):
        f = open(pathdestination + filename[:-3] + "conllu", "w")  # 'w' truncates the file on open

        text = io.open(sourcepath + filename, "r", encoding="utf-8")
        string = "".join(text.readlines())
        # Process data
        processed = pipeline.process(string, error)
        if error.occurred():
            sys.stderr.write("An error occurred when running run_udpipe: ")
            sys.stderr.write(error.message)
            sys.stderr.write("\n")
            sys.exit(1)
        f.write(processed)
        f.close()

        print("File n ", i, " processed of ", len(os.listdir(sourcepath)))
        i += 1
Example #12
    def load_file(self, name, filename, lang):

        if filename not in tronco_special_files:
            filename_dir = os.path.join(root_path, "corpora", name, filename)
            if lang not in self.models:
                self.models[lang] = Model.load(
                    os.path.join(root_path, "udpipe",
                                 udpipe_models[lang]['path']))
            pipeline = Pipeline(self.models[lang], "tokenize",
                                Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
            with open(filename_dir) as f:
                try:
                    text = f.read().splitlines()
                except Exception:  # unreadable file (e.g. bad encoding)
                    return False

            raw_text = []
            metadata = {'filename': filename}
            # Separate "# key = value" metadata comments from the raw text lines.
            for x in text:
                if x.strip().startswith("# ") and " = " in x:
                    head, value = x.split(" = ", 1)
                    metadata[head.split("# ", 1)[1]] = value
                else:
                    raw_text.append(x)

            if name not in self.files:
                self.files[name] = {}
            self.files[name][filename] = pipeline.process(
                "\n".join(raw_text)).replace("# newdoc\n",
                                             "").replace("# newpar\n", "")
            if name not in self.metadata:
                self.metadata[name] = {}
            self.metadata[name][filename] = metadata
Example #13
    def __init__(self,
                 lang: str,
                 path: Optional[str] = None,
                 meta: Optional[Dict] = None):
        """Load UDPipe model for given language.

        lang: ISO 639-1 language code or shorthand UDPipe model name.
        path: Path to UDPipe model.
        meta: Meta-information about the UDPipe model.
        """
        path = path or get_path(lang=lang)
        self.model = Model.load(path)
        if not self.model:
            raise Exception(
                "Cannot load UDPipe model from file '{}'".format(path))
        self._lang = lang.split("-")[0]
        self._meta = meta or {
            "author": "Milan Straka & Jana Straková",
            "description": "UDPipe pretrained model.",
            "email": "*****@*****.**",
            "lang": f"udpipe_{self._lang}",
            "license": "CC BY-NC-SA 4.0",
            "name": path.split("/")[-1],
            "parent_package": "spacy_udpipe",
            "pipeline": ["Tokenizer", "Tagger", "Lemmatizer", "Parser"],
            "source": "Universal Dependencies 2.5",
            "url": "http://ufal.mff.cuni.cz/udpipe",
            "version": "1.2.0"
        }
Example #14
def tag_ud(text='Текст нужно передать функции в виде строки!',
           modelfile='udpipe_syntagrus.model'):
    model = Model.load(modelfile)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT,
                        'conllu')
    # Process the text; the result comes back in CoNLL-U format.
    processed = pipeline.process(text)
    # Skip comment lines that carry metadata.
    output = [l for l in processed.split('\n') if not l.startswith('#')]
    # Extract the lemma and the POS tag of each token.
    tagged = [
        w.split('\t')[2].lower() + '_' + w.split('\t')[3] for w in output if w
    ]
    # tagged_propn = []
    # propn  = []
    # for t in tagged:
    #	if t.endswith('PROPN'):
    #		if propn:
    #			propn.append(t)
    #		else:
    #			propn = [t]
    #	else:
    #		if len(propn) > 1:
    #			for x in propn:
    #				#name = '::'.join([x.split('_')[0] for x in propn]) + '_PROPN'
    #				tagged_propn.append(x)
    #		elif len(propn) == 1:
    #			tagged_propn.append(propn[0])
    #		tagged_propn.append(t)
    #		propn = []
    return tagged
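A hypothetical call, assuming the SynTagRus model file is present; each returned item has the shape `lemma_POS`:

    print(tag_ud('Мама мыла раму.', modelfile='udpipe_syntagrus.model'))
    # roughly: ['мама_NOUN', 'мыть_VERB', 'рама_NOUN', '._PUNCT']
    # (exact lemmas and tags depend on the model version)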
Example #15
def load_nli_data(path, snli=False, udpipe_path=None, seq_length=50, r=10, cache_file=''):
    """
    Load MultiNLI or SNLI data.
    If the 'snli' parameter is set to True, a genre label of snli will be assigned to the data. 
    """
    global is_snli, pipeline, error, pr_seq_length, pr_r
    is_snli = snli
    pr_r = r
    pr_seq_length = seq_length
    pipeline = None
    print(path)
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            data = [w for w in pickle.load(f) if w is not None]
    else:
        if udpipe_path:
            model = Model.load(udpipe_path)
            pipeline = Pipeline(model, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
            error = ProcessingError()

        with open(path) as f:
            pool = Pool(32)
            data = pool.map_async(process_line, list(f), chunksize=1)
            while not data.ready():
                # AsyncResult._number_left is a private attribute, used here
                # only to report progress.
                print('{} lines left'.format(data._number_left))
                time.sleep(10)
            data = [w for w in data.get() if w is not None]
            pool.close()
            random.seed(1)
            random.shuffle(data)

        with open(cache_file, 'wb') as f:
            pickle.dump(data, f)

    return data
Example #16
def process_task_stream(sentence):
    from ufal.udpipe import Model, Pipeline
    model_path = MODELS_DIR + MODEL_NAMES['russian']  # language hardcoded so far
    model = Model.load(model_path)
    pipeline = Pipeline(model, '', '', '', '')
    print('...loaded the model')
    return process_data(sentence, pipeline)
Example #17
    def __init__(self, model_filename: str) -> None:
        self.model = Model.load(model_filename)
        self.logger = logging.getLogger(self.__class__.__name__)
        if not self.model:
            raise Exception(
                f"Cannot load model from file \"{model_filename}\".")
        with open("resources/predicates.txt") as f:
            self.predicates = {pred.strip().lower() for pred in f}
Example #18
def make_conll_with_udpipe(text):
    model_path = os.path.join(os.getcwd(), 'udparsers',
                              'russian-syntagrus-ud-2.5-191206.udpipe'
                              )  # set the path to the model here
    model = Model.load(model_path)
    pipeline = Pipeline(model, 'tokenizer=ranges', Pipeline.DEFAULT,
                        Pipeline.DEFAULT, 'conllu')
    return pipeline.process(text)
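A short hypothetical driver for the function above. The `tokenizer=ranges` option asks the UDPipe tokenizer to record each token's character range in the MISC column, which is what distinguishes this pipeline from a plain 'tokenize' one:

    conll = make_conll_with_udpipe('Мама мыла раму.')
    print('\n'.join(conll.splitlines()[:8]))  # show the first CoNLL-U rows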
Example #19
def make_conll_with_udpipe(text, language='german'):
    if language == 'german':
        model_path = path.join('..', '..', 'udpipe',
                               'german-ud-2.0-170801.udpipe')
    else:
        raise ValueError("No UDPipe model configured for language '%s'" % language)
    model = Model.load(model_path)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT,
                        'conllu')
    return pipeline.process(text)
Example #20
    def __init__(self, sourceForInf, sourceForDict, sourceForDL):
        self.nomberOfKnownWords = 0
        self.modelForInfinitive = Model.load(sourceForInf)
        self.vectorOfStr = torch.zeros(2000)
        self.sourceForDL = sourceForDL
        self.net = Net(2000, 1000, 42)
        self.net.load_state_dict(torch.load(sourceForDL, map_location='cpu'))
        self.needToSearch = False
        self.boolClassify = False

        self.clasterOfWord = {}
        with open(sourceForDict) as json_file:
            self.clasterOfWord = json.load(json_file)

        self.idOfThemes = {
            'авто/мото': 0,
            'активный отдых': 1,
            'бизнес': 2,
            'домашние животные': 3,
            'здоровье': 4,
            'знакомство и общение': 5,
            'игры': 6,
            'ИТ (компьютеры и софт)': 7,
            'кино': 8,
            'красота и мода': 9,
            'кулинария': 10,
            'культура и искусство': 11,
            'литература': 12,
            'мобильная связь и интернет': 13,
            'музыка': 14,
            'наука и техника': 15,
            'недвижимость': 16,
            'новости и СМИ': 17,
            'безопасность': 18,
            'образование': 19,
            'обустройство и ремонт': 20,
            'политика': 21,
            'продукты питания': 22,
            'промышленность': 23,
            'путешествия': 24,
            'работа': 25,
            'развлечения': 26,
            'религия': 27,
            'дом и семья': 28,
            'спорт': 29,
            'страхование': 30,
            'телевидение': 31,
            'товары и услуги': 32,
            'увлечения и хобби': 33,
            'финансы': 34,
            'фото': 35,
            'эзотерика': 36,
            'электроника и бытовая техника': 37,
            'эротика': 38,
            'юмор': 39,
            'общество, гуманитарные науки': 40,
            'дизайн и графика': 41
        }
Example #21
def load_data(filename: str,
              lemmatize: bool,
              stopwords_path: str,
              manual_language=None) -> Tuple[List, List]:
    with open(filename, 'r') as f:
        citations = f.read().split('\n')
    stopwords = []
    for stopwords_list in listdir(stopwords_path):
        with open(path.join(stopwords_path, stopwords_list), 'r') as f:
            stopwords.extend(f.read().split('\n'))
    # Download models here: https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2998#
    en_model = Model.load(
        path.join('.', 'udpipe', 'english-ewt-ud-2.4-190531.udpipe'))
    en_pipeline = Pipeline(en_model, 'tokenize', Pipeline.DEFAULT,
                           Pipeline.DEFAULT, 'conllu')
    ru_model = Model.load(
        path.join('.', 'udpipe', 'russian-syntagrus-ud-2.4-190531.udpipe'))
    ru_pipeline = Pipeline(ru_model, 'tokenize', Pipeline.DEFAULT,
                           Pipeline.DEFAULT, 'conllu')
    pattern = re.compile('[^a-zа-яA-ZА-Я ]+')
    pattern_brackets = re.compile(r'[\(\[].*?[\)\]]')  # raw string avoids invalid-escape warnings
    citations_ids = []
    citations_texts = []
    citation_id_position = 4
    for citation in citations:
        try:
            citation_id = '_'.join(citation.split()[:citation_id_position])
        except IndexError:
            print('Citation ID was not recognized for {}'.format(citation))
            continue
        assert ':' in citation_id, 'Citation ID parsed incorrectly'
        normalized_citation = normalize_sentence(citation, pattern,
                                                 pattern_brackets)
        pipeline = select_lang_pipeline(normalized_citation, en_pipeline,
                                        ru_pipeline, manual_language)
        if not pipeline:
            print('Language has not been detected for {}'.format(citation_id))
            continue
        citations_ids.append(citation_id)
        citations_texts.append(
            preprocess(pipeline,
                       normalized_citation,
                       stopwords,
                       lemmatize=lemmatize))
    return citations_ids, citations_texts
Example #22
    def __init__(self):
        print('Loading model: ')
        model_path = r"D:\py_projects\IWonnaBook\udpipe_syntagrus.model"
        self.model = Model.load(model_path)
        if not self.model:
            print('Model failed to load :(')
            sys.exit(1)
        self.pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
        print('done\n')
Example #23
    def __init__(self, seed=42):

        self.morph = pymorphy2.MorphAnalyzer()
        # The .encode() calls suggest Python 2-era bindings; current ufal.udpipe
        # builds for Python 3 take plain str arguments.
        self.model = Model.load("data/udpipe_syntagrus.model".encode())
        self.process_pipeline = Pipeline(self.model, 'tokenize'.encode(), Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu'.encode())
        self.seed = seed
        self.init_seed()
        self.paronyms = self.get_paronyms()
        self.freq_bigrams = self.open_freq_grams()
Example #24
    def load(self, model_path: str):
        logger.info("Loading UdPipe model ...")
        self.model = Model.load(model_path)
        if not self.model:
            raise Exception("Cannot load model from file '%s'." % model_path)
        self.tokenizer = self.model.newTokenizer(self.model.DEFAULT)
        if not self.tokenizer:
            raise Exception("The model does not have a tokenizer")
        self.error = ProcessingError()
Example #25
    def load_model(self):
        model_path = Parser.MODELS.get(self.language, None)
        if not model_path:
            raise ParserException("Cannot find model for language '%s'" % self.language)

        model = Model.load(model_path)
        if not model:
            raise ParserException("Cannot load model from file '%s'\n" % model_path)

        return model
Example #26
    def get(modelAdd, text):

        from ufal.udpipe import Model, Pipeline, ProcessingError

        error = ProcessingError()
        model = Model.load(modelAdd)
        pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
        parsedArticle = pipeline.process(text, error)

        return parsedArticle
Example #27
    def __init__(self, model):
        """Create the UDPipe tool object."""
        self.model = model
        path = require_file(model)
        self.tool = Model.load(path)
        if not self.tool:
            raise IOError("Cannot load model from file '%s'" % path)
        self.error = ProcessingError()
        self.conllu_reader = ConlluReader()
        self.tokenizer = self.tool.newTokenizer(Model.DEFAULT)
Example #29
def extract_sentences(input_file: str, output_file: str, logger) -> None:

    logger.info(
        "==== Now performing sentence extraction from paragraphs file ====")
    # UDPipe initialization
    lang_model = 'lang_models/czech-ud-2.0-170801.udpipe'
    model = Model.load(lang_model)
    if not model:
        logger.error('Could not load UDPipe language model: ' + lang_model)
        return  # without a model the pipeline below would crash
    ud_pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT,
                           Pipeline.DEFAULT, '')
    ud_error = ProcessingError()

    sentences_file = open(output_file, "w")
    # reopen paragraphs for reading
    paragraphs_file = open(input_file, "r")

    sentences_count = 0

    for p_line in paragraphs_file:
        page_first_sentence = ""
        page_first_paragraph = p_line.split('\t', 1)  # use the variable as a temporary [uri, paragraph] list

        # If there is a paragraph content
        if len(page_first_paragraph) == 2:
            page_uri = page_first_paragraph[0]
            page_first_paragraph = page_first_paragraph[1]
            # Extract first sentence form paragraph using UDPipe:
            ud_output = ud_pipeline.process(page_first_paragraph, ud_error)
            if ud_error.occurred():
                logger.error(
                    'Error occurred while extracting sentence using UDPipe: ' +
                    ud_error.message)
                page_first_sentence = ""
            else:
                ud_output = ud_output.split('\n')
                if len(ud_output) >= 4:
                    # Line 4 is expected to be '# text = ...'; slicing off the
                    # 9-character prefix leaves the raw sentence (an assumption
                    # about UDPipe's CoNLL-U comment layout).
                    page_first_sentence = ud_output[3][9:]
                else:
                    page_first_sentence = ""

            # Write sentence to the file
            sentences_file.write(page_uri + '\t' + page_first_sentence + '\n')

            sentences_count += 1
            if sentences_count % 2000 == 0:
                logger.info("Extracted {} sentences.".format(sentences_count))

    logger.info("Finished extraction of {} sentences.".format(sentences_count))

    paragraphs_file.close()
    sentences_file.close()
Example #30
def get_pipeline(modelfile):
    print('\nLoading the model...', file=sys.stderr)
    if not os.path.isfile(modelfile):
        udpipe_model_url = 'https://rusvectores.org/static/models/udpipe_syntagrus.model'
        print('UDPipe model not found. Downloading...', file=sys.stderr)
        wget.download(udpipe_model_url, modelfile)  # save under the requested filename
    ufal_model = Model.load(modelfile)
    process_pipeline = Pipeline(ufal_model, 'tokenize', Pipeline.DEFAULT,
                                Pipeline.DEFAULT, 'conllu')
    return process_pipeline
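A usage sketch for the helper above (hypothetical driver; the output is CoNLL-U text):

    pipeline = get_pipeline('udpipe_syntagrus.model')
    print(pipeline.process('Мама мыла раму.'))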
Example #31
    def init(self):
        if self.model is None:
            self.model = Model.load(self._model_path)
            if not self.model:
                sys.stderr.write('Cannot load model from file "%s"\n' %
                                 self._model_path)
                raise IOError('Cannot load model from file "%s"' %
                              self._model_path)

            self.tagger = Pipeline.DEFAULT if self._enable_tagger else Pipeline.NONE
            self.parser = Pipeline.DEFAULT if self._enable_parser else Pipeline.NONE
            self.error = ProcessingError()
            self.converter_conll = ConverterConllUDV1()
Example #32
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--from_dir_prefix', type=str)
    parser.add_argument('-t', '--to_dir_prefix', default='/tmp/workers', type=str)
    parser.add_argument('-v', '--to_vocab_file', default='/tmp/workers/vocab.txt', type=str)
    parser.add_argument('-u', '--path2udp_model', default='./russian-syntagrus-ud-2.0-170801.udpipe', type=str)
    parser.add_argument('-n', '--cpu_n', default=5, type=int)
    parser.add_argument('-T', '--timeout_duration', default=40 * 60, type=int)  # 40 minutes
    args = parser.parse_args()

    os.makedirs(args.to_dir_prefix, exist_ok=True)

    model = Model.load(args.path2udp_model)
    udpipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'horizontal')

    files = list(glob(args.from_dir_prefix))

    word_counts = multipd.timeouted_run_pool(files, udpipeline, args.to_dir_prefix,
                                             cpu_n=args.cpu_n, timeout_duration=args.timeout_duration)
    word_count = sum(word_counts, collections.Counter())

    # Every word except the last gets a trailing newline; the last word is
    # written without one (the original comprehension covered all words and
    # then appended the last word a second time).
    vocab = ['%s\n' % word for word, _ in word_count.most_common()[:-1]]
    vocab.append(word_count.most_common()[-1][0])
    with open(args.to_vocab_file, 'wt') as f:
        f.writelines(vocab)

    freqs = ['{}\t{}\n'.format(word, freq) for word, freq in word_count.most_common()[:-1]]
    with open(args.to_vocab_file + '.freqs', 'wt') as f:
        f.writelines(freqs)