# Constructor of a PipelineDefault-style wrapper (cf. the PipelineDefault class
# below): remote morphology -> UD conversion -> remote syntax -> remote RST.
def __init__(self, address_morph, address_syntax, address_rst):
    self._ppl = PipelineCommon([
        (ProcessorRemote(address_morph[0], address_morph[1], 'default'),
         ['text'],
         {'sentences': 'sentences',
          'tokens': 'tokens',
          'postag': 'postag',
          'lemma': 'lemma'}),
        (ConverterMystemToUd(),
         ['postag'],
         {'morph': 'morph',
          'postag': 'postag'}),
        (ProcessorRemote(address_syntax[0], address_syntax[1], '0'),
         ['tokens', 'sentences'],
         {'syntax_dep_tree': 'syntax_dep_tree',
          'postag': 'ud_postag'}),
        (ProcessorRemote(address_rst[0], address_rst[1], 'default'),
         ['text', 'tokens', 'sentences', 'lemma', 'morph', 'postag',
          'syntax_dep_tree'],
         {'rst': 'rst'})
    ])
    self._name = 'default'
class PipelineDefault:
    """End-to-end pipeline: Mystem morphology -> SyntaxNet parsing ->
    UD tag conversion -> remote SRL."""

    def __init__(self, address_morph, address_syntax, address_srl):
        self._ppl = PipelineCommon([
            (ProcessorRemote(address_morph[0], address_morph[1], 'default'),
             ['text'],
             {'tokens': 'tokens',
              'sentences': 'sentences',
              'postag': 'mystem_postag',
              'lemma': 'lemma'}),
            (ProcessorSyntaxNetRemote(address_syntax[0], address_syntax[1]),
             ['tokens', 'sentences'],
             {'syntax_dep_tree': 'syntax_dep_tree'}),
            (ConverterMystemToUd(),
             ['mystem_postag'],
             {'morph': 'morph',
              'postag': 'postag'}),
            (ProcessorRemote(address_srl[0], address_srl[1], 'default'),
             ['tokens', 'postag', 'morph', 'lemma', 'syntax_dep_tree'],
             {'srl': 'srl'})
        ])
        self._name = 'default'

    def __call__(self, *args, **kwargs):
        return self._ppl(*args, **kwargs)

    def get_processors(self):
        return self._ppl.get_processors()
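# Minimal usage sketch; the three (host, port) pairs below are assumptions —
# point them at running morphology, SyntaxNet, and SRL services.
ppl = PipelineDefault(('localhost', 3333),
                      ('localhost', 3334),
                      ('localhost', 3335))
annotations = ppl('Мама мыла раму.')
print(annotations['srl'])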
def __init__(self,
             basic_processor=('vmh1.isa.ru', 3344),
             udpipe_processor=('vmh1.isa.ru', 3355)):
    self.ppl = WrapperMultiProcessDocument([
        PipelineCommon([
            (ProcessorRemote(basic_processor[0], basic_processor[1], 'default'),
             ['text'],
             {'sentences': 'sentences',
              'tokens': 'tokens',
              'postag': 'mystem_postags',
              'lemma': 'lemma'}),
            (ProcessorRemote(udpipe_processor[0], udpipe_processor[1], '0'),
             ['tokens', 'sentences'],
             {'syntax_dep_tree': 'syntax_dep_tree'}),
            (ConverterMystemToUd(),
             ['mystem_postags'],
             {'morph': 'postag'})
        ])
    ])
def get_tree(text):
    from isanlp import PipelineCommon
    from isanlp.processor_remote import ProcessorRemote
    from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd
    from Parser.some_reparser import extract_semantic_relations

    HOST = 'localhost'
    proc_morph = ProcessorRemote(HOST, 3333, 'default')
    proc_syntax = ProcessorRemote(HOST, 3334, '0')
    syntax_ppl = PipelineCommon([
        (proc_morph, ['text'],
         {'tokens': 'tokens',
          'sentences': 'sentences',
          'postag': 'postag',
          'lemma': 'lemma'}),
        (proc_syntax, ['tokens', 'sentences'],
         {'syntax_dep_tree': 'syntax_dep_tree'}),
        (ConverterMystemToUd(), ['postag'],
         {'postag': 'postag',
          'morph': 'morph'})
    ])

    try:
        analysis_res = syntax_ppl(text)
    except Exception:  # the remote processors may be unavailable
        return None

    # Collect token strings per sentence.
    sentences = []
    for sent in analysis_res['sentences']:
        sentences.append([analysis_res['tokens'][j].text
                          for j in range(sent.begin, sent.end)])

    vertices_list_list = []
    relations = extract_semantic_relations(text)
    for j in range(len(analysis_res['lemma'])):
        vertices_list = []
        # Offset of this sentence's first token in the flat token list;
        # indexing 'tokens' by the intra-sentence position alone would be
        # wrong for every sentence after the first.
        offset = analysis_res['sentences'][j].begin
        for i in range(len(analysis_res['lemma'][j])):
            token = analysis_res['tokens'][offset + i]
            start, end = token.begin, token.end
            # Attach semantic role labels whose child span matches the token.
            role_vert = [rel['tp'] for rel in relations
                         if rel['child']['start'] == start
                         and rel['child']['end'] == end]
            vert = tree(word(analysis_res['lemma'][j][i],
                             analysis_res['postag'][j][i],
                             analysis_res['morph'][j][i],
                             start, end, i, role=role_vert))
            vertices_list.append(vert)
        vertices_list_list.append(vertices_list)

    # Link vertices into dependency trees; collect the root of each sentence.
    root_list = []
    for i in range(len(vertices_list_list)):
        list_ = vertices_list_list[i]
        for j in range(len(analysis_res['syntax_dep_tree'][i])):
            edge = analysis_res['syntax_dep_tree'][i][j]
            if edge.parent != -1:
                list_[edge.parent].add_child(list_[j], edge.link_name)
            else:
                list_[j].sentence = sentences[i]
                root_list.append(list_[j])
    return root_list
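# Hypothetical usage of get_tree (assumes the morphology and syntax services
# are running on localhost:3333 and localhost:3334):
roots = get_tree('Мама мыла раму. Рама была чистой.')
if roots is not None:
    for root in roots:
        print(root.sentence)  # token strings of the sentence rooted here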
def create_pipeline(delay_init=False):
    return PipelineCommon(
        [(ProcessorGramEval2020('models/ru_bert_final_model'),
          ['tokens', 'sentences'],
          {'lemma': 'lemma',
           'postag': 'postag',
           'morph': 'morph',
           'syntax_dep_tree': 'syntax_dep_tree'})],
        name='default')
def create_pipeline(delay_init=False):
    return PipelineCommon(
        # UDPipe consumes the raw text as pipeline input.
        [(ProcessorUDPipe('/src/parser_UDPIPE/russian-ud-2.0-170801.udpipe'),
          ['text'],
          {'tokens': 'tokens',
           'sentences': 'sentences',
           'lemma': 'lemma',
           'postag': 'postag',
           'morph': 'morph',
           'syntax_dep_tree': 'syntax_dep_tree'})],
        name='default')
def __init__(
    self,
    udpipe=("tsa05.isa.ru", 3334),
    rst=("papertext.ru", 5555),
    cache_path="./rst-cache.pkl",
):
    udpipe_host, udpipe_port = udpipe
    rst_host, rst_port = rst
    self.cache_path = cache_path
    self.ppl = PipelineCommon([
        (ProcessorRemote(udpipe_host, udpipe_port, "0"),
         ["text"],
         {"sentences": "sentences",
          "tokens": "tokens",
          "lemma": "lemma",
          "syntax_dep_tree": "syntax_dep_tree",
          "postag": "ud_postag"}),
        (ProcessorMystem(delay_init=False),
         ["tokens", "sentences"],
         {"postag": "postag"}),
        (ConverterMystemToUd(),
         ["postag"],
         {"morph": "morph",
          "postag": "postag"}),
        (ProcessorRemote(rst_host, rst_port, "default"),
         ["text", "tokens", "sentences", "postag", "morph", "lemma",
          "syntax_dep_tree"],
         {"clauses": "clauses"}),
    ])
    # On-disk cache of analysis results, keyed by a 32-bit CityHash of the
    # text (city_32 comes from, e.g., pyhash; jb is joblib).
    self.__cache = {}
    self.__hasher = city_32()
    if os.path.exists(self.cache_path):
        self.__cache = jb.load(self.cache_path)
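# A hypothetical companion method sketching how the cache above could be used;
# the original class does not show its __call__, so this is an assumption.
def __call__(self, text):
    key = self.__hasher(text)  # 32-bit CityHash of the raw text
    if key not in self.__cache:
        self.__cache[key] = self.ppl(text)
        jb.dump(self.__cache, self.cache_path)  # persist across runs
    return self.__cache[key]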
def create_pipeline(delay_init):
    pipeline_default = PipelineCommon(
        [(ProcessorRST(model_dir_path='/models',
                       segmenter_type='lstm',
                       span_predictor_type='ensemble',
                       label_predictor_type='ensemble'),
          ['text', 'tokens', 'sentences', 'lemma', 'morph', 'postag',
           'syntax_dep_tree'],
          {0: 'rst'})],
        name='default')
    return pipeline_default
def __init__(self, ud_model_path: str, lru_cache_size: int = 10000):
    # ud_model_path, e.g. './data/models/russian-syntagrus-ud-2.5-191206.udpipe'
    self.pipeline = PipelineCommon([
        (ProcessorUDPipe(ud_model_path),
         ['text'],
         {"tokens": "tokens",
          "lemma": "lemma",
          "postag": "postag",
          "morph": "morph",
          "syntax_dep_tree": "syntax_dep_tree"})
    ])
    self.predicate_extractor = PredicateExtractor()
    self.argument_extractor = ArgumentExtractor()
    # Memoize repeated analyses of the same text.
    self.call_pipeline = lru_cache(lru_cache_size)(self.pipeline.__call__)
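# Sketch of the memoization effect (functools.lru_cache is standard library):
# repeated calls with the same text are served from the cache instead of
# re-running UDPipe. 'Extractor' stands in for the unnamed class above.
# extractor = Extractor('./data/models/russian-syntagrus-ud-2.5-191206.udpipe')
# extractor.call_pipeline('Мама мыла раму.')
# extractor.call_pipeline('Мама мыла раму.')   # cache hit
# print(extractor.call_pipeline.cache_info())  # hits=1, misses=1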
def create_pipeline(delay_init):
    model_path = '/src/bert-base-srl-2019.06.17.tar.gz'
    tokenizer = ProcessorTokenizerNltkEn()
    splitter = ProcessorSentenceSplitter()
    srl_proc = ProcessorSrlAllennlp(model_path)
    pipeline_default = PipelineCommon(
        [(tokenizer, ['text'], {0: 'tokens'}),
         (splitter, ['tokens'], {0: 'sentences'}),
         (srl_proc, ['tokens', 'sentences'], {0: 'srl'})],
        name='default')
    return pipeline_default
def get_tree(text):
    HOST = 'localhost'
    proc_morph = ProcessorRemote(HOST, 3333, 'default')
    proc_syntax = ProcessorRemote(HOST, 3334, '0')
    syntax_ppl = PipelineCommon([
        (proc_morph, ['text'],
         {'tokens': 'tokens',
          'sentences': 'sentences',
          'postag': 'postag',
          'lemma': 'lemma'}),
        (proc_syntax, ['tokens', 'sentences'],
         {'syntax_dep_tree': 'syntax_dep_tree'}),
        (ConverterMystemToUd(), ['postag'],
         {'postag': 'postag',
          'morph': 'morph'})
    ])
    analysis_res = syntax_ppl(text)

    # Collect token strings per sentence.
    sentences = []
    for sent in analysis_res['sentences']:
        sentences.append([analysis_res['tokens'][j].text
                          for j in range(sent.begin, sent.end)])

    # One tree vertex per token.
    vertices_list_list = []
    for j in range(len(analysis_res['lemma'])):
        vertices_list = []
        for i in range(len(analysis_res['lemma'][j])):
            vert = tree(word(analysis_res['lemma'][j][i],
                             analysis_res['postag'][j][i],
                             analysis_res['morph'][j][i],
                             i))
            vertices_list.append(vert)
        vertices_list_list.append(vertices_list)

    # Link vertices into dependency trees; collect the root of each sentence.
    root_list = []
    for i in range(len(vertices_list_list)):
        list_ = vertices_list_list[i]
        for j in range(len(analysis_res['syntax_dep_tree'][i])):
            edge = analysis_res['syntax_dep_tree'][i][j]
            if edge.parent != -1:
                list_[edge.parent].add_child(list_[j], edge.link_name)
            else:
                list_[j].sentence = sentences[i]
                root_list.append(list_[j])
    return root_list
def prepare_compounds(compounds_path):
    # Lemmatization pipeline: tokenize -> split sentences -> Mystem lemmas.
    ppl = PipelineCommon([
        (ProcessorTokenizerRu(), ['text'], {0: 'tokens'}),
        (ProcessorSentenceSplitter(), ['tokens'], {0: 'sentences'}),
        (ProcessorMystem(), ['tokens', 'sentences'], {'lemma': 'lemma'})
    ])
    df_compounds = pd.read_csv(compounds_path)
    compound_set = set()
    for i in df_compounds.index:
        # The CSV stores the two components in the columns
        # 'Часть 1' / 'Часть 2' ('Part 1' / 'Part 2').
        compound = '{} {}'.format(df_compounds.loc[i, 'Часть 1'],
                                  df_compounds.loc[i, 'Часть 2'])
        lemmas = ppl(compound)['lemma'][0]
        compound_set.add('{}_{}'.format(lemmas[0], lemmas[1]))
    return compound_set
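# Hypothetical usage: build the compound set once, then test membership of a
# lemmatized pair (the CSV path is an assumption).
compounds = prepare_compounds('data/compounds.csv')
print('железный_дорога' in compounds)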
def acquiring(comp, model, true_label, model_words=None,
              skip_invalid_labels=True):
    # Lemmatize the compound parts with Mystem before the embedding lookup.
    ppl = PipelineCommon([
        (ProcessorTokenizerRu(), ['text'], {0: 'tokens'}),
        (ProcessorSentenceSplitter(), ['tokens'], {0: 'sentences'}),
        (ProcessorMystem(), ['tokens', 'sentences'], {'lemma': 'lemma'})
    ])
    v_w1 = []
    v_w2 = []
    v_comp = []
    true_class = []
    if model_words is None:
        model_words = model
    indexes = []
    for i in comp.index:
        label = comp.loc[i, true_label]
        if skip_invalid_labels and label not in {0., 1.}:
            continue
        anns = ppl('{} {}'.format(comp.loc[i, 'Часть 1'],
                                  comp.loc[i, 'Часть 2']))['lemma'][0]
        try:
            vec_w1 = model_words[anns[0]]
            vec_w2 = model_words[anns[1]]
            vec_comp = model['{}_{}'.format(anns[0], anns[1])]
            indexes.append(i)
        except KeyError:
            # Skip compounds whose parts are missing from the embedding model.
            continue
        v_w1.append(vec_w1)
        v_w2.append(vec_w2)
        v_comp.append(vec_comp)
        true_class.append(label)
    print('Number of examples: ', len(v_w1))
    return (np.array(v_w1), np.array(v_w2), np.array(v_comp),
            np.array(true_class), comp.loc[indexes])
import os

from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote

host = 'localhost'
port_morph = int(os.environ['TEST_MORPH_PORT'])
port_srl = int(os.environ['TEST_SRL_PORT'])
text_path = os.environ['TEST_EN_PATH']

with open(text_path, encoding='utf8') as f:
    text = f.read()

ppl = PipelineCommon([
    (ProcessorRemote(host=host, port=port_morph, pipeline_name='default'),
     ['text'],
     {'tokens': 'tokens',
      'sentences': 'sentences',
      'lemma': 'lemma',
      'postag': 'postag'}),
    (ProcessorRemote(host=host, port=port_srl, pipeline_name='default'),
     ['tokens', 'sentences'],
     {'srl': 'srl'})
])
annotations = ppl(text)
import os

from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.processor_syntaxnet_remote import ProcessorSyntaxNetRemote
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd

# Service ports, read from the environment as in the English test above;
# the exact key for the syntax port is an assumption.
port_morph = int(os.environ['TEST_MORPH_PORT'])
port_syntax = int(os.environ['TEST_SYNTAX_PORT'])
port_srl = int(os.environ['TEST_SRL_PORT'])

text_path = os.environ['TEST_PATH']
with open(text_path, encoding='utf8') as f:
    text = f.read()

ppl = PipelineCommon([
    (ProcessorRemote(host='localhost', port=port_morph,
                     pipeline_name='default'),
     ['text'],
     {'tokens': 'tokens',
      'sentences': 'sentences',
      'lemma': 'lemma',
      'postag': 'mystem_postag'}),
    (ConverterMystemToUd(), ['mystem_postag'],
     {'morph': 'morph',
      'postag': 'postag'}),
    (ProcessorSyntaxNetRemote(host='localhost', port=port_syntax),
     ['tokens', 'sentences'],
     {'syntax_dep_tree': 'syntax_dep_tree'}),
    (ProcessorRemote(host='localhost', port=port_srl,
                     pipeline_name='default'),
     ['postag', 'morph', 'lemma', 'syntax_dep_tree'],
     {'srl': 'srl'})
])
annotations = ppl(text)
from isanlp import PipelineCommon
from isanlp_srl_framebank.processor_srl_framebank import ProcessorSrlFramebank

PPL_SRL_FRAMEBANK = PipelineCommon(
    [(ProcessorSrlFramebank('/models',
                            enable_model_for_unknown_predicates=True,
                            known_preds_embeddings_type='elmo',
                            unknown_preds_embeddings_type='elmo',
                            threshold=0.6),
      ['tokens', 'postag', 'morph', 'lemma', 'syntax_dep_tree'],
      {0: 'srl'})],
    name='default')
# Method of ProcessorDeepSrlWrapper (self._process_json is defined elsewhere
# in the class); CSentence, Event, and TaggedSpan come from
# isanlp.annotation_repr / isanlp.annotation.
def __call__(self, tokens, sentences):
    sys.stderr.write('Processing input...\n')
    sys.stderr.flush()
    input_data = [[word.text for word in CSentence(tokens, sent)]
                  for sent in sentences]
    result_str = self._process_json(json.dumps(input_data))
    result_json = json.loads(result_str)
    result = []
    for sent in result_json:
        result_sent = []
        for pred_arg in sent:
            # Keep all arguments except the predicate verb itself ('V').
            result_sent.append(
                Event(pred=(pred_arg[0], pred_arg[0]),
                      args=[TaggedSpan(arg[0], arg[1], arg[2])
                            for arg in pred_arg[1] if arg[0] != 'V']))
        result.append(result_sent)
    return result


DEEP_SRL = PipelineCommon(
    [(ProcessorDeepSrlWrapper(
        "/src/deep_srl/resources/conll05_propid_model",
        "/src/deep_srl/resources/conll05_model"),
      ['tokens', 'sentences'],
      {0: 'srl'})],
    name='default')
from isanlp import PipelineCommon
from isanlp_srl_framebank.processor_srl_framebank import ProcessorSrlFramebank

PPL_SRL_FRAMEBANK = PipelineCommon(
    [(ProcessorSrlFramebank('/models'),
      ['postag', 'morph', 'lemma', 'syntax_dep_tree'],
      {0: 'srl'})],
    name='default')
from isanlp import PipelineCommon
from processor_rst import ProcessorRST

PPL_RST = PipelineCommon(
    [(ProcessorRST('/models'),
      ['text', 'tokens', 'sentences', 'postag', 'morph', 'lemma',
       'syntax_dep_tree'],
      {0: 'rst'})],
    name='default')