def process(self, doc):
    if self.config.get('pretokenized'):
        self.process_pre_tokenized_text(doc)
    else:
        # set up batches
        if self.config.get('lang') == 'vi':
            # special processing is due for Vietnamese
            text = '\n\n'.join([x for x in doc.text.split('\n\n')]).rstrip()
            dummy_labels = '\n\n'.join(['0' * len(x) for x in text.split('\n\n')])
            data = paras_to_chunks(text, dummy_labels)
            batches = DataLoader(self.config, input_data=data, vocab=self.vocab, evaluation=True)
        else:
            batches = DataLoader(self.config, input_text=doc.text, vocab=self.vocab, evaluation=True)
        # set up StringIO to get conllu data, run output predictions, set doc's conll file
        with io.StringIO() as conll_output_string:
            output_predictions(conll_output_string, self.trainer, batches, self.vocab, None,
                               self.config.get('max_seqlen', TokenizeProcessor.MAX_SEQ_LENGTH_DEFAULT))
            # set conll file for doc
            doc.conll_file = conll.CoNLLFile(input_str=conll_output_string.getvalue())
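# Usage sketch (an assumption, not part of this file): in the stanfordnlp 0.x
# pipeline API, the 'pretokenized' branch above is reached by constructing the
# pipeline with tokenize_pretokenized=True, which makes process() skip the
# neural tokenizer and call process_pre_tokenized_text() instead:
#
#   import stanfordnlp
#   nlp = stanfordnlp.Pipeline(processors='tokenize', lang='en',
#                              tokenize_pretokenized=True)
#   doc = nlp('This is a test .\nAnother sentence .')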
def load_file(self, filename, evaluation=False):
    conll_file = conll.CoNLLFile(filename)
    if evaluation:
        data = [[c] for c in conll_file.get_mwt_expansion_cands()]
    else:
        data = conll_file.get_mwt_expansions()
    return conll_file, data
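# Data-shape illustration (hedged, inferred from the calls above): for a
# CoNLL-U sentence containing a multi-word token range line such as
#
#   1-2  du  ...
#   1    de  ...
#   2    le  ...
#
# get_mwt_expansions() presumably yields training pairs like ('du', 'de le'),
# while get_mwt_expansion_cands() yields only the surface forms, which
# load_file() wraps as singleton lists ([['du'], ...]) for evaluation.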
def process_pre_tokenized_text(self, doc):
    """Generate CoNLL-U output for pretokenized text.

    Pretokenized text can be provided in two manners:
    1.) a str, tokenized by whitespace, with sentences split by newline
    2.) a list of token lists, where each token list represents a sentence
    """
    conllu_output_string = ""
    # TODO: This was added for input that is already in CoNLL-U format.
    # The conll_file attribute is added manually to the Document instance in that case.
    if doc.text is None:
        return
    if isinstance(doc.text, str):
        sentences = [sent.rstrip(' ').split() for sent in doc.text.rstrip('\n').split('\n') if sent]
    elif isinstance(doc.text, list):
        sentences = doc.text
    for sentence in sentences:
        for token_id, token in enumerate(sentence):
            conllu_data = ['_'] * conll.FIELD_NUM
            conllu_data[conll.FIELD_TO_IDX['id']] = str(token_id + 1)
            conllu_data[conll.FIELD_TO_IDX['word']] = token
            conllu_data[conll.FIELD_TO_IDX['head']] = str(token_id)
            conllu_output_string += '\t'.join(conllu_data) + '\n'
        conllu_output_string += '\n'
    doc.conll_file = conll.CoNLLFile(input_str=conllu_output_string)
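# Output illustration (assuming the standard 10-column CoNLL-U layout that
# conll.FIELD_NUM and conll.FIELD_TO_IDX encode, with id=0, word=1, head=6):
# for the pretokenized input [['Hello', 'world']] the function above emits
#
#   1    Hello    _    _    _    _    0    _    _    _
#   2    world    _    _    _    _    1    _    _    _
#
# i.e. only ID, FORM and a trivial HEAD chain are filled in; the remaining
# columns are left for the downstream processors to populate.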
def process(self, doc):
    batch = DataLoader(doc, self.config['batch_size'], self.config, vocab=self.vocab, evaluation=True)
    if len(batch) > 0:
        dict_preds = self.trainer.predict_dict(batch.conll.get_mwt_expansion_cands())
        # decide trainer type and run eval
        if self.config['dict_only']:
            preds = dict_preds
        else:
            preds = []
            for i, b in enumerate(batch):
                preds += self.trainer.predict(b)
            if self.config.get('ensemble_dict', False):
                preds = self.trainer.ensemble(batch.conll.get_mwt_expansion_cands(), preds)
    else:
        # skip eval if dev data does not exist
        preds = []
    with io.StringIO() as conll_with_mwt:
        batch.conll.write_conll_with_mwt_expansions(preds, conll_with_mwt)
        doc.conll_file = conll.CoNLLFile(input_str=conll_with_mwt.getvalue())
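# Round-trip illustration (hedged): if the candidate 'du' receives the
# predicted expansion 'de le', write_conll_with_mwt_expansions() re-emits the
# sentence with the original token as a range line followed by its expanded
# words, in line with the CoNLL-U multi-word token convention:
#
#   1-2  du  ...
#   1    de  ...
#   2    le  ...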
def process_pre_tokenized_text(self, doc):
    """Assume text is tokenized by whitespace, sentence split by newline, generate CoNLL-U output"""
    conllu_output_string = ""
    sentences = [sent for sent in doc.text.rstrip('\n').split('\n') if sent]
    for sentence in sentences:
        tokens = sentence.rstrip(' ').split(' ')
        for token_id, token in enumerate(tokens):
            conllu_data = ['_'] * conll.FIELD_NUM
            conllu_data[conll.FIELD_TO_IDX['id']] = str(token_id + 1)
            conllu_data[conll.FIELD_TO_IDX['word']] = token
            conllu_data[conll.FIELD_TO_IDX['head']] = str(token_id)
            conllu_output_string += '\t'.join(conllu_data) + '\n'
        conllu_output_string += '\n'
    doc.conll_file = conll.CoNLLFile(input_str=conllu_output_string)
def load_file(self, filename, evaluation=False):
    conll_file = conll.CoNLLFile(filename)
    data = conll_file.get(['word', 'upos', 'xpos', 'feats'], as_sentences=True)
    return conll_file, data
def load_file(self, filename):
    conll_file = conll.CoNLLFile(filename)
    data = conll_file.get(['word', 'xpos', 'lemma'])
    return conll_file, data
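# Usage sketch for the two load_file() variants above (instance names are
# hypothetical):
#
#   conll_file, tagger_data = tagger.load_file('dev.conllu')
#   # tagger_data: one [word, upos, xpos, feats] row per token, grouped into
#   # sentences because of as_sentences=True
#
#   conll_file, lemma_data = lemmatizer.load_file('dev.conllu')
#   # lemma_data: a flat list of [word, xpos, lemma] rows, one per token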
def main():
    args = parse_args()
    args = vars(args)
    print("Running UDPipe with module {}...".format(args['module']))

    # convert names
    short2tb = load_short2tb(args['short2tb'])
    tb_short = args['treebank']
    tb_full = short2tb[tb_short]
    lang_full = tb_full[3:].split('-')[0].lower()
    lang_short, tb_code = tb_short.split('_')

    # look for commands and models
    udpipe_script = '{}/bin-linux64/udpipe'.format(args['udpipe_dir'])
    model_name = '{}-{}-ud-2.2-conll18-180430.udpipe'.format(lang_full, tb_code)
    model_file = '{}/models/{}'.format(args['udpipe_dir'], model_name)
    if not os.path.exists(model_file):
        model_name = "mixed-ud-ud-2.2-conll18-180430.udpipe"
        model_file = '{}/models/{}'.format(args['udpipe_dir'], model_name)

    # check files
    if not args['output_file'].endswith('.conllu'):
        raise Exception("UDPipe module must write to conllu file.")

    if args['module'] == 'tokenize':
        # run tokenizer, ssplit and mwt expander at the same time
        if not args['input_file'].endswith('.txt'):
            raise Exception("UDPipe must take txt file as input when module == tokenize.")
        # run tokenizer from txt file
        udpipe_cmd = "{} --tokenize {} {} --outfile={} --output=conllu".format(
            udpipe_script, model_file, args['input_file'], args['output_file'])
        run_udpipe(udpipe_cmd)
        print("Waiting for filesystem...")
        time.sleep(5)
    else:
        if not args['input_file'].endswith('.conllu'):
            raise Exception("UDPipe must take conllu file as input when module != tokenize.")
        # first load the original input file
        input_conll = conll.CoNLLFile(args['input_file'])
        input_conll.load_all()
        # do udpipe
        if args['module'] == 'parse':
            udpipe_cmd = "{} --parse {} {} --output=conllu --input=conllu".format(
                udpipe_script, model_file, args['input_file'])
        else:
            udpipe_cmd = "{} --tag {} {} --output=conllu --input=conllu".format(
                udpipe_script, model_file, args['input_file'])
        udpipe_outputs = run_udpipe(udpipe_cmd, return_stdout=True)
        print("Waiting for filesystem...")
        time.sleep(5)
        # load conll back and merge with original conll
        udpipe_conll = conll.CoNLLFile(input_str=udpipe_outputs.decode())
        udpipe_conll.load_all()
        if args['module'] == 'lemma':
            fields = ['lemma']
        elif args['module'] == 'pos':
            fields = ['upos', 'xpos']
        elif args['module'] == 'ufeats':
            fields = ['feats']
        elif args['module'] == 'parse':
            fields = ['head', 'deprel', 'deps']
        else:
            raise Exception("Module {} not recognized.".format(args['module']))
        input_conll.set(fields, udpipe_conll.get(fields))  # set fields back
        # finally write to file
        input_conll.write_conll(args['output_file'])
        print("Waiting for filesystem...")
        time.sleep(5)

    print("All done running module {} with UDPipe.".format(args['module']))
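# A minimal sketch of the run_udpipe() helper assumed by main() above (the
# real helper is not shown in this section and may differ): it shells out to
# the UDPipe binary and optionally captures stdout as bytes, which matches the
# udpipe_outputs.decode() call above.
import subprocess

def run_udpipe(cmd, return_stdout=False):
    print("Running command: {}".format(cmd))
    # raise on a non-zero exit code; capture stdout only when requested
    result = subprocess.run(cmd, shell=True, check=True,
                            stdout=subprocess.PIPE if return_stdout else None)
    return result.stdout if return_stdout else None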
import stanfordnlp
from stanfordnlp.models.common import conll

# This must run only once...
stanfordnlp.download('hu_szeged')

nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos,lemma,depparse', lang="hu")  # Full pipeline
nlp1 = stanfordnlp.Pipeline(processors='tokenize,mwt', lang="hu")  # Part I.
nlp2 = stanfordnlp.Pipeline(processors='pos,lemma,depparse', lang="hu")  # Part II.

# Analyze a raw string
doc = nlp1('Kecském kucorog, macskám mocorog.')

# Print the result...
for i in range(len(doc.sentences)):
    doc.sentences[i].print_tokens()
conllu_format = doc.conll_file.conll_as_string()
print(conllu_format)  # CoNLL-U text output...

# Documentation: https://stanfordnlp.github.io/stanfordnlp/processors.html

# Read CoNLL-U at any stage...
doc = stanfordnlp.Document(None)
doc.conll_file = conll.CoNLLFile(input_str=conllu_format)

# Analyze further and print the result...
doc2 = nlp2(doc)
print(doc2.conll_file.conll_as_string())