def main():
    """CLI entry point: run PurePOS POS-tagging over xtsv input."""
    argparser = parser_skeleton(description='PurePOSPy - a Python wrapper for PurePOS POS-tagger')
    opts = argparser.parse_args()

    jnius_config.classpath_show_warning = opts.verbose  # Suppress warning.

    # Inline text takes precedence over the input stream when given.
    source = opts.input_text if opts.input_text is not None else opts.input_stream
    sink = opts.output_stream

    # xtsv tool config: (module, class, friendly name, ctor args, field spec).
    em_tag = ('purepospy.purepospy', 'PurePOS', 'emTag (PurePOS)', (),
              {'source_fields': {'form', 'anas'},
               'target_fields': ['lemma', 'xpostag']})
    tools = [(em_tag, ('pos', 'emTag'))]
    used_tools = ['pos']
    presets = []

    # Run the pipeline on the input and stream the result out.
    sink.writelines(build_pipeline(source, used_tools, tools, presets, opts.conllu_comments))
def main():
    """CLI entry point: convert emMorph tags to magyarlanc 3.0 (UD) tags."""
    argparser = parser_skeleton(
        description='emmorph2ud - a script converts the output tag of emMorph morphological'
                    ' analyzer to the corresponding output tag of magyarlanc 3.0')
    opts = argparser.parse_args()

    jnius_config.classpath_show_warning = opts.verbose  # Suppress warning.

    # Inline text takes precedence over the input stream when given.
    source = opts.input_text if opts.input_text is not None else opts.input_stream
    sink = opts.output_stream

    # xtsv tool config: (module, class, friendly name, ctor args, field spec).
    em_morph2ud = ('emmorph2ud', 'EmMorph2UD', 'emmorph2ud', (),
                   {'source_fields': {'form', 'lemma', 'xpostag'},
                    'target_fields': ['upostag', 'feats']})
    tools = [(em_morph2ud, ('conv-morph', 'emmorph2ud'))]
    used_tools = ['emmorph2ud']
    presets = []

    # Run the pipeline on the input and stream the result out.
    sink.writelines(build_pipeline(source, used_tools, tools, presets, opts.conllu_comments))
def main():
    """CLI entry point: attach global and per-sentence metadata to emtsv output."""
    argparser = parser_skeleton(description='MMeta - a module which add global and per-sentence metadata to tokenized '
                                            'and lemmatized emtsv output file')
    opts = argparser.parse_args()

    # Inline text takes precedence over the input stream when given.
    source = opts.input_text if opts.input_text is not None else opts.input_stream
    sink = opts.output_stream

    # xtsv tool config: (module, class, friendly name, ctor args, field spec).
    m_meta = ('mmeta', 'MMeta', 'Add metadata', (),
              {'source_fields': {'form', 'lemma'},
               'target_fields': []})
    tools = [(m_meta, ('mmeta', 'mMeta'))]
    used_tools = ['mmeta']
    presets = []

    # NOTE(review): unlike most sibling wrappers, no conllu_comments argument
    # is passed to build_pipeline here — presumably relying on its default.
    sink.writelines(build_pipeline(source, used_tools, tools, presets))
def main():
    """CLI entry point: run the emDep UD dependency parser over xtsv input.

    Fixes over the previous revision: the --maxlen help text had a typo
    ('lenght'), and its metavar was 'FILE' — a copy-paste error for an
    integer-valued option — now shown as 'N'.
    """
    argparser = parser_skeleton(description='emDep - a dependency parser for UD')
    argparser.add_argument('--maxlen', dest='maxlen', type=int, required=False, default=None,
                           help='Specify the maximum sentence length to be parsed',
                           metavar='N')
    opts = argparser.parse_args()

    jnius_config.classpath_show_warning = opts.verbose  # Suppress warning.

    # Set input and output iterators (inline text wins over the stream)...
    if opts.input_text is not None:
        input_data = opts.input_text
    else:
        input_data = opts.input_stream
    output_iterator = opts.output_stream

    # Set the tagger name as in the tools dictionary
    used_tools = ['dep']
    presets = []

    # The relevant part of config.py: (module, class, friendly name, ctor args, field spec).
    em_depud = ('emdeppy', 'EmDepPy', 'emDep', (),
                {'maxlen': opts.maxlen,
                 'source_fields': {'form', 'lemma', 'upostag', 'feats'},
                 'target_fields': ['id', 'deprel', 'head']})
    tools = [(em_depud, ('dep', 'emDep-ud'))]

    # Run the pipeline on input and write result to the output...
    output_iterator.writelines(build_pipeline(input_data, used_tools, tools, presets, opts.conllu_comments))
def main():
    """CLI entry point: run the emCons constituent parser over xtsv input."""
    argparser = parser_skeleton(
        description='emCons - A wrapper implemented in Python for emCons'
                    ' (Berkeley parser a.k.a. Product Parser).')
    opts = argparser.parse_args()

    jnius_config.classpath_show_warning = opts.verbose  # Suppress warning.

    # Inline text takes precedence over the input stream when given.
    source = opts.input_text if opts.input_text is not None else opts.input_stream
    sink = opts.output_stream

    # xtsv tool config: (module, class, friendly name, ctor args, field spec).
    em_cons = ('emconspy.emconspy', 'EmConsPy', 'emCons', (),
               {'source_fields': {'form', 'lemma', 'xpostag'},
                'target_fields': ['cons']})
    tools = [(em_cons, ('cons', 'emCons'))]
    used_tools = ['cons']
    presets = []

    # Run the pipeline on the input and stream the result out.
    sink.writelines(build_pipeline(source, used_tools, tools, presets, opts.conllu_comments))
def main():
    """CLI entry point: convert emtsv output to CoNLL-U format."""
    argparser = parser_skeleton(
        description='emCoNLL - a script converting emtsv output to CoNLL-U format')
    add_bool_arg(argparser, 'print-header', 'Print header')
    add_bool_arg(argparser, 'force-id', 'Force writing ID field when it is not available')
    add_bool_arg(argparser, 'add-space-after-no', 'Add SpaceAfter=no to misc when wsafter field present')
    argparser.add_argument('--extra-columns', dest='extra_columns', type=str, default=None,
                           help='Add extra columns in key1:val1,key2:val2 format')
    opts = argparser.parse_args()

    # Parse key1:val1,key2:val2 into a dict; values may themselves contain ':'.
    if opts.extra_columns is not None:
        extra_columns = {key: val
                         for key, val in (item.split(':', maxsplit=1)
                                          for item in opts.extra_columns.split(','))}
    else:
        extra_columns = {}

    jnius_config.classpath_show_warning = opts.verbose  # Suppress warning.

    # Inline text takes precedence over the input stream when given.
    source = opts.input_text if opts.input_text is not None else opts.input_stream
    sink = opts.output_stream

    # xtsv tool config: (module, class, friendly name, ctor args, field spec).
    em_conll = ('emconll.converter', 'EmCoNLL', 'CoNLL-U converter',
                (opts.print_header, opts.force_id, opts.add_space_after_no, extra_columns),
                {'source_fields': {'form'},
                 'target_fields': []})
    tools = [(em_conll, ('conll', 'emCoNLL'))]
    used_tools = ['conll']
    presets = []

    # Run the pipeline on the input and stream the result out.
    sink.writelines(build_pipeline(source, used_tools, tools, presets, opts.conllu_comments))
def main():
    """CLI entry point: IOB-style span-notation conversion and correction."""
    argparser = parser_skeleton(description='emIOBUtils - an IOB style converter and corrector')
    argparser.add_argument('--input-field-name', required=True, metavar='FIELD-NAME',
                           help='The name of the input field to convert or correct')
    argparser.add_argument('--output-field-name', required=True, metavar='FIELD-NAME',
                           help='The name of the output field (must be unique)')
    argparser.add_argument('--output-style', required=True, metavar='STYLE',
                           help='The name of the output span notation style',
                           choices={'iob1', 'iob2', 'bio', 'ioe1', 'ioe2', 'io', 'sbieo', 'iobes',
                                    'iobe1', 'noprefix', 'bilou',
                                    'IOB1', 'IOB2', 'BIO', 'IOE1', 'IOE2', 'IO', 'SBIEO', 'IOBES',
                                    'IOBE1', 'NOPREFIX', 'BILOU'})
    opts = argparser.parse_args()
    # TODO: Add multiple modes...

    jnius_config.classpath_show_warning = opts.verbose  # Suppress warning.

    # Inline text takes precedence over the input stream when given.
    source = opts.input_text if opts.input_text is not None else opts.input_stream
    sink = opts.output_stream

    # xtsv tool config: (module, class, friendly name, ctor args, field spec).
    em_iobutils = ('emiobutils', 'EmIOBUtils', 'IOB style converter and corrector (EmIOBUtils)', (),
                   {'out_style': opts.output_style,
                    'source_fields': {opts.input_field_name},
                    'target_fields': [opts.output_field_name]})
    tools = [(em_iobutils, ('iobconv', 'emiobutils'))]
    used_tools = ['iobconv']
    presets = []

    # Run the pipeline on the input and stream the result out.
    sink.writelines(build_pipeline(source, used_tools, tools, presets, opts.conllu_comments))
def main():
    """CLI entry point: Hunspell spell-checking inside the xtsv framework."""
    argparser = parser_skeleton(description='Hunspell integrated with the xtsv framework')
    add_bool_arg(argparser, 'raw',
                 'Process tokens raw one token per line (without xtsv) incl. interactive mode')
    add_bool_arg(argparser, 'test', 'Run predefined test')
    opts = argparser.parse_args()

    # Short-circuit modes that bypass the xtsv pipeline entirely.
    if opts.test:
        test()
        exit()
    if opts.raw:
        raw_input_processor(opts.input_stream)
        exit()

    jnius_config.classpath_show_warning = opts.verbose  # Suppress warning.
    conll_comments = opts.conllu_comments

    # Inline text takes precedence over the input stream when given.
    source = opts.input_text if opts.input_text is not None else opts.input_stream
    sink = opts.output_stream

    # xtsv tool config: (module, class, friendly name, ctor args, field spec).
    hunspellpy = ('hunspellpy', 'HunspellPy', 'HunspellPy', (),
                  {'source_fields': {'form'},
                   'target_fields': ['spell', 'hunspell_anas']})
    tools = [(hunspellpy, ('spell', 'hunspell'))]
    used_tools = ['spell']
    presets = []

    # Run the pipeline on the input and stream the result out.
    sink.writelines(build_pipeline(source, used_tools, tools, presets, conll_comments))
def main():
    """CLI entry point: emMorph (Humor) morphological analysis via xtsv."""
    argparser = parser_skeleton(
        description='emMorphPy - A wrapper, a lemmatizer and REST API implemented in Python for'
                    ' emMorph (Humor) Hungarian morphological analyzer')
    add_bool_arg(argparser, 'raw',
                 'Process tokens raw one token per line (without xtsv) incl. interactive mode')
    opts = argparser.parse_args()

    # Raw mode bypasses the xtsv pipeline entirely.
    if opts.raw:
        raw_input_processor(opts.input_stream)
        exit()

    # Inline text takes precedence over the input stream when given.
    source = opts.input_text if opts.input_text is not None else opts.input_stream
    sink = opts.output_stream

    # xtsv tool config: (module, class, friendly name, ctor args, field spec).
    em_morph = ('emmorphpy', 'EmMorphPy', 'emMorph', (),
                {'source_fields': {'form'},
                 'target_fields': ['anas']})
    tools = [(em_morph, ('morph', 'emMorph'))]
    used_tools = ['morph']
    presets = []

    # Run the pipeline on the input and stream the result out.
    sink.writelines(build_pipeline(source, used_tools, tools, presets, opts.conllu_comments))
def main():
    """CLI entry point: mark zero pronouns in dependency-parsed text."""
    argparser = parser_skeleton(
        description='emZero - a module for marking zero pronouns'
                    ' in dependency parsed sentences')
    opts = argparser.parse_args()

    jnius_config.classpath_show_warning = opts.verbose  # Suppress warning.

    # Inline text takes precedence over the input stream when given.
    source = opts.input_text if opts.input_text is not None else opts.input_stream
    sink = opts.output_stream

    # xtsv tool config: (module, class, friendly name, ctor args, field spec).
    em_zero = ('emzero', 'EmZero',
               'Inserts zero pronouns (subjects, objects and possessors) '
               'into dependency parsed texts', (),
               {'source_fields': {'form', 'lemma', 'xpostag', 'upostag',
                                  'feats', 'id', 'head', 'deprel'},
                'target_fields': []})
    tools = [(em_zero, ('zero', 'emZero'))]
    used_tools = ['zero']
    presets = []

    # Run the pipeline on the input and stream the result out.
    sink.writelines(build_pipeline(source, used_tools, tools, presets, opts.conllu_comments))
def main():
    """CLI entry point: UDPipe tokenize+tag+parse as a single xtsv tool."""
    argparser = parser_skeleton(description='emUDPipe - An UDPipe wrapper for e-magyar (xtsv).')
    opts = argparser.parse_args()

    jnius_config.classpath_show_warning = opts.verbose  # Suppress warning.

    # Inline text takes precedence over the input stream when given.
    source = opts.input_text if opts.input_text is not None else opts.input_stream
    sink = opts.output_stream

    # xtsv tool config: (module, class, friendly name, ctor args, field spec).
    emudpipe_tok_parse = ('emudpipe.emudpipe', 'UDPipe',
                          'UDPipe tokenizer, POS tagger and dependency parser as a whole', (),
                          {'task': 'tok-parse',
                           'source_fields': set(),
                           'target_fields': ['form', 'lemma', 'upostag', 'feats',
                                             'head', 'deprel', 'deps']})
    tools = [(emudpipe_tok_parse, ('udpipe-tok-parse',))]
    used_tools = ['udpipe-tok-parse']
    presets = []

    # Run the pipeline on the input and stream the result out.
    sink.writelines(build_pipeline(source, used_tools, tools, presets, opts.conllu_comments))
def main():
    """CLI entry point: the EmDummy template module for xtsv."""
    argparser = parser_skeleton(description='EmDummy - a template module for xtsv')
    opts = argparser.parse_args()

    jnius_config.classpath_show_warning = opts.verbose  # Suppress warning.

    # Inline text takes precedence over the input stream when given.
    source = opts.input_text if opts.input_text is not None else opts.input_stream
    sink = opts.output_stream

    # xtsv tool config: (module, class, friendly name, ctor args, field spec).
    em_dummy = ('emdummy', 'EmDummy',
                'EXAMPLE (The friendly name of EmDummy used in REST API form)',
                ('Params', 'goes', 'here'),
                {'source_fields': {'form'},    # Source field names
                 'target_fields': ['star']})   # Target field names
    tools = [(em_dummy, ('dummy', 'dummy-tagger', 'emDummy'))]
    used_tools = ['dummy']
    presets = []

    # Run the pipeline on the input and stream the result out.
    sink.writelines(build_pipeline(source, used_tools, tools, presets, opts.conllu_comments))
def main():
    """CLI entry point for emTerm.

    Reads the corpus, calls the dictionary-building function to load the
    terms, hands the dictionary and the corpus to the corpus-processing
    tool, then writes the corpus out.
    """
    argparser = parser_skeleton(
        description='emTerm - a module for marking single word and multi-word units '
                    'in POS-tagged text')
    argparser.add_argument('--term-list', dest='term_list', type=FileType(), required=True,
                           help='Specify the terminology dictionary file', metavar='FILE')
    argparser.add_argument('--counter-marker', dest='counter_marker', type=str, default=':',
                           help='Specify counter marker separator (default: :)')
    argparser.add_argument('--termid-separator', dest='termid_separator', type=str, default='×',
                           help='Specify termid separator (default: ×)')
    argparser.add_argument('--term-separator', dest='term_separator', type=str, default=';',
                           help='Specify term separator (default: ;)')
    argparser.add_argument('--list-mwe-separator', dest='list_mwe_separator', type=str, default='@',
                           help='Specify list mwe separator (default: @)')
    argparser.add_argument('--placeholder', dest='placeholder', type=str, default='_',
                           help='Specify placeholder for empty fields (default: _)')
    opts = argparser.parse_args()

    jnius_config.classpath_show_warning = opts.verbose  # Suppress warning.

    # Inline text takes precedence over the input stream when given.
    source = opts.input_text if opts.input_text is not None else opts.input_stream
    sink = opts.output_stream

    # xtsv tool config: (module, class, friendly name, ctor args, field spec).
    em_term = ('emterm', 'EmTerm', 'Mark single word and multi-word units in POS-tagged text',
               (opts.term_list, opts.counter_marker, opts.termid_separator,
                opts.term_separator, opts.list_mwe_separator, opts.placeholder),
               {'source_fields': {'form', 'lemma'},
                'target_fields': ['term']})
    tools = [(em_term, ('term', 'emTerm'))]
    used_tools = ['term']
    presets = []

    # NOTE(review): no conllu_comments argument here, unlike most sibling
    # wrappers — presumably relying on build_pipeline's default.
    sink.writelines(build_pipeline(source, used_tools, tools, presets))
def main():
    """CLI entry point for HunTag3: dispatch on the requested task.

    Depending on options['task'] this either trains a transition model,
    trains/inspects a feature-based model, tags files in a directory,
    prints model weights, or runs the xtsv tagging pipeline.
    """
    argparser = parser_skeleton(description='HunTag3 - A sequential tagger for NLP combining'
                                            ' the Scikit-learn/LinearRegressionClassifier linear classifier'
                                            ' and Hidden Markov Models')
    opts = parse_args(argparser)

    jnius_config.classpath_show_warning = opts.verbose  # Suppress warning.

    # Set input and output iterators... (--text is rejected: HunTag3 only reads streams)
    if opts.input_text is not None:
        print('Sorry, --text is not available!', file=sys.stderr)
        sys.exit(1)
    else:
        input_data = opts.input_stream
    output_iterator = opts.output_stream

    options = vars(opts)  # Plain dict view of the parsed options, used below.

    # Set the tagger name as in the tools dictionary
    used_tools = ['huntag']
    presets = []

    # Init and run the module as it were in xtsv
    # The relevant part of config.py
    huntag_tagger = ('huntag', 'Tagger', 'HunTag3 (emNER, emChunk)', (options, ),
                     {'source_fields': set(),
                      'target_fields': [opts.label_tag_field]})
    tools = [(huntag_tagger, ('huntag', 'HunTag3'))]

    if options['task'] == 'transmodel-train':  # TRANSMODEL TRAIN
        trans_model = TransModel(source_fields={options['gold_tag_field']}, lmw=options['lmw'],
                                 order=options['transmodel_order'])
        # It's possible to train multiple times incrementally... (Just call process on different data, then compile())
        # Exhaust training process iterator...
        for _ in process(input_data, trans_model):
            pass
        # Close training, compute probabilities
        trans_model.compile()
        trans_model.save_to_file(options['transmodel_filename'])
    elif options['task'] in {'train', 'most-informative-features', 'train-featurize'}:  # TRAIN
        trainer = Trainer(options, source_fields={options['gold_tag_field']})
        # Exhaust training process iterator...
        for _ in process(input_data, trainer):
            pass
        trainer.cutoff_feats()
        if options['task'] == 'most-informative-features':
            trainer.most_informative_features(output_iterator)
        elif options['task'] == 'train-featurize':
            trainer.write_featurized_input(output_iterator)
        else:
            trainer.train()
            trainer.save()
    elif options['task'] in {'print-weights', 'tag-featurize'}:  # TAG (minus real tagging handled by xtsv)
        tagger = Tagger(options, target_fields=[options['label_tag_field']])
        if options['io_dirs'] is not None:  # Tag all files in a directory file to to filename.tagged
            inp_dir, out_dir = options['io_dirs']
            for fn in listdir(inp_dir):
                print('processing file {0}...'.format(fn), end='', file=sys.stderr, flush=True)
                with open(os_path_join(inp_dir, fn), encoding='UTF-8') as ifh, \
                        open(os_path_join(out_dir, '{0}.tagged'.format(fn)), 'w', encoding='UTF-8') as ofh:
                    ofh.writelines(process(ifh, tagger))
        elif options['task'] == 'print-weights':  # Print MaxEnt weights to output stream
            tagger.print_weights(output_iterator, options['num_weights'])
        else:  # options['task'] == tag
            # NOTE(review): this comment mentions 'tag' but the enclosing branch
            # set is {'print-weights', 'tag-featurize'} — verify whether 'tag'
            # should also be in that set (it is reachable here only as 'tag-featurize').
            # Tag a featurized or unfeaturized file or write the featurized format to to output_stream
            # Run the pipeline on input and write result to the output...
            output_iterator.writelines(
                build_pipeline(input_data, used_tools, tools, presets, opts.conllu_comments))
def main():
    """CLI entry point: run Stanza (emStanza) in the mode given by --task.

    Fixes over the previous revision: the --task help string was missing
    its closing parenthesis, and the membership test used a redundant
    ``.keys()`` call.
    """
    argparser = parser_skeleton(description='emStanza - Stanza fitted to xtsv')
    argparser.add_argument('--task', dest='emstanza_task', required=True,
                           help='Task to do (tok, pos, lem, parse, tok-pos, tok-parse, etc.)')
    opts = argparser.parse_args()

    jnius_config.classpath_show_warning = opts.verbose  # Suppress warning.

    # Set input and output iterators...
    if opts.input_text is not None:
        input_data = opts.input_text
    else:
        input_data = opts.input_stream
    output_iterator = opts.output_stream

    # Set the tagger name as in the tools dictionary
    used_tools = ['stanza']
    presets = []

    # Per-task xtsv field configuration (source/target fields per Stanza task).
    available_tasks = {
        'tok': {'task': 'tok', 'source_fields': set(),
                'target_fields': ['form', 'wsafter']},
        'tok-pos': {'task': 'tok-pos', 'source_fields': set(),
                    'target_fields': ['form', 'wsafter', 'feats', 'upostag', 'xpostag']},
        'tok-lem': {'task': 'tok-lem', 'source_fields': set(),
                    'target_fields': ['form', 'wsafter', 'feats', 'upostag', 'xpostag', 'lemma']},
        'tok-parse': {'task': 'tok-parse', 'source_fields': set(),
                      'target_fields': ['form', 'wsafter', 'feats', 'upostag', 'xpostag', 'lemma',
                                        'id', 'deprel', 'head']},
        'parse': {'task': 'parse',
                  'source_fields': {'form', 'lemma', 'upostag', 'feats'},
                  'target_fields': ['id', 'deprel', 'head']},
        'pos': {'task': 'pos', 'source_fields': {'form'},
                'target_fields': ['upostag', 'xpostag', 'feats']},
        'pos,lem': {'task': 'pos,lem', 'source_fields': {'form'},
                    'target_fields': ['upostag', 'xpostag', 'feats', 'lemma']},
    }
    if opts.emstanza_task not in available_tasks:
        raise ValueError(f'task parameter must be one of {available_tasks.keys()} !')

    # xtsv tool config: (module, class, friendly name, ctor args, field spec).
    emstanza = ('emstanza', 'EmStanza', 'Processing with Stanza', (),
                available_tasks[opts.emstanza_task])
    tools = [(emstanza, ('emstanza', 'stanza', 'emStanza'))]

    # Run the pipeline on input and write result to the output...
    output_iterator.writelines(
        build_pipeline(input_data, used_tools, tools, presets, opts.conllu_comments))
def main():
    """CLI entry point: phonetic transcription (emPhon) over xtsv input."""
    argparser = parser_skeleton(description='EmPhon - a phonetic transcriber module for xtsv')
    add_bool_arg(argparser, 'ipaize',
                 ('Whether the output should be IPA or the emPhon inner representation, '
                  'which marks one phone with exactly one letter.'),
                 default=True, has_negative_variant=True)
    add_bool_arg(argparser, 'opt-palatal-assim',
                 'Whether optional palatal assimilation should happen with t/d+ny, e.g. lapátnyél -> lapátynyél',
                 default=False, has_negative_variant=True)
    add_bool_arg(argparser, 'include-sentence',
                 'If on, there is a line of comment before the sentence that contains the entire surface form '
                 'of the sentence.',
                 default=True, has_negative_variant=True)
    opts = argparser.parse_args()

    jnius_config.classpath_show_warning = opts.verbose  # Suppress warning.

    # Inline text takes precedence over the input stream when given.
    source = opts.input_text if opts.input_text is not None else opts.input_stream
    sink = opts.output_stream

    used_tools = ['emphon']
    presets = []

    def _cfg(friendly_name, include_sentence, ipaize, opt_assim):
        # Build one xtsv tool-config tuple for EmPhon with the given options.
        return ('emphon', 'EmPhon', friendly_name, (),
                {'source_fields': {'form', 'anas'},
                 'target_fields': ['phon'],
                 'include_sentence': include_sentence,
                 'transcriber_opts': {'ipaize': ipaize,
                                      'optional_palatal_assimilation': opt_assim}})

    # The active tool follows the command-line switches...
    emphon = _cfg('EmPhon', opts.include_sentence, opts.ipaize, opts.opt_palatal_assim)
    # ...and four fixed preset variants cover every switch combination.
    emphon_ipa_comments = _cfg('emPhon phonetic transcriber with IPAization and with comment lines',
                               True, True, False)
    emphon_noipa_comments = _cfg('emPhon phonetic transcriber without IPAization but with comment lines',
                                 True, False, False)
    emphon_ipa_nocomments = _cfg('emPhon phonetic transcriber with IPAization but without comment lines',
                                 False, True, False)
    emphon_noipa_nocomments = _cfg('emPhon phonetic transcriber without IPAization and comment lines',
                                   False, False, False)

    tools = [(emphon, ('emphon', 'emPhon phonetic transcriber ', 'emPhon'))]
    # NOTE(review): available_tools is built but never used below — presumably
    # kept for registration elsewhere; verify before removing.
    available_tools = [
        (emphon_ipa_comments,
         ('emphon-ipa-comments', 'emPhon-ipa-comments', 'emPhon-IPA-comments')),
        (emphon_ipa_nocomments,
         ('emphon-ipa-nocomments', 'emPhon-ipa-nocomments', 'emPhon-IPA-nocomments')),
        (emphon_noipa_comments,
         ('emphon-noipa-comments', 'emPhon-noipa-comments', 'emPhon-noIPA-comments')),
        (emphon_noipa_nocomments,
         ('emphon-noipa-nocomments', 'emPhon-noipa-nocomments', 'emPhon-noIPA-nocomments'))]

    # Run the pipeline on the input and stream the result out.
    sink.writelines(build_pipeline(source, used_tools, tools, presets, opts.conllu_comments))