Example #1
from tnparser.pipeline import read_pipelines, Pipeline

def load_parser(gpu=True):
    import types
    extra_args = types.SimpleNamespace()
    if gpu:
        extra_args.__dict__["udify_mod.device"] = "0"  # simulates someone giving a --device 0 parameter to Udify
        extra_args.__dict__["lemmatizer_mod.device"] = "0"
    available_pipelines = read_pipelines("models_fi_tdt_v2.7/pipelines.yaml")  # {pipeline_name -> its steps}
    turku_parser = Pipeline(available_pipelines["parse_plaintext"], extra_args)  # launch the pipeline from the steps
    return turku_parser
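
A minimal usage sketch (not part of the original snippet), assuming the models_fi_tdt_v2.7 directory is in place; the Finnish example sentence is illustrative only:

if __name__ == "__main__":
    parser = load_parser(gpu=False)              # CPU run for the sketch
    conllu = parser.parse("Minulla on koira.")   # Pipeline.parse returns CoNLL-U as a string
    print(conllu)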
Example #2
        pipeline = pipelines[args.action]

    if pipeline[0].startswith("extraoptions"):
        extraoptions = pipeline[0].split()[1:]
        pipeline.pop(0)
        newoptions = extraoptions + sys.argv[1:]
        print("Got extra arguments from the pipeline, now running with",
              newoptions,
              file=sys.stderr,
              flush=True)
        args = argparser.parse_args(newoptions)

    #args.__dict__["lemmatizer_mod.device"]=-1  # uncomment to force the lemmatizer onto CPU regardless of --device

    pipeline.append("output_mod")
    p = Pipeline(steps=pipeline, extra_args=args)

    print("Waiting for input", file=sys.stderr, flush=True)
    comment_regex = re.compile(r"^####?\s?C:")
    line_buffer = []
    for line in sys.stdin:
        line_buffer.append(line)
        # flush once the buffer exceeds --batch_lines, the current line is not a
        # metadata comment (###C:), we may cut here (empty line, or empty-line
        # batching is off), and the batch ends with real text
        if not comment_regex.match(line) and (
                line.strip() == "" or not args.empty_line_batching
        ) and len(line_buffer) > args.batch_lines and batch_endswith_text(
                line_buffer):
            if not p.is_alive():  #gotta end if something dies
                print("Something crashed. Exiting.",
                      file=sys.stderr,
                      flush=True)
                sys.exit(-1)
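
batch_endswith_text is referenced above but not shown in this fragment; the following is only a hedged guess at what it checks, not the actual implementation:

import re

def batch_endswith_text(line_buffer):
    # Guess at the helper's behaviour, for illustration only: the batch may be
    # cut when its last nonempty line is real text rather than a ###C: comment.
    comment_re = re.compile(r"^####?\s?C:")
    for line in reversed(line_buffer):
        if not line.strip():
            continue
        return not comment_re.match(line)
    return False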
Example #3
import sys
sys.path.append(
    "/home/jmnybl/git_checkout/Turku-neural-parser-pipeline-modularize")
from tnparser.pipeline import read_pipelines, Pipeline

ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)

# GPU
import types

extra_args = types.SimpleNamespace()
# simulates someone giving a --device 0 parameter to Udify
extra_args.__dict__["udify_mod.device"] = "0"
extra_args.__dict__["lemmatizer_mod.device"] = "0"

available_pipelines = read_pipelines("models_fi_tdt_v2.7/pipelines.yaml")  # {pipeline_name -> its steps}
p = Pipeline(available_pipelines["parse_plaintext"], extra_args)  # launch the pipeline from the steps


def parse(txt):
    txt_parsed = p.parse(txt)  # txt is expected to be a paragraph of plain text
    sents = []
    tokens = []
    lemmas = []
    txt_parsed = txt_parsed.split("\n\n")
    for sent_parsed in txt_parsed:
        lemma_sent = []
        for line in sent_parsed.split("\n"):
            line = line.strip()
            if not line:
                continue
Example #4
    general_group.add_argument('--host',default="localhost",help="Host on which to bind. Default %(default)s")
    general_group.add_argument('--max-char', default=0, type=int, help='Number of chars maximum in a job batch. Cuts longer. Zero for no limit. Default %(default)d')
    
    lemmatizer_group = argparser.add_argument_group(title='lemmatizer_mod', description='Lemmatizer arguments')
    lemmatizer_group.add_argument('--gpu', dest='lemmatizer_mod.gpu', type=int, default=0, help='GPU device id for the lemmatizer, if -1 use CPU')
    lemmatizer_group.add_argument('--batch_size', dest='lemmatizer_mod.batch_size', type=int, default=100, help='Lemmatizer batch size')

    args = argparser.parse_args()

    pipelines = read_pipelines(args.conf_yaml)

    if args.action=="list":
        print(sorted(pipelines.keys()),file=sys.stderr,flush=True)
        sys.exit(0)
    else:
        pipeline=pipelines[args.action]

    if pipeline[0].startswith("extraoptions"):
        extraoptions=pipeline[0].split()[1:]
        pipeline.pop(0)
        newoptions=extraoptions+sys.argv[1:]
        print("Got extra arguments from the pipeline, now running with", newoptions, file=sys.stderr, flush=True)
        args=argparser.parse_args(newoptions)

    p=Pipeline(steps=pipeline, extra_args=args)

    app.run(host=args.host,port=args.port,threaded=True,processes=1,use_reloader=False)
Example #5
#!/usr/bin/env python
import yaml
import os
import flask
import sys
from tnparser.pipeline import Pipeline, read_pipelines

app = flask.Flask(__name__)
model = os.environ.get("TNPP_MODEL", "models_fi_tdt/pipelines.yaml")
pipeline = os.environ.get("TNPP_PIPELINE", "parse_plaintext")
max_char = int(os.environ.get("TNPP_MAX_CHARS", 15000))
available_pipelines = read_pipelines(model)
p = Pipeline(available_pipelines[pipeline])


@app.route("/", methods=["GET"])
def parse_get():
    global p
    txt = flask.request.args.get("text")
    if not txt:
        return "You need to specify ?text=sometext", 400
    res = p.parse(txt)
    return flask.Response(res, mimetype="text/plain; charset=utf-8")


@app.route("/", methods=["POST"])
def parse_post():
    global p, max_char
    txt = flask.request.get_data(as_text=True)
    if max_char > 0:
        txt = txt[:max_char]
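
A client-side usage sketch for the service above (added here, not part of the original file); the host and port are assumptions, since the fragment does not show how the app is run:

import requests  # hypothetical client, not part of the server code

resp = requests.post("http://localhost:5000/",  # assumed Flask default host/port
                     data="Minulla on koira.".encode("utf-8"),
                     headers={"Content-Type": "text/plain; charset=utf-8"})
print(resp.text)  # CoNLL-U produced by the parse_plaintext pipeline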
Example #6
from tnparser.pipeline import read_pipelines, Pipeline
import json
import tqdm
import argparse
import sys

ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)

# GPU
import types
extra_args=types.SimpleNamespace()
extra_args.__dict__["udify_mod.device"]="0" #simulates someone giving a --device 0 parameter to Udify
extra_args.__dict__["lemmatizer_mod.device"]="0"

available_pipelines=read_pipelines("models_fi_tdt_v2.7/pipelines.yaml")        # {pipeline_name -> its steps}
turku_segmenter=Pipeline(available_pipelines["tokenize"])         # launch the pipeline from the steps

conllu_pipeline = available_pipelines["parse_conllu"]
if conllu_pipeline[0].startswith("extraoptions"):
    extraoptions=conllu_pipeline[0].split()[1:] # ['--empty-line-batching']
    conllu_pipeline.pop(0)
    extra_args.__dict__["empty_line_batching"]=True
turku_parser=Pipeline(conllu_pipeline, extra_args)
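
A short chaining sketch, added here and not part of the original file, assuming the intended flow is to tokenize plain text first and then run the resulting CoNLL-U through the parse_conllu pipeline:

def segment_and_parse(txt):
    conllu = turku_segmenter.parse(txt)  # plain text -> tokenized/segmented CoNLL-U
    return turku_parser.parse(conllu)    # tokenized CoNLL-U -> tagged and parsed CoNLL-U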



def read_conllu(txt):
    sent=[]
    comment=[]
    for line in txt.split("\n"):
        line=line.strip()
Example #7
from tnparser.pipeline import read_pipelines, Pipeline

text1 = "I have a dog! Let's see what I can do with Silo.ai. :) Can I tokenize it? I think so! Heading: This is the heading And here continues a new sentence and there's no dot."
text2 = "Some other text, to see we can tokenize more stuff without reloading the model... :)"

# What do we have for English in models_en_ewt?
available_pipelines = read_pipelines(
    "models_en_ewt/pipelines.yaml")  # {pipeline_name -> its steps}
p = Pipeline(
    available_pipelines["tokenize"])  # launch the pipeline from the steps

for _ in range(1000):
    print(p.parse(text1))
    print(p.parse(text2))
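
A follow-on sketch (not part of the original example) showing how the CoNLL-U string returned by p.parse() can be split into the column indices defined in the earlier examples:

ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)

def lemmas(conllu):
    """Collect lemmas from a CoNLL-U string produced by Pipeline.parse()."""
    result = []
    for line in conllu.split("\n"):
        line = line.strip()
        if not line or line.startswith("#"):
            continue  # skip sentence breaks and comment lines
        cols = line.split("\t")
        result.append(cols[LEMMA])
    return result

print(lemmas(p.parse(text1)))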