Code example #1
File: nnclient.py Project: reynirf/Greynir
def request_segmented(cls,
                      sent_map,
                      src_lang=None,
                      tgt_lang=None,
                      verbatim=False):
    """ Translate presegmented sentences
        args:
            sent_map: either a list of sentences or a dict mapping keys to sentences"""
    data = dict(src_lang=src_lang, tgt_lang=tgt_lang)
    if isinstance(sent_map, dict):
        sents = ([
            tokenizer.correct_spaces(sent) for sent in sent_map.values()
        ] if not verbatim else list(sent_map.values()))
        result = TranslateClient._request(sents, data=data)
        inst_map = {
            idx: inst
            for (idx, inst) in zip(sent_map.keys(), result)
        }
        resp = dict(results=inst_map)
    else:
        sents = ([tokenizer.correct_spaces(sent)
                  for sent in sent_map] if not verbatim else sent_map)
        result = TranslateClient._request(sents, data=data)
        inst_map = {idx: inst for (idx, inst) in enumerate(result)}
        resp = dict(results=inst_map)
    return resp
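For context, a minimal usage sketch of the method above. It is assumed to be a classmethod of TranslateClient (consistent with the cls parameter and the TranslateClient._request call); the keys and language codes here are hypothetical.

# Hypothetical usage, not taken from the source project
resp = TranslateClient.request_segmented(
    {"a": "Góðan daginn !", "b": "Hvað segirðu ?"},
    src_lang="is", tgt_lang="en")
# resp["results"] is keyed by the caller's keys: {"a": ..., "b": ...}

resp = TranslateClient.request_segmented(
    ["Góðan daginn !"], src_lang="is", tgt_lang="en")
# resp["results"] is keyed by position: {0: ...}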
Code example #2
File: test_tokenizer.py Project: sverrirab/Tokenizer
def test_correct_spaces():
    s = t.correct_spaces(
        "Frétt \n  dagsins:Jón\t ,Friðgeir og Páll ! 100,8  /  2  =   50.4")
    assert s == 'Frétt dagsins: Jón, Friðgeir og Páll! 100,8/2 = 50.4'
    s = t.correct_spaces(
        "Hitinn    var\n-7,4 \t gráður en   álverðið var  \n $10,348.55.")
    assert s == 'Hitinn var -7,4 gráður en álverðið var $10,348.55.'
    s = t.correct_spaces(
        "\n Breytingin var   +4,10 þingmenn \t  en dollarinn er nú á €1,3455  ."
    )
    assert s == 'Breytingin var +4,10 þingmenn en dollarinn er nú á €1,3455.'
Code example #3
def test_correct_spaces():
    s = t.correct_spaces(
        "Frétt \n  dagsins:Jón\t ,Friðgeir og Páll ! 100,8  /  2  =   50.4")
    assert s == "Frétt dagsins: Jón, Friðgeir og Páll! 100,8/2 = 50.4"
    s = t.correct_spaces(
        "Hitinn    var\n-7,4 \t gráður en   álverðið var  \n $10,348.55.")
    assert s == "Hitinn var -7,4 gráður en álverðið var $10,348.55."
    s = t.correct_spaces(
        "\n Breytingin var   +4,10 þingmenn \t  en dollarinn er nú á €1,3455  ."
    )
    assert s == "Breytingin var +4,10 þingmenn en dollarinn er nú á €1,3455."
    s = t.correct_spaces("Jón- sem var formaður — mótmælti málinu.")
    assert s == "Jón-sem var formaður—mótmælti málinu."
    s = t.correct_spaces("Það á   að geyma mjólkina við  20 ±  3 °C")
    assert s == "Það á að geyma mjólkina við 20±3° C"
Code example #4
def index_text(text, correct_spaces: bool = False):
    """Segments contiguous (Icelandic) text into paragraphs and sentences
    and returns:
        dictionary of sentence indices to sentences
        dictionary of paragraph index to constituent sentence indices"""
    text = prep_text_for_tokenizer(text)
    tok_stream = bintokenizer.tokenize(text)

    pgs = tokenizer.paragraphs(tok_stream)
    pg_idx_to_sent_idx = dict()
    sent_idx_to_sent = dict()
    curr_sent_idx = 0
    curr_pg_idx = 0

    for pg in pgs:
        sent_idxs = []
        for _, sent in pg:
            curr_sent = list(filter(BIN_Token.is_understood, sent))
            curr_sent = tokenizer.normalized_text_from_tokens(curr_sent)
            if correct_spaces:
                curr_sent = tokenizer.correct_spaces(curr_sent)
            sent_idxs.append(curr_sent_idx)
            sent_idx_to_sent[curr_sent_idx] = curr_sent
            curr_sent_idx += 1
        pg_idx_to_sent_idx[curr_pg_idx] = sent_idxs
        curr_pg_idx += 1
    return pg_idx_to_sent_idx, sent_idx_to_sent
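An illustrative sketch of how the two returned mappings fit together; the sample text is hypothetical:

# Illustrative only: rebuild each paragraph from the two returned dicts
pg_map, sent_map = index_text("Halló heimur. Þetta er prófun.", correct_spaces=True)
for pg_idx, sent_idxs in pg_map.items():
    paragraph = " ".join(sent_map[i] for i in sent_idxs)
    print(pg_idx, paragraph)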
Code example #5
File: main.py Project: busla/Reynir
def query():
    """ Respond to a query string """

    q = request.form.get("q", "").strip()[0:_MAX_QUERY_LENGTH]
    # Auto-uppercasing can be turned off by sending autouppercase: false in the query JSON
    auto_uppercase = get_json_bool(request, "autouppercase", True)
    result = dict()

    with SessionContext(commit=True) as session:

        toklist = list(
            tokenize(q,
                     enclosing_session=session,
                     auto_uppercase=q.islower() if auto_uppercase else False))
        actual_q = correct_spaces(" ".join(t.txt or "" for t in toklist))

        if Settings.DEBUG:
            # Log the query string as seen by the parser
            print("Query is: '{0}'".format(actual_q))

        # Try to parse and process as a query
        is_query = process_query(session, toklist, result)

    result["is_query"] = is_query
    result["q"] = actual_q

    return jsonify(result=result)
Code example #6
File: main.py Project: busla/Reynir
def top_persons(limit=_TOP_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    toplist = dict()
    bindb = BIN_Db.get_db()

    with SessionContext(commit=True) as session:

        q = session.query(Person.name, Person.title, Person.article_url, Article.id) \
            .join(Article).join(Root) \
            .filter(Root.visible) \
            .order_by(desc(Article.timestamp))[0:limit * 2] # Go through up to 2 * N records

        for p in q:
            # Insert the name into the list if it's not already there,
            # or if the new title is longer than the previous one
            if p.name not in toplist or len(p.title) > len(toplist[p.name][0]):
                toplist[p.name] = (correct_spaces(p.title), p.article_url,
                                   p.id, bindb.lookup_name_gender(p.name))
                if len(toplist) >= limit:
                    # We now have as many names as we initially wanted: terminate the loop
                    break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted([
            dict(name=name, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
            for name, tu in toplist.items()
        ],
                      key=lambda x: strxfrm(x["name"]))
Code example #7
File: trigrams.py Project: thorunna/Greynir
def spin_trigram(first):
    # Note: 'session' is expected to be available from the enclosing scope
    t1 = t2 = ""
    candidates = first
    sent = ""
    while candidates:
        sumfreq = sum(freq for _, freq in candidates)
        r = randint(0, sumfreq - 1)
        for t3, freq in candidates:
            if r < freq:
                if not t3:
                    # End of sentence
                    candidates = []
                    break
                if sent:
                    sent += " " + t3
                else:
                    sent = t3
                t1, t2 = t2, t3
                q = session.execute(
                    "select t3, frequency from trigrams "
                    "where t1=:t1 and t2=:t2 order by frequency desc",
                    dict(t1=t1, t2=t2))
                candidates = q.fetchall()
                break
            r -= freq
    return correct_spaces(sent)
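The inner loop implements weighted random selection: a draw r in [0, sumfreq) is walked down the (item, frequency) list until it falls inside an item's band. The same technique in isolation, as a standalone sketch:

from random import randint

def weighted_choice(candidates):
    # Pick an item from (item, freq) pairs with probability
    # proportional to freq; assumes frequencies are positive ints
    r = randint(0, sum(freq for _, freq in candidates) - 1)
    for item, freq in candidates:
        if r < freq:
            return item
        r -= freq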
Code example #8
File: query.py Project: busla/Reynir
def append_answers(rd, q, prop_func):
    """ Iterate over query results and add them to the result dictionary rd """
    for p in q:
        s = correct_spaces(prop_func(p))
        ai = ArticleInfo(domain=p.domain,
                         uuid=p.id,
                         heading=p.heading,
                         ts=p.timestamp)
        rd[s][ai.uuid] = ai  # Add to a dict of UUIDs
Code example #9
def tidy_text(self):
    """ Return a [more] correctly spaced text representation of the sentence """
    if self.tree is None:
        # Not parsed (yet)
        txt = self.text
    else:
        # Use the terminal text representation - it's got fancy em/en-dashes and stuff
        txt = " ".join(t.text for t in self.terminals)
    return correct_spaces(txt)
Code example #10
def dump(tokens):
    print("\n{1}\n{0} tokens:\n".format(
        len(tokens),
        tokenizer.correct_spaces(" ".join(t.txt for t in tokens if t.txt))))
    for token in tokens:
        err = token.error_description
        if err:
            print("{0}".format(token.txt))
            print("   {0}: {1}".format(token.error_code, err))
Code example #11
File: main.py Project: haukurb/Reynir
def top_persons(limit=_TOP_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    toplist = dict()
    MAX_TITLE_LENGTH = 64

    with SessionContext(commit=True) as session:

        q = (
            session.query(Person.name, Person.title, Person.article_url,
                          Article.id).join(Article).join(Root).filter(
                              Root.visible).order_by(desc(Article.timestamp))
            [0:limit * 2]  # Go through up to 2 * N records
        )

        def is_better_title(new_title, old_title):
            len_new = len(new_title)
            len_old = len(old_title)
            if len_old >= MAX_TITLE_LENGTH:
                # Too long: we want a shorter one
                return len_new < len_old
            if len_new >= MAX_TITLE_LENGTH:
                # This one is too long: we don't want it
                return False
            # Otherwise, longer is better
            return len_new > len_old

        with BIN_Db.get_db() as bindb:
            for p in q:
                # Insert the name into the list if it's not already there,
                # or if the new title is better than the previous one
                if p.name not in toplist or is_better_title(
                        p.title, toplist[p.name][0]):
                    toplist[p.name] = (
                        correct_spaces(p.title),
                        p.article_url,
                        p.id,
                        bindb.lookup_name_gender(p.name),
                    )
                    if len(toplist) >= limit:
                        # We now have as many names as we initially wanted: terminate the loop
                        break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted(
            [
                dict(name=name,
                     title=tu[0],
                     gender=tu[3],
                     url=tu[1],
                     uuid=tu[2]) for name, tu in toplist.items()
            ],
            key=lambda x: strxfrm(x["name"]),
        )
Code example #12
File: query.py Project: haukurb/Reynir
def append_answers(rd, q, prop_func):
    """ Iterate over query results and add them to the result dictionary rd """
    for p in q:
        s = correct_spaces(prop_func(p))
        ai = dict(domain=p.domain,
                  uuid=p.id,
                  heading=p.heading,
                  timestamp=p.timestamp,
                  ts=p.timestamp.isoformat()[0:16],
                  url=p.url)
        rd[s][p.id] = ai  # Add to a dict of UUIDs
Code example #13
File: query.py Project: vthorsteinsson/Reynir
def append_answers(rd, q, prop_func):
    """ Iterate over query results and add them to the result dictionary rd """
    for p in q:
        s = correct_spaces(prop_func(p))
        ai = dict(
            domain=p.domain,
            uuid=p.id,
            heading=p.heading,
            timestamp=p.timestamp,
            ts=p.timestamp.isoformat()[0:16],
            url=p.url,
        )
        rd[s][p.id] = ai  # Add to a dict of UUIDs
Code example #14
File: main.py Project: haukurb/Reynir
def query_api(version=1):
    """ Respond to a query string """

    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")

    if request.method == "GET":
        q = request.args.get("q", "")
    else:
        q = request.form.get("q", "")
    q = q.strip()[0:_MAX_QUERY_LENGTH]

    # Auto-uppercasing can be turned off by sending autouppercase: false in the query JSON
    auto_uppercase = get_json_bool(request, "autouppercase", True)
    result = dict()
    ql = q.lower()

    if ql in _SPECIAL_QUERIES or (ql + "?") in _SPECIAL_QUERIES:
        result["valid"] = True
        result["qtype"] = "Special"
        result["q"] = q
        if ql in _SPECIAL_QUERIES:
            result["response"] = _SPECIAL_QUERIES[ql]
        else:
            result["response"] = _SPECIAL_QUERIES[ql + "?"]
    else:
        with SessionContext(commit=True) as session:

            toklist = list(
                tokenize_and_recognize(
                    q,
                    enclosing_session=session,
                    auto_uppercase=q.islower() if auto_uppercase else False,
                ))
            actual_q = correct_spaces(" ".join(t.txt or "" for t in toklist))

            if Settings.DEBUG:
                # Log the query string as seen by the parser
                print("Query is: '{0}'".format(actual_q))

            # Try to parse and process as a query
            is_query = process_query(session, toklist, result)

        result["valid"] = is_query
        result["q"] = actual_q

    return better_jsonify(**result)
Code example #15
File: query.py Project: haukurb/Reynir
def append_names(rd, q, prop_func):
    """ Iterate over query results and add them to the result dictionary rd,
        assuming that the key is a person name """
    for p in q:
        s = correct_spaces(prop_func(p))
        ai = dict(domain=p.domain,
                  uuid=p.id,
                  heading=p.heading,
                  timestamp=p.timestamp,
                  ts=p.timestamp.isoformat()[0:16],
                  url=p.url)
        # Obtain the key within rd that should be updated with new
        # data. This may be an existing key, a new key or None if no
        # update is to be performed.
        s = name_key_to_update(rd, s)
        if s is not None:
            rd[s][p.id] = ai  # Add to a dict of UUIDs
Code example #16
File: query.py Project: vthorsteinsson/Reynir
def append_names(rd, q, prop_func):
    """ Iterate over query results and add them to the result dictionary rd,
        assuming that the key is a person name """
    for p in q:
        s = correct_spaces(prop_func(p))
        ai = dict(
            domain=p.domain,
            uuid=p.id,
            heading=p.heading,
            timestamp=p.timestamp,
            ts=p.timestamp.isoformat()[0:16],
            url=p.url,
        )
        # Obtain the key within rd that should be updated with new
        # data. This may be an existing key, a new key or None if no
        # update is to be performed.
        s = name_key_to_update(rd, s)
        if s is not None:
            rd[s][p.id] = ai  # Add to a dict of UUIDs
Code example #17
File: wmt.py Project: thorunna/Greynir
def main():

    try:
        # Read configuration file
        Settings.read(os.path.join(basepath, "config", "GreynirSimple.conf"))
    except ConfigError as e:
        print("Configuration error: {0}".format(e))
        exit()

    with SessionContext(commit=False) as session:
        bef = datetime(2020, 7, 26, 0, 0, 1)
        aft = datetime(2020, 7, 27, 0, 0, 1)
        q = (session.query(
            Article.url, Article.timestamp, Article.heading,
            Article.tokens).filter(Article.timestamp > bef).filter(
                Article.timestamp < aft).order_by(Article.timestamp))
        items = list()
        for r in q.all():
            (url, ts, title, tokens) = r
            text = ""
            tokens = json.loads(tokens)
            if not tokens:
                continue
            # Paragraphs
            for p in tokens:
                # Sentences
                for s in p:
                    # Tokens
                    for t in s:
                        text += t["x"] + " "

            d = dict(url=url, timestamp=ts.isoformat(), title=title, text=text)
            d["text"] = correct_spaces(d["text"])
            items.append(d)
            # print(d)
            # print(text)
            # print("____________________________")

        print(json.dumps(items, ensure_ascii=False, sort_keys=True, indent=4))
Code example #18
File: query.py Project: vthorsteinsson/Reynir
def query_person_title(session, name):
    """ Return the most likely title for a person """
    rl = _query_person_titles(session, name)
    return correct_spaces(rl[0]["answer"]) if rl else ""
Code example #19
def tidy_text(self):
    """ Return a [more] correctly spaced text representation of the sentence """
    return correct_spaces(self.text)
Code example #20
def gen_to_string(g):
    return tokenizer.correct_spaces(" ".join(t.txt for t in g if t.txt))
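The join-then-correct_spaces pattern above recurs throughout these examples. A small sketch using the Tokenizer package directly; the expected output is inferred from the correct_spaces tests earlier on this page:

import tokenizer

toks = tokenizer.tokenize("Frétt \n  dagsins:Jón\t ,Friðgeir og Páll !")
print(gen_to_string(toks))
# Expected, per the tests above: "Frétt dagsins: Jón, Friðgeir og Páll!"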
Code example #21
def main() -> None:

    parser = argparse.ArgumentParser()
    parser.add_argument("infile",
                        nargs="?",
                        type=argparse.FileType("r"),
                        default=sys.stdin)
    parser.add_argument("--posfile", help="File with POS tags", required=False)
    parser.add_argument(
        "-w",
        "--word-spelling-error-rate",
        type=float,
        default=0.3,
        help="Error rate used for spelling of words.",
        required=False,
    )
    parser.add_argument("-r",
                        "--rule-chance-error-rate",
                        help="Chance for each rule to be applied",
                        default=0.9,
                        type=float)
    parser.add_argument("-p",
                        "--parse-online",
                        help="Parse sentence with Greynir if pos not provided",
                        type=bool,
                        default=True)
    parser.add_argument("--seed", default=1, type=int)
    parser.add_argument("-t", "--dont-detokenize", action="store_true")
    parser.add_argument("-n", "--nproc", default=1, type=int)
    parser.add_argument("-b", "--batch-size", default=1, type=int)
    args = parser.parse_args()

    error_generators = [
        DativitisErrorRule,
        MoodErrorRule,
        NounCaseErrorRule,
        SwapErrorRule,
        DuplicateWordsRule,
        SplitWordsRule,
        NoiseErrorRule,
        DeleteSpaceErrorRule,
    ]

    error_dataset = ErrorDataset(args.infile,
                                 args.posfile,
                                 args,
                                 error_generators=error_generators)

    error_loader = torch.utils.data.DataLoader(
        error_dataset,
        num_workers=args.nproc,
        worker_init_fn=worker_init_fn,
        batch_size=args.batch_size,
    )

    for error_batch in error_loader:
        for error_sentence in error_batch:
            if args.dont_detokenize:
                print(error_sentence)
            else:
                print(correct_spaces(error_sentence))
Code example #22
File: query.py Project: vthorsteinsson/Reynir
def query_entity_def(session, name):
    """ Return a single (best) definition of an entity """
    rl = _query_entity_definitions(session, name)
    return correct_spaces(rl[0]["answer"]) if rl else ""
Code example #23
File: query.py Project: busla/Reynir
def query_entity_def(session, name):
    """ Return a single (best) definition of an entity """
    rl = _query_entity_titles(session, name)
    return correct_spaces(rl[0][0]) if rl else ""
Code example #24
File: query.py Project: busla/Reynir
def query_person_title(session, name):
    """ Return the most likely title for a person """
    rl = _query_person_titles(session, name)
    return correct_spaces(rl[0][0]) if rl else ""
Code example #25
def normalize(ex):
    ice, eng = ex["is"], ex["en"]
    ice = tokenizer.correct_spaces(ice)
    eng = NLTK_DETOK.detokenize(eng.split(" "))
    return {"is": ice, "en": eng}
Code example #26
def test_correction():
    SENT = [
        (
            """Hann sagði: "Þú ert fífl"! Ég mótmælti því.""",
            """Hann sagði: „Þú ert fífl“! Ég mótmælti því.""",
        ),
        (
            """Hann sagði: Þú ert "fífl"! Ég mótmælti því.""",
            """Hann sagði: Þú ert „fífl“! Ég mótmælti því.""",
        ),
        (
            """Hann sagði: Þú ert «fífl»! Ég mótmælti því.""",
            """Hann sagði: Þú ert „fífl“! Ég mótmælti því.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 3ja sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 1sta sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu 2svar í bað.""",
        ),
        (
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
        ),
        (
            """Hann sagði: Þú ert ´fífl´! Hringdu í 7771234.""",
            """Hann sagði: Þú ert ‚fífl‘! Hringdu í 7771234."""
        ),
        (
            """Hann sagði: Þú ert (´fífl´)! Ég mótmælti því.""",
            """Hann sagði: Þú ert (´fífl‘)! Ég mótmælti því.""",  # !!!
        ),
        (
            """Hann "gaf" mér 10,780.65 dollara.""",
            """Hann „gaf“ mér 10,780.65 dollara."""
        ),
        (
            """Hann "gaf" mér €10,780.65.""",
            """Hann „gaf“ mér €10,780.65.""",
        ),
        (
            """Hann "gaf" mér €10.780,65.""",
            """Hann „gaf“ mér €10.780,65.""",
        ),
    ]
    SENT_KLUDGY_ORDINALS_MODIFY = [
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja herbergja íbúð.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í þriggja herbergja íbúð.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í fyrsta sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu tvisvar í bað.""",
        ),
        (
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
            """Ég keypti fjögurra herbergja íbúð á verði tveggja herbergja.""",
        ),
    ]
    SENT_KLUDGY_ORDINALS_TRANSLATE = [
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 3ja sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 1sta sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu 2svar í bað.""",
        ),
        (
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
        ),
    ]
    SENT_CONVERT_TELNOS = [
        (
            """Hann sagði: Þú ert ´fífl´! Hringdu í 7771234.""",
            """Hann sagði: Þú ert ‚fífl‘! Hringdu í 777-1234."""
        ),
        (
            """Hann sagði: Þú ert ´fífl´! Hringdu í 777 1234.""",
            """Hann sagði: Þú ert ‚fífl‘! Hringdu í 777-1234."""
        ),
    ]
    SENT_CONVERT_NUMBERS = [
        (
            """Hann "gaf" mér 10,780.65 dollara.""",
            """Hann „gaf“ mér 10.780,65 dollara."""
        ),
        (
            """Hann "gaf" mér €10,780.65.""",
            """Hann „gaf“ mér €10.780,65."""
        ),
        (
            """Hann "gaf" mér €10.780,65.""",
            """Hann „gaf“ mér €10.780,65.""",
        ),
    ]
    for sent, correct in SENT:
        s = t.tokenize(sent)
        txt = t.correct_spaces(" ".join(token.txt for token in s if token.txt))
        assert txt == correct
    for sent, correct in SENT_KLUDGY_ORDINALS_MODIFY:
        s = t.tokenize(sent, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_MODIFY)
        txt = t.correct_spaces(" ".join(token.txt for token in s if token.txt))
        assert txt == correct
    for sent, correct in SENT_KLUDGY_ORDINALS_TRANSLATE:
        s = t.tokenize(sent, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_TRANSLATE)
        txt = t.correct_spaces(" ".join(token.txt for token in s if token.txt))
        assert txt == correct
    for sent, correct in SENT_CONVERT_TELNOS:
        s = t.tokenize(sent, convert_telnos=True)
        txt = t.correct_spaces(" ".join(token.txt for token in s if token.txt))
        assert txt == correct
    for sent, correct in SENT_CONVERT_NUMBERS:
        s = t.tokenize(sent, convert_numbers=True)
        txt = t.correct_spaces(" ".join(token.txt for token in s if token.txt))
        assert txt == correct