Beispiel #1
0
 def try_to_help(cls, query, result):
     """ On a failed query, attempt to provide a helpful answer
         based on lemmas occurring in the query string """
     # Gather the lemmas of all alphabetic tokens in the query
     lemmas = set()
     with BIN_Db.get_db() as db:
         for tok in query.lower().split():
             if not tok.isalpha():
                 continue
             # Fall back to an uppercase version, just in case (pun intended)
             meanings = db.meanings(tok) or db.meanings(tok.capitalize())
             if meanings:
                 lemmas.update(m.stofn.lower() for m in meanings)
     # Pair each matched lemma with its registered help text functions
     candidates = [
         (lemma, func)
         for lemma in lemmas
         for func in cls._help_texts.get(lemma, [])
     ]
     if candidates:
         # Found at least one help text func matching a lemma in the query:
         # select one at random and invoke it with the matched lemma
         lemma, func = random.choice(candidates)
         result["answer"] = result["voice"] = func(lemma)
         result["valid"] = True
Beispiel #2
0
def handle_plain_text(q):
    """ Handle a plain text query where the user states his/her name
        ("ég heiti X"); answer with a gender-tailored response.
        Returns True if the query was handled. """
    ql = q.query_lower.rstrip("?")

    # Find the first regex that matches the query.
    # Initialize m so an empty regex list doesn't raise NameError below.
    m = None
    for rx in _MY_NAME_IS_REGEXES:
        m = re.search(rx, ql)
        if m:
            break

    if not m:
        return False

    name = m.group(1).strip()
    if not name:
        return False

    # "ég heiti ekki X" is a denial, not a statement of name
    if name.startswith("ekki "):
        return False

    # Look up the gender of the first name for a tailored response
    with BIN_Db.get_db() as bdb:
        fn = name.split()[0].title()
        gender = bdb.lookup_name_gender(fn)
        a = _RESPONSES[gender].format(fn)

    # The voice string drops commas for smoother synthesis
    voice = a.replace(",", "")
    q.set_answer(dict(answer=a), a, voice)
    q.set_qtype(_INTRO_QTYPE)

    return True
Beispiel #3
0
def _capital_query(country, q):
    """ Generate answer to question concerning a country capital. """

    # Resolve the country name to an ISO country code
    cc = isocode_for_country_name(country)
    if not cc:
        logging.warning("No CC for country {0}".format(country))
        return False

    # Find the capital city, given the country code
    capital = capital_for_cc(cc)
    if not capital:
        return False

    # Prefer the Icelandic name of the city
    ice_cname = icelandic_city_name(capital["name_ascii"])

    # The voice description uses the genitive form of the country name,
    # falling back to the name as given if BÍN has no entry
    bres = BIN_Db().lookup_genitive(country, cat="no")
    country_gen = bres[0].ordmynd if bres else country

    answer = ice_cname
    voice = "Höfuðborg {0} er {1}".format(country_gen, answer)
    q.set_answer(dict(answer=answer), answer, voice)
    q.set_key("Höfuðborg {0}".format(country_gen))
    q.set_context(dict(subject=ice_cname))

    return True
Beispiel #4
0
def dump_tokens(limit):
    """ Iterate through parsed articles and print a list
        of tokens and their matched terminals """

    # Cache of TerminalDescriptor instances, keyed by terminal name
    descriptors = dict()
    with BIN_Db.get_db() as db, SessionContext(commit=True) as session:
        # Fetch parsed articles in timestamp order
        # (note: '!= None' is required by SQLAlchemy, not a style slip)
        q = (session.query(Article).filter(Article.tree != None).order_by(
            Article.timestamp))
        q = q.all() if limit is None else q[0:limit]
        for art in q:
            print(
                "\nARTICLE\nHeading: '{0.heading}'\nURL: {0.url}\nTimestamp: {0.timestamp}"
                .format(art))
            tree = TreeTokenList()
            tree.load(art.tree)
            for ix, toklist in tree.token_lists():
                print("\nSentence {0}:".format(ix))
                at_start = True
                for t in toklist:
                    if t.tokentype != "WORD":
                        print("    {0.token} {0.cat} {0.terminal}".format(t))
                        continue
                    # Strip the enclosing quote characters from the token text
                    wrd = t.token[1:-1]
                    td = descriptors.get(t.terminal)
                    if td is None:
                        descriptors[t.terminal] = td = TerminalDescriptor(t.terminal)
                    stem = td.stem(db, wrd, at_start)
                    at_start = False
                    print("    {0} {1} {2}".format(wrd, stem, t.terminal))
Beispiel #5
0
def _addr2nom(address):
    """ Convert location name to nominative form """
    # TODO: Implement more intelligently
    # This is a tad simplistic and mucks up some things,
    # e.g. "Ráðhús Reykjavíkur" becomes "Ráðhús Reykjavík"
    # Hoisted out of the loop: the original constructed a fresh BIN_Db()
    # for every lookup (up to twice per word); one instance suffices
    db = BIN_Db()
    nf = []
    for w in address.split():
        bin_res = db.lookup_nominative(w)
        if not bin_res and not w.islower():
            # Try lowercase form
            bin_res = db.lookup_nominative(w.lower())
        nf.append(bin_res[0].ordmynd if bin_res else w)
    return " ".join(nf)
Beispiel #6
0
def to_dative(np, *, meaning_filter_func=None):
    """ Return the noun phrase after casting it from nominative to dative case """
    with BIN_Db.get_db() as db:
        result = _to_case(
            np,
            db.lookup_word,
            db.cast_to_dative,
            meaning_filter_func=meaning_filter_func,
        )
    return result
Beispiel #7
0
def recent_persons(limit=_RECENT_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    toplist = dict()

    with SessionContext(read_only=True) as session:

        # Go through up to 2 * limit of the most recent records,
        # restricted to visible roots
        recs = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))[0 : limit * 2]
        )

        def is_better_title(new_title, old_title):
            """ Prefer titles under the length cap; among those, longer wins """
            len_new, len_old = len(new_title), len(old_title)
            if len_old >= _MAX_TITLE_LENGTH:
                # Existing title too long: any shorter candidate is better
                return len_new < len_old
            if len_new >= _MAX_TITLE_LENGTH:
                # Candidate too long: keep the existing title
                return False
            # Both within bounds: longer is better
            return len_new > len_old

        with BIN_Db.get_db() as bindb:
            for p in recs:
                # Skip if already present with an equal-or-better title
                if p.name in toplist and not is_better_title(
                    p.title, toplist[p.name][0]
                ):
                    continue
                toplist[p.name] = (
                    correct_spaces(p.title),
                    p.article_url,
                    p.id,
                    bindb.lookup_name_gender(p.name),
                )
                if len(toplist) >= limit:
                    # We now have as many names as we initially wanted
                    break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a locale-sorted list of dicts
        entries = [
            dict(name=name, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
            for name, tu in toplist.items()
        ]
        return sorted(entries, key=lambda d: strxfrm(d["name"]))
Beispiel #8
0
def to_accusative(np: str,
                  *,
                  meaning_filter_func: MeaningFilterFunc = None) -> str:
    """ Return the noun phrase after casting it from nominative to accusative case """
    with BIN_Db.get_db() as db:
        result = _to_case(
            np,
            db.lookup_word,
            db.cast_to_accusative,
            meaning_filter_func=meaning_filter_func,
        )
    return result
Beispiel #9
0
def recent_persons(limit=_RECENT_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    best = dict()

    with SessionContext(read_only=True) as session:

        # Scan up to twice the requested number of recent records
        rows = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))[0 : limit * 2]
        )

        def is_better_title(new_title, old_title):
            """ Return True if new_title should replace old_title """
            if len(old_title) >= _MAX_TITLE_LENGTH:
                # Current title too long: prefer any shorter candidate
                return len(new_title) < len(old_title)
            if len(new_title) >= _MAX_TITLE_LENGTH:
                # Candidate too long: reject it
                return False
            # Both acceptable: prefer the longer one
            return len(new_title) > len(old_title)

        with BIN_Db.get_db() as bindb:
            for person in rows:
                # Insert the name if new, or if this title is preferable
                replace = (
                    person.name not in best
                    or is_better_title(person.title, best[person.name][0])
                )
                if replace:
                    best[person.name] = (
                        correct_spaces(person.title),
                        person.article_url,
                        person.id,
                        bindb.lookup_name_gender(person.name),
                    )
                    if len(best) >= limit:
                        # Collected as many names as requested
                        break

    with changedlocale() as strxfrm:
        # Produce a locale-aware sorted list of dicts
        entries = [
            dict(name=nm, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
            for nm, tu in best.items()
        ]
        return sorted(entries, key=lambda d: strxfrm(d["name"]))
Beispiel #10
0
def nom2dat(w):
    """ Look up the dative form of a noun in BÍN.

        Returns the empty string for empty input. """
    if not w:
        return ""

    def sort_by_preference(m_list):
        """ Discourage rarer declension forms, i.e. ÞGF2 and ÞGF3 """
        return sorted(m_list,
                      key=lambda m: "2" in m.beyging or "3" in m.beyging)

    # Use the class-level accessor, consistent with the other call sites;
    # BIN_Db().get_db() needlessly constructed an extra BIN_Db instance
    with BIN_Db.get_db() as db:
        return db.cast_to_dative(w, meaning_filter_func=sort_by_preference)
Beispiel #11
0
 def tagset(self, word, at_sentence_start=False):
     """ Return a list of (probability, tag) tuples for the given word """
     toklist = list(parse_tokens(" ".join(word)))
     token = toklist[0]
     if token.kind == TOK.WORD and token.val is None:
         # The tokenizer didn't attach meanings: look the word up in BÍN.
         # (The original bound w = word[0] here, but w was always
         # reassigned before use, so the assignment was dead code.)
         try:
             with BIN_Db.get_db() as db:
                 w, m = db.lookup_word(token.txt, at_sentence_start)
         except Exception:
             # Lookup failed; fall back to the raw text with no meanings
             w, m = token.txt, []
         token = TOK.Word(w, m)
     return self._ngram_tagger.tag_single_token(token)
Beispiel #12
0
 def tagset(self, word, at_sentence_start=False):
     """ Return a list of (probability, tag) tuples for the given word """
     toklist = list(parse_tokens(" ".join(word)))
     token = toklist[0]
     if token.kind == TOK.WORD and token.val is None:
         # No meanings attached by the tokenizer: look the word up in BÍN.
         # (The original's w = word[0] was dead code — w was always
         # reassigned before use — and the bound exception `e` was unused.)
         try:
             with BIN_Db.get_db() as db:
                 w, m = db.lookup_word(token.txt, at_sentence_start)
         except Exception:
             # Lookup failed; fall back to the raw text with no meanings
             w, m = token.txt, []
         token = TOK.Word(w, m)
     return self._ngram_tagger.tag_single_token(token)
Beispiel #13
0
def top_authors(days=_TOP_AUTHORS_PERIOD, session=None):
    """ Return up to ten recent top authors, each with name,
        gender and percentage """
    end = datetime.utcnow()
    start = end - timedelta(days=days)
    authors = BestAuthorsQuery.period(
        start, end, enclosing_session=session, min_articles=10
    )[:20]

    authresult = []
    with BIN_Db.get_db() as bindb:
        for a in authors:
            name = a[0]
            gender = bindb.lookup_name_gender(name)
            if gender == "hk":  # Skip unnamed authors (e.g. "Ritstjórn Vísis")
                continue
            authresult.append(
                {"name": name, "gender": gender, "perc": round(float(a[4]), 2)}
            )

    return authresult[:10]
Beispiel #14
0
def lookup_best_word(word):
    """ Look up word in BÍN, pick right one acc. to a criterion.

        Returns a (nominative, accusative, dative, genitive) tuple,
        or None if the word is not found as a noun. """
    # Use the class-level accessor, consistent with other call sites
    with BIN_Db.get_db() as db:

        def nouns_only(bin_meaning):
            # Masculine, feminine or neuter noun categories
            return bin_meaning.ordfl in ("kk", "kvk", "hk")

        res = list(filter(nouns_only, db.lookup_nominative(word)))
        if not res:
            # Try with uppercase first char
            capw = word.capitalize()
            res = list(filter(nouns_only, db.lookup_nominative(capw)))
            if not res:
                return None

        # OK, we have one or more matching nouns
        if len(res) == 1:
            m = res[0]
        else:
            # TODO: Pick best result
            m = res[0]  # For now

        wid = m.utg

        # TODO: If more than one declension form possible (e.g. gen. björns vs. bjarnar)
        # we should also list such variations
        def sort_by_preference(m_list):
            # Filter out meanings that don't have the same "utg" i.e. word ID
            # as the one we successfully looked up in BÍN.
            # BUGFIX: the original computed this filtered list but then
            # sorted the unfiltered m_list, discarding the filter.
            mns = list(filter(lambda w: w.utg == wid, m_list))
            if not mns:
                # Nothing shares the word ID; fall back to the full list
                mns = m_list
            # Discourage rarer declension forms, i.e. ÞGF2 and ÞGF3
            return sorted(mns,
                          key=lambda m: "2" in m.beyging or "3" in m.beyging)

        # Look up all cases of the word in BÍN
        nom = m.stofn
        acc = db.cast_to_accusative(nom,
                                    meaning_filter_func=sort_by_preference)
        dat = db.cast_to_dative(nom, meaning_filter_func=sort_by_preference)
        gen = db.cast_to_genitive(nom, meaning_filter_func=sort_by_preference)
        return nom, acc, dat, gen
Beispiel #15
0
def _mynameis_handler(q: Query, ql: str) -> bool:
    """ Handle queries of the form "Ég heiti X", store this information. """
    # Initialize m so an empty regex list doesn't raise NameError below
    m = None
    for rx in _MY_NAME_IS_REGEXES:
        m = re.search(rx, ql)
        if m:
            break
    if m:
        name = m.group(1).strip()
        if not name:
            return False

        # Clean up name string
        name = name.split(" og ")[0]  # "ég heiti X og blablabla"
        name = name.split(" hvað ")[0]  # "ég heiti X hvað heitir þú"

        # Handle "ég heiti ekki X"
        components = name.split()
        if components[0] == "ekki":
            q.set_answer(*gen_answer("Hvað heitirðu þá?"))
            return True

        # Get first name, look up gender for a gender-tailored response
        with BIN_Db.get_db() as bdb:
            fn = components[0].title()
            gender = bdb.lookup_name_gender(fn) or "hk"
            answ = _MY_NAME_IS_RESPONSES[gender].format(fn)

        # Save this info about user to query data table
        if q.client_id:
            qdata = dict(full=name.title(), first=fn, gender=gender)
            q.set_client_data("name", qdata)

        # Generate answer; commas are dropped from the voice version
        voice = answ.replace(",", "")
        q.set_answer(dict(answer=answ), answ, voice)
        q.query_is_command()
        return True

    return False
Beispiel #16
0
def test_casting():
    """ Test functions to cast words in nominative case to other cases """
    from reynir.bindb import BIN_Db
    db = BIN_Db()

    def check(word, acc, dat, gen, **kwargs):
        """ Assert that the word casts to the given accusative,
            dative and genitive forms """
        assert db.cast_to_accusative(word, **kwargs) == acc
        assert db.cast_to_dative(word, **kwargs) == dat
        assert db.cast_to_genitive(word, **kwargs) == gen

    check("", "", "", "")
    check("xxx", "xxx", "xxx", "xxx")
    check("maðurinn", "manninn", "manninum", "mannsins")
    check("mennirnir", "mennina", "mönnunum", "mannanna")
    check("framkvæma", "framkvæma", "framkvæma", "framkvæma")
    check("stóru", "stóru", "stóru", "stóru")
    check("stóri", "stóra", "stóra", "stóra")
    check("kattarhestur", "kattarhest", "kattarhesti", "kattarhests")
    check("Kattarhestur", "Kattarhest", "Kattarhesti", "Kattarhests")

    # Exclude meanings with the rarer "2" declension variant
    f = lambda mm: [m for m in mm if "2" not in m.beyging]
    check("fjórir", "fjóra", "fjórum", "fjögurra", meaning_filter_func=f)

    check("Suður-Afríka", "Suður-Afríku", "Suður-Afríku", "Suður-Afríku")
    check("Vestur-Þýskaland", "Vestur-Þýskaland",
          "Vestur-Þýskalandi", "Vestur-Þýskalands")

    # Sort the rarer declension forms (ÞGF2/ÞGF3) last
    f = lambda mm: sorted(mm, key=lambda m: "2" in m.beyging or "3" in m.beyging)
    check("Kópavogur", "Kópavog", "Kópavogi", "Kópavogs", meaning_filter_func=f)
Beispiel #17
0
def test():
    """ Run the Corrector over a set of sample Icelandic texts
        containing spelling errors, and print the original and
        corrected versions side by side with timing information. """

    with BIN_Db.get_db() as db:
        c = Corrector(db)

        # Sample texts with a variety of (deliberate) spelling errors
        txts = [
            """
        FF er flokkur með rasisku ívafi og tilhneygjingu til að einkavinavæða alla fjölmiðla
        Íslands og færa þar með elítunni að geta ein haft áhrif á skoðanamyndandi áhri í
        fjölmiðlaheiminum, er ekki viðbúið að svona flokkur gamgi til samstarf við íhaldið
        eftir kosningar en ekki þessa vondu félagshyggjuflokka
            """,
            """
        fæ alveg hræðileg drauma vegna fyrri áfalla og það hjálpar mér að ná góðum svef og þar með
        betri andlegri lýðan og líka til að auka matarlist. Tek samt skýrt fram að ég hef bæði
        missnotað kannabis og ekki. Hef engan áhuga á að vera undir áhrifum kannabis alla dag.
        Mikil munur á að nota og missnota !
            """,
            """
        Bæði , lyf gegn áfengissyki (leiða) , mér hefur ekki leiðst mikið seinustu 30 ár. Gegn
        Taugaveiklun, konan hamrar á mér alla daga , skærur hennar eru langar og strangar. En ef ég fæ
        eina pípu og gríp gitarinn má hún tuða í mér klukkutímum saman.Ég er bæði rólegur og læri hratt
        á gítarinn, eftir 10 ára hjónaband er ég bara ótrúlega heill og stefni hátt. Ég og gitarinn erum
        orðnir samvaxnir. Auðvitað stefnum við á skilnað og þá mun ég sakna skalaæfinganna.
            """,
            """
        biddu nu hæg - var Kvennalistinn eins malefnis hreyfing. Hvað attu við - ef þu telur malefnið
        hafa verið eitt hvert var það? Kannski leikskola fyrir öll börn? Sömu laun fyrir sömu störf?
        Að borgarskipulag tæki mið af þörfum beggja kynja? Að kynjagleraugu væru notuð við gerð
        fjarlaga? Að þjoðfelagið opnaði augun fyrir kynferðsofbeldinu og sifjaspellum? (hvorutveggja
        sagt aðeins viðgangast i utlöndum). Þetta eru aðeins örfa dæmi um malefni sem brunnu a okkur
        og við börðumst fyrir. Ekki ertu i alvöru að tala framlag okkur niður. Tæplega
        telurðu það EITT malefni þo að i grunninn hafi baratta okkar sem stoðum að Kvennaframboðinu
        og -listanum gengið ut a að ,,betri,, helmingur þjoðarinnar öðlast - ekki bara i orði heldur
        einnig a borði - sömu rettindi og raðandi helmingurinn
            """,
            """
        Salvör ekki standa i að reyna að klora yfir mistök þin. Reynsluheimur kvenna visar að sjalsögðu
        til þess að helmingur mannkynsins - -konur - er olikur hinum helmingnum bæði sökum lffræðilegs munar og
        þess að þær eru gerðar að konum (sb de Beauvoir) þe fra frumbernsku er drengjum hrosað fyrir annað en
        stulkum og væntingar foreldra eru aðrar til dætra en sona og auk þess er ætlast til að dætur læri af mæðrum en synir af
        feðrum. Það er þetta sem gerir konur - helming mannkynsins - frabrugðna körlum sem hafa fra örofi alda verið
        ,,raðandi,, kynið. Það var gegn þvi orettlæti að reynsluheimur kvenna speglaðist ekki i politiskum akvörðunum sem við
        sem stofnaði Kvennafranboðið og - listann börðumst gegn - a öllum vigstöðvum. Að skilgreina barattu okkar
        Kvennalistans - fyrir rettindum halfrar þjoðarinnar til að skapa ,,rettlatara samfelag,, - sem eins mals flokk er
        fjarstæða.
            """,
        ]

        def linebreak(txt, margin=80, left_margin=0):
            """ Return a nicely column-formatted string representation of the given text,
                where each line is not longer than the given margin (if possible).
                A left margin can be optionally added, as a sequence of spaces.
                The lines are joined by newlines ('\n') but there is no trailing
                newline. """
            result = []
            line: List[str] = []
            len_line = 0  # Length of the current line incl. separating spaces
            for wrd in txt.split():
                if len_line + 1 + len(wrd) > margin:
                    # Adding this word would overflow the margin: flush the line
                    result.append(" ".join(line))
                    line = []
                    len_line = 0
                line.append(wrd)
                len_line += 1 + len(wrd)
            if line:
                # Flush the last, partial line
                result.append(" ".join(line))
            return "\n".join(" " * left_margin + line for line in result)

        # Time the correction of all sample texts
        t0 = time.time()

        for t in txts:
            print("\nOriginal:\n")
            print(linebreak(t, left_margin=8))
            print("\nCorrected:\n")
            print(linebreak(c.correct_text(t), left_margin=8))

        t1 = time.time()
        print("\nTotal time: {0:.2f} seconds".format(t1 - t0))
Beispiel #18
0
        )

    except socket_error as e:
        if e.errno == errno.EADDRINUSE:  # Address already in use
            logging.error(
                "Reynir is already running at host {0}:{1}".format(
                    Settings.HOST, Settings.PORT
                )
            )
            sys.exit(1)
        else:
            raise

    finally:
        ArticleProxy.cleanup()
        BIN_Db.cleanup()

else:

    # Suppress information log messages from Werkzeug
    werkzeug_log = logging.getLogger("werkzeug")
    if werkzeug_log:
        werkzeug_log.setLevel(logging.WARNING)

    # Log our startup
    log_str = "Reynir instance starting with host={0}:{1}, db_hostname={2} on Python {3}".format(
        Settings.HOST,
        Settings.PORT,
        Settings.DB_HOSTNAME,
        sys.version.replace("\n", " "),
    )
Beispiel #19
0
def handle_plain_text(q):
    """ Handle a plain text query, contained in the q parameter
        which is an instance of the query.Query class.
        Returns True if the query was handled, and in that case
        the appropriate properties on the Query instance have
        been set, such as the answer and the query type (qtype).
        If the query is not recognized, returns False. """
    ql = q.query_lower.rstrip("?")

    # Timezone being asked about
    tz = None
    # Whether user asked for the time in a particular location
    specific_desc = None

    if ql in _TIME_QUERIES:
        # Use location to determine time zone
        tz = timezone4loc(q.location, fallback="IS")
    elif ql.startswith("hvað er klukkan á ") or ql.startswith(
            "hvað er klukkan í "):
        # Query about the time in a particular location, i.e. country or city
        # Both prefixes ("hvað er klukkan á/í ") are 18 characters long
        loc = ql[18:]  # Cut away question prefix, leaving only placename
        # Capitalize each word in country/city name
        loc = capitalize_placename(loc)
        # Look up nominative
        # This only works for single-word city/country names found
        # in BÍN and could be improved (e.g. fails for "Nýju Jórvík")
        bin_res = BIN_Db().lookup_nominative(loc)
        words = [m.stofn for m in bin_res]
        words.append(
            loc)  # In case it's not in BÍN (e.g. "New York", "San José")

        # Check if any word is a recognised country or city name
        for w in words:
            cc = isocode_for_country_name(w)
            if cc and cc in country_timezones:
                # Look up country timezone
                # Use the first timezone although some countries have more than one
                # The timezone list returned by pytz is ordered by "dominance"
                tz = country_timezones[cc][0]
            else:
                # It's not a country name, look up in city database
                info = lookup_city_info(w)
                if info:
                    top = info[0]
                    location = (top.get("lat_wgs84"), top.get("long_wgs84"))
                    tz = timezone4loc(location)
            if tz:
                # We have a timezone
                break

        # "Klukkan í Lundúnum er" - Used for voice answer
        # ql[8:] strips the leading "hvað er ", keeping "klukkan á/í X"
        specific_desc = "{0} er".format(ql[8:])

        # Beautify query by capitalizing the country/city name
        # (the first 18 chars are the question prefix, kept as-is)
        q.set_beautified_query("{0}{1}?".format(q.beautified_query[:18], loc))

    # We have a timezone. Return formatted answer.
    if tz:
        now = datetime.now(timezone(tz))

        desc = specific_desc or "Klukkan er"

        # Create displayable answer
        answer = "{0:02}:{1:02}".format(now.hour, now.minute)
        # A detailed response object is usually a list or a dict
        response = dict(answer=answer)
        # A voice answer is a plain string that will be
        # passed as-is to a voice synthesizer
        voice = "{0} {1}:{2:02}.".format(desc, now.hour, now.minute)

        q.set_qtype(_TIME_QTYPE)
        q.set_key(tz)  # Query key is the timezone
        q.set_answer(response, answer, voice)
        return True

    return False
Beispiel #20
0
    "\"Önnu kveið # fyrir skóladeginum.\"\n")

print("\nUpphaflegur texti: '{0}'".format(txt))
for pg in rc.check(txt, split_paragraphs=True):
    for sent in pg:
        display_annotations(sent)
    print("---")

sys.exit(0)

import time

from reynir_correct.spelling import Corrector
from reynir.bindb import BIN_Db

with BIN_Db.get_db() as db:
    c = Corrector(db)  # type: Corrector


def test(c, word):
    """ Time the generation of candidate substitutions (c.subs) for
        the given word and print the results """
    t0 = time.time()
    result = list(c.subs(word))
    # Keep only candidates that the corrector recognizes as valid words
    valid = [r for r in result if r in c]
    elapsed = time.time() - t0
    print("Word: {0}, combinations: {1}, time {2:.3f} secs".format(
        word, len(result), elapsed))
    print(result)
    print(valid)


test(c, "hæstarréttarlögmaður")
Beispiel #21
0
def QGeoSubject(node, params, result):
    """ Store the placename from this parse node as the query subject,
        normalized to its BÍN stem when a nominative lookup succeeds """
    placename = capitalize_placename(result._text)
    meanings = BIN_Db().lookup_nominative(placename)
    result.subject = meanings[0].stofn if meanings else placename
Beispiel #22
0
def recognize_entities(token_stream, enclosing_session=None, token_ctor=TOK):

    """ Parse a stream of tokens looking for (capitalized) entity names
        The algorithm implements N-token lookahead where N is the
        length of the longest entity name having a particular initial word.
        Adds a named entity recognition layer on top of the
        reynir.bintokenizer.tokenize() function.

        token_stream: iterator of tokens to scan
        enclosing_session: optional database session to reuse; if None,
            a read-only session is opened for the duration of the scan
        token_ctor: token factory (defaults to TOK) used to construct
            Entity and Person tokens

        Yields the original tokens, with runs of capitalized words that
        match known entity names collapsed into single Entity tokens, and
        bare last names of previously seen persons/entities expanded back
        to their full-name tokens.
    """

    # Token queue: lookahead buffer holding the tokens of the entity
    # phrase currently being accumulated
    tq = []
    # Phrases we're considering. Note that an entry of None
    # indicates that the accumulated phrase so far is a complete
    # and valid known entity name.
    state = defaultdict(list)
    # Entity definition cache: initial word -> list of matching entities
    ecache = dict()
    # Last name to full name mapping ('Clinton' -> 'Hillary Clinton')
    lastnames = dict()

    with BIN_Db.get_db() as db, SessionContext(
        session=enclosing_session, commit=True, read_only=True
    ) as session:

        def fetch_entities(w, fuzzy=True):
            """ Return a list of entities matching the word(s) given,
                exactly if fuzzy = False, otherwise also as a starting word(s) """
            try:
                q = session.query(Entity.name, Entity.verb, Entity.definition)
                if fuzzy:
                    q = q.filter(Entity.name.like(w + " %") | (Entity.name == w))
                else:
                    q = q.filter(Entity.name == w)
                return q.all()
            except OperationalError as e:
                # Degrade gracefully on database errors: log and pretend
                # no entities matched
                logging.warning("SQL error in fetch_entities(): {0}".format(e))
                return []

        def query_entities(w):
            """ Return a list of entities matching the initial word given,
                memoized in ecache to avoid repeated database queries """
            e = ecache.get(w)
            if e is None:
                ecache[w] = e = fetch_entities(w)
            return e

        def lookup_lastname(lastname):
            """ Look up a last name in the lastnames registry,
                eventually without a possessive 's' at the end, if present """
            fullname = lastnames.get(lastname)
            if fullname is not None:
                # Found it
                return fullname
            # Try without a possessive 's', if present
            if lastname.endswith("s"):
                return lastnames.get(lastname[0:-1])
            # Nope, no match
            return None

        def flush_match():
            """ Flush a match that has been accumulated in the token queue """
            if len(tq) == 1 and lookup_lastname(tq[0].txt) is not None:
                # If single token, it may be the last name of a
                # previously seen entity or person
                return token_or_entity(tq[0])
            # Reconstruct original text behind phrase
            ename = " ".join([t.txt for t in tq])
            # We don't include the definitions in the token - they should be looked up
            # on the fly when processing or displaying the parsed article
            return token_ctor.Entity(ename)

        def token_or_entity(token):
            """ Return a token as-is or, if it is a last name of a person
                that has already been mentioned in the token stream by full name,
                refer to the full name """
            assert token.txt[0].isupper()
            tfull = lookup_lastname(token.txt)
            if tfull is None:
                # Not a last name of a previously seen full name
                return token
            if tfull.kind != TOK.PERSON:
                # Return an entity token with no definitions
                # (this will eventually need to be looked up by full name when
                # displaying or processing the article)
                return token_ctor.Entity(token.txt)
            # Return the full name meanings
            return token_ctor.Person(token.txt, tfull.val)

        try:

            while True:

                token = next(token_stream)

                if not token.txt:  # token.kind != TOK.WORD:
                    # Non-word token (punctuation etc.): any accumulated
                    # phrase must be flushed before passing it through
                    if state:
                        if None in state:
                            yield flush_match()
                        else:
                            yield from tq
                        tq = []
                        state = defaultdict(list)
                    yield token
                    continue

                # Look for matches in the current state and build a new state
                newstate = defaultdict(list)
                w = token.txt  # Original word

                def add_to_state(slist, entity):
                    """ Add the list of subsequent words to the new parser state """
                    wrd = slist[0] if slist else None
                    rest = slist[1:]
                    newstate[wrd].append((rest, entity))

                if w in state:
                    # This matches an expected token
                    tq.append(token)  # Add to lookahead token queue
                    # Add the matching tails to the new state
                    for sl, entity in state[w]:
                        add_to_state(sl, entity)
                    # Update the lastnames mapping
                    fullname = " ".join([t.txt for t in tq])
                    parts = fullname.split()
                    # If we now have 'Hillary Rodham Clinton',
                    # make sure we delete the previous 'Rodham' entry
                    for p in parts[1:-1]:
                        if p in lastnames:
                            del lastnames[p]
                    if parts[-1][0].isupper():
                        # 'Clinton' -> 'Hillary Rodham Clinton'
                        lastnames[parts[-1]] = token_ctor.Entity(fullname)
                else:
                    # Not a match for an expected token
                    if state:
                        if None in state:
                            # We have an accumulated match, but if the next token
                            # is an uppercase word without a BÍN meaning, we
                            # append it to the current entity regardless.
                            # This means that 'Charley Lucknow' is handled as a single
                            # new entity name even if 'Charley' already exists
                            # as an entity.
                            while w and w[0].isupper() and not token.val:
                                # Append to the accumulated token queue, which will
                                # be squashed to a single token in flush_match()
                                tq.append(token)
                                token = next(token_stream)
                                w = token.txt
                            # Flush the already accumulated match
                            yield flush_match()
                        else:
                            yield from tq
                        tq = []

                    # Add all possible new states for entity names
                    # that could be starting
                    weak = True
                    cnt = 1
                    upper = w and w[0].isupper()
                    parts = None

                    if upper and " " in w:
                        # For all uppercase phrases (words, entities, persons),
                        # maintain a map of last names to full names
                        parts = w.split()
                        lastname = parts[-1]
                        # Clinton -> Hillary [Rodham] Clinton
                        if lastname[0].isupper():
                            # Look for Icelandic patronyms/matronyms
                            _, m = db.lookup_word(lastname, False)
                            if m and any(mm.fl in {"föð", "móð"} for mm in m):
                                # We don't store Icelandic patronyms/matronyms
                                # as surnames
                                pass
                            else:
                                lastnames[lastname] = token

                    if token.kind == TOK.WORD and upper and w not in Abbreviations.DICT:
                        if " " in w:
                            # w may be a person name with more than one embedded word
                            # parts is assigned in the if statement above
                            cnt = len(parts)
                        elif not token.val or ("-" in token.val[0].stofn):
                            # No BÍN meaning for this token, or the meanings
                            # were constructed by concatenation (indicated by a hyphen
                            # in the stem)
                            weak = False  # Accept single-word entity references
                        # elist is a list of Entity instances
                        elist = query_entities(w)
                    else:
                        elist = []

                    if elist:
                        # This word might be a candidate to start an entity reference
                        candidate = False
                        for e in elist:
                            # List of subsequent words in entity name
                            sl = e.name.split()[cnt:]
                            if sl:
                                # Here's a candidate for a longer entity reference
                                # than we already have
                                candidate = True
                            if sl or not weak:
                                add_to_state(sl, e)
                        if weak and not candidate:
                            # Found no potential entity reference longer than this token
                            # already is - and we have a BÍN meaning for it:
                            # Abandon the effort
                            assert not newstate
                            assert not tq
                            yield token_or_entity(token)
                        else:
                            # Go for it: Initialize the token queue
                            tq = [token]
                    else:
                        # Not a start of an entity reference: simply yield the token
                        assert not tq
                        if upper:
                            # Might be a last name referring to a full name
                            yield token_or_entity(token)
                        else:
                            yield token

                # Transition to the new state
                state = newstate

        except StopIteration:
            # Token stream is exhausted
            pass

        # Yield an accumulated match if present
        if state:
            if None in state:
                yield flush_match()
            else:
                yield from tq
            tq = []

    # print("\nEntity cache:\n{0}".format("\n".join("'{0}': {1}".format(k, v) for k, v in ecache.items())))
    # print("\nLast names:\n{0}".format("\n".join("{0}: {1}".format(k, v) for k, v in lastnames.items())))

    assert not tq
Beispiel #23
0
            port=Settings.PORT,
            debug=Settings.DEBUG,
            use_reloader=not ptvsd_attached,
            extra_files=extra_files,
        )
    except socket_error as e:
        if e.errno == errno.EADDRINUSE:  # Address already in use
            logging.error(
                "Greynir web app is already running at host {0}:{1}".format(
                    Settings.HOST, Settings.PORT))
            sys.exit(1)
        else:
            raise
    finally:
        ArticleProxy.cleanup()
        BIN_Db.cleanup()

else:
    app.config["PRODUCTION"] = True

    # Suppress information log messages from Werkzeug
    werkzeug_log = logging.getLogger("werkzeug")
    if werkzeug_log:
        werkzeug_log.setLevel(logging.WARNING)

    # Log our startup
    log_str = ("Greynir instance starting with "
               "host={0}:{1}, db_host={2}:{3} on Python {4}".format(
                   Settings.HOST,
                   Settings.PORT,
                   Settings.DB_HOSTNAME,
Beispiel #24
0
def wordfreq():
    """ Return word frequency chart data for a given time period.

        Query string arguments:
            date_from, date_to: period boundaries in YYYY-MM-DD format
            words: comma- or whitespace-separated word lemmas, each with
                an optional category suffix, e.g. "maður:kk" (max 6 used)

        Returns a JSON response containing err=False plus chart labels
        and datasets on success, or err=True on invalid arguments.
    """
    resp = dict(err=True)

    # Create datetime objects from query string args
    try:
        date_fmt = "%Y-%m-%d"
        date_from = datetime.strptime(request.args.get("date_from"), date_fmt)
        date_to = datetime.strptime(request.args.get("date_to"), date_fmt)
    except Exception as e:
        logging.warning("Failed to parse date arg: {0}".format(e))
        return better_jsonify(**resp)

    # Words parameter should be one or more word lemmas (w. optional category)
    warg = request.args.get("words")
    if not warg:
        return better_jsonify(**resp)

    # Split on comma or whitespace, limit to max 6 words.
    # str.split() with no separator collapses any run of whitespace,
    # so no explicit normalization of repeated spaces is needed.
    words = [w.strip() for w in warg.replace(",", " ").split()][:6]
    # Word categories can be specified thus: "maður:kk"
    words = [tuple(w.split(":")) for w in words]

    # More human readable descriptions of word categories; its keys
    # double as the set of categories accepted from the client
    CAT_DESC = {
        "kk": "kk. no.",
        "kvk": "kvk. no.",
        "hk": "hk. no.",
        "lo": "lo.",
        "so": "so.",
    }
    valid_cats = frozenset(CAT_DESC)

    with BIN_Db.get_db() as db:

        def cat4word(w):
            """ Determine the word category (ordfl) via a BÍN lookup,
                defaulting to neuter noun ('hk') if the word is unknown """
            _, meanings = db.lookup_word(w, auto_uppercase=True)
            if meanings:
                # Give precedence to lemmas, e.g. interpret "reima" as
                # verb rather than gen. pl. of fem. noun "reim"
                lemmas = [m for m in meanings if m.stofn == w]
                return lemmas[0].ordfl if lemmas else meanings[0].ordfl
            return "hk"

        # Get word category (ordfl) for each word, if needed
        for i, w in enumerate(words):
            if len(w) < 2 or w[1] not in valid_cats:
                words[i] = (w[0], cat4word(w[0]))

    colors = list(_LINE_COLORS)

    # Generate date labels, one per day in the inclusive period
    now = datetime.utcnow()
    delta = date_to - date_from
    labels = [date_from + timedelta(days=i) for i in range(delta.days + 1)]
    with changedlocale(category="LC_TIME"):
        # Omit the year for dates within the current year
        labels = [
            d.strftime("%-d. %b")
            if d.year == now.year else d.strftime("%-d. %b %Y") for d in labels
        ]

    # Create datasets for front-end chart
    with SessionContext(commit=False) as session:
        data = dict(labels=labels, datasets=[])
        for w in words:
            # Look up frequency of word for the given period
            res = WordFrequencyQuery.fetch(w[0],
                                           w[1],
                                           date_from,
                                           date_to,
                                           enclosing_session=session)
            # Generate data and config for chart.
            # Fall back to the raw category string for categories not
            # covered by CAT_DESC (cat4word can return other ordfl values),
            # avoiding a literal "None" in the chart label.
            label = "{0} ({1})".format(w[0], CAT_DESC.get(w[1], w[1]))
            ds = dict(label=label, fill=False, lineTension=0)
            ds["borderColor"] = ds["backgroundColor"] = colors.pop(0)
            ds["data"] = [r[1] for r in res]
            data["datasets"].append(ds)

    # Create response
    resp["err"] = False
    resp["data"] = data
    # Update word list client-side
    resp["words"] = ", ".join([":".join(w) for w in words])

    return better_jsonify(**resp)