Example #1
def annotate(token_stream, auto_uppercase):
    """ Look up word forms in the BIN word database. If auto_uppercase
        is True, change lower case words to uppercase if it looks likely
        that they should be uppercase. """

    at_sentence_start = False

    with BIN_Db.get_db() as db:

        # Consume the token_stream iterable (which may be a generator)
        for t in token_stream:
            if t.kind != TOK.WORD:
                # Not a word: relay the token unchanged
                yield t
                if t.kind == TOK.S_BEGIN or (t.kind == TOK.PUNCTUATION
                                             and t.txt == ':'):
                    at_sentence_start = True
                elif t.kind != TOK.PUNCTUATION and t.kind != TOK.ORDINAL:
                    at_sentence_start = False
                continue
            if t.val is None:
                # Look up word in BIN database
                w, m = db.lookup_word(t.txt, at_sentence_start, auto_uppercase)
                # Yield a word tuple with meanings
                yield TOK.Word(w, m)
            else:
                # Already have a meaning, which probably needs conversion
                # from a bare tuple to a BIN_Meaning
                yield TOK.Word(t.txt, list(map(BIN_Meaning._make, t.val)))
            # No longer at sentence start
            at_sentence_start = False
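
A minimal usage sketch for this annotator, assuming a tokenize() generator from the surrounding tokenizer module provides the raw token stream; the import path and sample text are assumptions for illustration, not taken from the code above.

# Hypothetical usage sketch: the import path and sample text are assumptions
from tokenizer import tokenize  # assumed raw token source

def annotated(text, auto_uppercase=False):
    """ Run the raw token stream through the BIN annotator defined above """
    return annotate(tokenize(text), auto_uppercase)

for tok in annotated("Hún keypti bókina."):
    # WORD tokens now carry a list of BIN_Meaning entries in tok.val
    print(tok.kind, tok.txt)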
Example #2
def parse_tokens(toklist, mim_tags, fast_p):
    """ Parse the given token list and return a result dict """

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    tag_ix = 0
    ntags = len(mim_tags)

    rdc = Reducer(fast_p.grammar)

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if slen:
                # Parse the accumulated sentence
                err_index = None
                num = 0 # Number of tree combinations in forest
                score = 0 # Reducer score of the best parse tree

                try:
                    # Progress indicator: sentence count
                    print("{}".format(num_sent), end="\r")
                    # Parse the sentence
                    forest = fast_p.go(sent)
                    if forest:
                        num = Fast_Parser.num_combinations(forest)

                    if num > 1:
                        # Reduce the resulting forest
                        forest = rdc.go(forest)

                except ParseError as e:
                    forest = None
                    num = 0
                    # Obtain the index of the offending token
                    err_index = e.token_index

                if num > 0:
                    num_parsed_sent += 1

                    # Extract the POS tags for the terminals in the forest
                    pos_tags = find_pos_tags(forest)

                    # Calculate the 'ambiguity factor'
                    ambig_factor = num ** (1 / slen)
                    # Do a weighted average on sentence length
                    total_ambig += ambig_factor * slen
                    total_tokens += slen
                # Mark the sentence beginning with the number of parses
                # and the index of the offending token, if an error occurred
                toklist[sent_begin] = TOK.Begin_Sentence(
                    num_parses=num, err_index=err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
            # Check whether the token streams are in sync
            if tag_ix < ntags and t[1] != mim_tags[tag_ix][1]:
                #print("Warning: mismatch between MIM token '{0}' and Greynir token '{1}'".format(mim_tags[tag_ix][1], t[1]))
                # Attempt to sync again by finding the Greynir token in the MIM tag stream
                gap = 1
                MAX_LOOKAHEAD = 4
                while gap < MAX_LOOKAHEAD and (tag_ix + gap) < ntags and mim_tags[tag_ix + gap][1] != t[1]:
                    gap += 1
                if gap < MAX_LOOKAHEAD:
                    # Found the Greynir token ahead
                    #print("Re-synced by skipping ahead by {0} tokens".format(gap))
                    tag_ix += gap
            if tag_ix < ntags:
                tag_ix += 1

    return dict(
        tokens=toklist,
        tok_num=len(toklist),
        num_sent=num_sent,
        num_parsed_sent=num_parsed_sent,
        avg_ambig_factor=(total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
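
The 'ambiguity factor' bookkeeping above amounts to a length-weighted average of num ** (1 / slen) over the successfully parsed sentences. A self-contained sketch of the same computation, with made-up sentence statistics for illustration:

def avg_ambig_factor(parsed_sentences):
    """ parsed_sentences: iterable of (num_combinations, num_tokens)
        pairs, one per successfully parsed sentence """
    total_ambig = 0.0
    total_tokens = 0
    for num, slen in parsed_sentences:
        ambig_factor = num ** (1 / slen)    # per-sentence ambiguity
        total_ambig += ambig_factor * slen  # weight by sentence length
        total_tokens += slen
    return (total_ambig / total_tokens) if total_tokens > 0 else 1.0

# A 5-token sentence with 32 parse combinations and a 10-token
# sentence with a single combination (invented numbers)
print(avg_ambig_factor([(32, 5), (1, 10)]))  # ~1.33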
Example #3
def disambiguate_phrases(token_stream):
    """ Parse a stream of tokens looking for common ambiguous multiword phrases
        (i.e. phrases that have a well known very likely interpretation but
        other extremely uncommon ones are also grammatically correct).
        The algorithm implements N-token lookahead where N is the
        length of the longest phrase.
    """

    tq = []  # Token queue
    state = defaultdict(list)  # Phrases we're considering
    pdict = AmbigPhrases.DICT  # The phrase dictionary

    try:

        while True:

            token = next(token_stream)

            if token.kind != TOK.WORD:
                # Not a word: no match; yield the token queue
                if tq:
                    yield from tq
                    tq = []
                # Discard the previous state, if any
                if state:
                    state = defaultdict(list)
                # ...and yield the non-matching token
                yield token
                continue

            # Look for matches in the current state and build a new state
            newstate = defaultdict(list)
            w = token.txt.lower()

            def add_to_state(slist, index):
                """ Add the list of subsequent words to the new parser state """
                wrd = slist[0]
                rest = slist[1:]
                newstate[wrd].append((rest, index))

            if w in state:
                # This matches an expected token:
                # go through potential continuations
                tq.append(token)  # Add to lookahead token queue
                token = None
                for sl, ix in state[w]:
                    if not sl:
                        # No subsequent word: this is a complete match
                        # Discard meanings of words in the token queue that are not
                        # compatible with the category list specified
                        cats = AmbigPhrases.get_cats(ix)
                        for t, cat in zip(tq, cats):
                            # Yield a new token with fewer meanings for each original token in the queue
                            if cat == "fs":
                                # Handle prepositions specially, since we may have additional
                                # preps defined in Main.conf that don't have fs meanings in BÍN
                                w = t.txt.lower()
                                yield TOK.Word(
                                    t.txt,
                                    [BIN_Meaning(w, 0, "fs", "alm", w, "-")])
                            else:
                                yield TOK.Word(
                                    t.txt,
                                    [m for m in t.val if m.ordfl == cat])

                        # Discard the state and start afresh
                        if newstate:
                            newstate = defaultdict(list)
                        w = ""
                        tq = []
                        # Note that it is possible to match even longer phrases
                        # by including a starting phrase in its entirety in
                        # the static phrase dictionary
                        break
                    add_to_state(sl, ix)
            elif tq:
                # This does not continue a started phrase:
                # yield the accumulated token queue
                yield from tq
                tq = []

            if w in pdict:
                # This word potentially starts a new phrase
                for sl, ix in pdict[w]:
                    # assert sl
                    add_to_state(sl, ix)
                if token:
                    tq.append(token)  # Start a lookahead queue with this token
            elif token:
                # Not starting a new phrase: pass the token through
                yield token

            # Transition to the new state
            state = newstate

    except StopIteration:
        # Token stream is exhausted
        pass

    # Yield any tokens remaining in queue
    yield from tq
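
For reference, a stripped-down, self-contained sketch of the same N-token lookahead idea. The phrase dictionary has the shape the code above expects from AmbigPhrases.DICT (first word mapped to a list of (remaining-words, index) pairs), but the entries here are invented, and the sketch skips the meaning filtering and the case where a word both continues one phrase and starts another:

from collections import defaultdict

# Invented phrase data in the expected shape:
# first word -> list of (subsequent words, phrase index)
TOY_DICT = {
    "að": [(["minnsta", "kosti"], 0)],
    "til": [(["dæmis"], 1)],
}

def match_phrases(words):
    """ Yield (phrase_index, words) for each toy phrase found,
        and (None, [word]) for words outside any phrase """
    tq = []                        # lookahead queue of words
    state = defaultdict(list)      # expected next word -> [(rest, index), ...]
    for w in words:
        if w in state:
            tq.append(w)
            newstate = defaultdict(list)
            for rest, ix in state[w]:
                if not rest:
                    yield ix, tq   # complete match
                    tq = []
                    newstate = defaultdict(list)
                    break
                newstate[rest[0]].append((rest[1:], ix))
            state = newstate
            continue
        # No continuation: flush the queue, then try to start a new phrase
        for q in tq:
            yield None, [q]
        tq = []
        state = defaultdict(list)
        if w in TOY_DICT:
            tq.append(w)
            for rest, ix in TOY_DICT[w]:
                state[rest[0]].append((rest[1:], ix))
        else:
            yield None, [w]
    for q in tq:
        yield None, [q]

print(list(match_phrases("hann kom að minnsta kosti til dæmis".split())))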
Example #4
def parse_static_phrases(token_stream, auto_uppercase):
    """ Parse a stream of tokens looking for static multiword phrases
        (i.e. phrases that are not affected by inflection).
        The algorithm implements N-token lookahead where N is the
        length of the longest phrase.
    """
    tq = []  # Token queue
    state = defaultdict(list)  # Phrases we're considering
    pdict = StaticPhrases.DICT  # The phrase dictionary
    try:

        while True:

            token = next(token_stream)
            if token.txt is None:  # i.e. token.kind != TOK.WORD
                # Not a word: no match; discard state
                if tq:
                    yield from tq
                    tq = []
                if state:
                    state = defaultdict(list)
                yield token
                continue

            # Look for matches in the current state and build a new state
            newstate = defaultdict(list)
            wo = token.txt  # Original word
            w = wo.lower()  # Lower case
            if wo == w:
                # Make wo and w the same object when they are equal, so that
                # the identity checks (wo is not w) below detect whether the
                # word contained any uppercase letters
                wo = w

            def add_to_state(slist, index):
                """ Add the list of subsequent words to the new parser state """
                wrd = slist[0]
                rest = slist[1:]
                newstate[wrd].append((rest, index))

            # First check for original (uppercase) word in the state, if any;
            # if that doesn't match, check the lower case
            wm = None
            if wo is not w and wo in state:
                wm = wo
            elif w in state:
                wm = w

            if wm:
                # This matches an expected token:
                # go through potential continuations
                tq.append(token)  # Add to lookahead token queue
                token = None
                for sl, ix in state[wm]:
                    if not sl:
                        # No subsequent word: this is a complete match
                        # Reconstruct original text behind phrase
                        plen = StaticPhrases.get_length(ix)
                        while len(tq) > plen:
                            # We have extra queued tokens in the token queue
                            # that belong to a previously seen partial phrase
                            # that was not completed: yield them first
                            yield tq.pop(0)
                        w = " ".join([t.txt for t in tq])
                        # Yield the entire phrase as a single 'word' token
                        yield TOK.Word(
                            w,
                            map(BIN_Meaning._make,
                                StaticPhrases.get_meaning(ix)))
                        # Discard the state and start afresh
                        newstate = defaultdict(list)
                        w = wo = ""
                        tq = []
                        # Note that it is possible to match even longer phrases
                        # by including a starting phrase in its entirety in
                        # the static phrase dictionary
                        break
                    add_to_state(sl, ix)
            elif tq:
                yield from tq
                tq = []

            wm = None
            if auto_uppercase and len(wo) == 1 and w is wo:
                # If we are auto-uppercasing, leave single-letter lowercase
                # phrases alone, i.e. 'g' for 'gram' and 'm' for 'meter'
                pass
            elif wo is not w and wo in pdict:
                wm = wo
            elif w in pdict:
                wm = w

            # Add all possible new states for phrases that could be starting
            if wm:
                # This word potentially starts a phrase
                for sl, ix in pdict[wm]:
                    if not sl:
                        # Simple replace of a single word
                        if tq:
                            yield from tq
                            tq = []
                        # Yield the replacement token
                        yield TOK.Word(
                            token.txt,
                            map(BIN_Meaning._make,
                                StaticPhrases.get_meaning(ix)))
                        newstate = defaultdict(list)
                        token = None
                        break
                    add_to_state(sl, ix)
                if token:
                    tq.append(token)
            elif token:
                yield token

            # Transition to the new state
            state = newstate

    except StopIteration:
        # Token stream is exhausted
        pass

    # Yield any tokens remaining in queue
    yield from tq
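
The original-case-then-lowercase lookup used twice above (once against the current state, once against the phrase dictionary) can be read as a small helper; a minimal sketch, where match_key is a made-up name and not part of the module:

def match_key(wo, w, mapping):
    """ Return the key under which the word appears in mapping,
        preferring the original-case form (wo) over the lowercase
        form (w); return None if neither is present.
        Assumes wo is w whenever the word had no uppercase letters. """
    if wo is not w and wo in mapping:
        return wo
    if w in mapping:
        return w
    return None

# Corresponding to the two lookups above (state and pdict):
# wm = match_key(wo, w, state)   # continue a phrase already in progress
# wm = match_key(wo, w, pdict)   # or potentially start a new one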
Example #5
def parse_phrases_2(token_stream):
    """ Parse a stream of tokens looking for phrases and making substitutions.
        Second pass
    """

    token = None
    try:

        # Maintain a one-token lookahead
        token = next(token_stream)

        # Maintain a set of full person names encountered
        names = set()

        at_sentence_start = False

        while True:
            next_token = next(token_stream)
            # Make the lookahead checks we're interested in

            # Check for [number] [currency] and convert to [amount]
            if token.kind == TOK.NUMBER and (next_token.kind == TOK.WORD or
                                             next_token.kind == TOK.CURRENCY):

                # Preserve the case of the number, if available
                # (milljónir, milljóna, milljónum)
                cases = token.val[1]
                genders = token.val[2]
                cur = None

                if next_token.kind == TOK.WORD:
                    # Try to find a currency name
                    cur = match_stem_list(next_token, CURRENCIES)
                    if cur is None and next_token.txt.isupper():
                        # Might be an ISO abbrev (which is not in BÍN)
                        cur = CURRENCIES.get(next_token.txt)
                        if not cases:
                            cases = list(ALL_CASES)
                        if not genders:
                            # Try to find a correct gender for the ISO abbrev,
                            # or use neutral as a default
                            genders = [
                                CURRENCY_GENDERS.get(next_token.txt, "hk")
                            ]
                    if cur is not None:
                        # Use the case and gender information from the currency name
                        if not cases:
                            cases = all_cases(next_token)
                        if not genders:
                            genders = all_genders(next_token)
                elif next_token.kind == TOK.CURRENCY:
                    # Already have an ISO identifier for a currency
                    cur = next_token.val[0]
                    # Use the case and gender information from the currency name
                    # if no such information was given with the number itself
                    cases = cases or next_token.val[1]
                    genders = genders or next_token.val[2]

                if cur is not None:
                    # Create an amount
                    # Use the case and gender information from the number, if any
                    token = TOK.Amount(token.txt + " " + next_token.txt, cur,
                                       token.val[0], cases, genders)
                    # Eat the currency token
                    next_token = next(token_stream)

            # Logic for human names

            def stems(tok, categories, given_name=False):
                """ If the token denotes a given name, return its possible
                    interpretations, as a list of PersonName tuples (name, case, gender).
                    If first_name is True, we omit from the list all name forms that
                    occur in the disallowed_names section in the configuration file. """
                if tok.kind != TOK.WORD or not tok.val:
                    return None
                if at_sentence_start and tok.txt in NOT_NAME_AT_SENTENCE_START:
                    # Disallow certain person names at the start of sentences,
                    # such as 'Annar'
                    return None
                # Set up the names we're not going to allow
                dstems = DisallowedNames.STEMS if given_name else {}
                # Look through the token meanings
                result = []
                for m in tok.val:
                    if m.fl in categories and "ET" in m.beyging:
                        # If this is a given name, we cut out name forms
                        # that are frequently ambiguous and wrong, i.e. "Frá" as accusative
                        # of the name "Frár", and "Sigurð" in the nominative.
                        c = case(m.beyging)
                        if m.stofn not in dstems or c not in dstems[m.stofn]:
                            # Note the stem ('stofn') and the gender from the word type ('ordfl')
                            result.append(
                                PersonName(name=m.stofn,
                                           gender=m.ordfl,
                                           case=c))
                return result if result else None

            def has_category(tok, categories):
                """ Return True if the token matches a meaning with any of the given categories """
                if tok.kind != TOK.WORD or not tok.val:
                    return False
                return any(m.fl in categories for m in tok.val)

            def has_other_meaning(tok, category):
                """ Return True if the token can denote something besides a given name """
                if tok.kind != TOK.WORD or not tok.val:
                    return True
                # Return True if there is a different meaning, not a given name
                return any(m.fl != category for m in tok.val)

            # Check for person names
            def given_names(tok):
                """ Check for Icelandic person name (category 'ism') """
                if tok.kind != TOK.WORD or not tok.txt[0].isupper():
                    # Must be a word starting with an uppercase character
                    return None
                return stems(tok, {"ism"}, given_name=True)

            # Check for surnames
            def surnames(tok):
                """ Check for Icelandic patronym (category 'föð') or matronym (category 'móð') """
                if tok.kind != TOK.WORD or not tok.txt[0].isupper():
                    # Must be a word starting with an uppercase character
                    return None
                return stems(tok, {"föð", "móð"})

            # Check for unknown surnames
            def unknown_surname(tok):
                """ Check for unknown (non-Icelandic) surnames """
                # Accept (most) uppercase words as surnames
                if tok.kind != TOK.WORD:
                    return False
                if not tok.txt[0].isupper():
                    # Must start with capital letter
                    return False
                if has_category(tok, {"föð", "móð"}):
                    # This is a known surname, not an unknown one
                    return False
                # Allow single-letter abbreviations, but not multi-letter
                # all-caps words (those are probably acronyms)
                return len(tok.txt) == 1 or not tok.txt.isupper()

            def given_names_or_middle_abbrev(tok):
                """ Check for given name or middle abbreviation """
                gnames = given_names(tok)
                if gnames is not None:
                    return gnames
                if tok.kind != TOK.WORD:
                    return None
                wrd = tok.txt
                if wrd.startswith('['):
                    # Abbreviation: Cut off the brackets & trailing period, if present
                    if wrd.endswith('.]'):
                        wrd = wrd[1:-2]
                    else:
                        # This is probably a C. which had its period cut off as a sentence ending...
                        wrd = wrd[1:-1]
                if len(wrd) > 2 or not wrd[0].isupper():
                    if wrd not in {"van", "de", "den", "der", "el",
                                   "al"}:  # "of" was here
                        # Accept "Thomas de Broglie", "Ruud van Nistelroy"
                        return None
                # One or two letters, capitalized: accept as middle name abbrev,
                # all genders and cases possible
                return [PersonName(name=wrd, gender=None, case=None)]

            def compatible(pn, npn):
                """ Return True if the next PersonName (np) is compatible with the one we have (p) """
                if npn.gender and (npn.gender != pn.gender):
                    return False
                if npn.case and (npn.case != pn.case):
                    return False
                return True

            if (token.kind == TOK.WORD and token.val
                    and token.val[0].fl == "nafn"):
                # Convert a WORD with fl="nafn" to a PERSON with the correct gender, in all cases
                gender = token.val[0].ordfl
                token = TOK.Person(token.txt, [
                    PersonName(token.txt, gender, case) for case in ALL_CASES
                ])
                gn = None
            else:
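
The compatible() helper above is the building block for intersecting the possible (gender, case) readings of a given name with those of a following surname. A self-contained sketch of that intersection, reusing the PersonName field layout (name, gender, case) with invented data:

from collections import namedtuple

# Same field layout as the PersonName tuples used above
PersonName = namedtuple("PersonName", ["name", "gender", "case"])

def compatible(pn, npn):
    """ True if npn does not contradict pn in gender or case
        (None means unspecified and matches anything) """
    if npn.gender and npn.gender != pn.gender:
        return False
    if npn.case and npn.case != pn.case:
        return False
    return True

def combine(given, surnames):
    """ Keep only given-name readings that have at least one
        compatible surname reading """
    return [pn for pn in given if any(compatible(pn, s) for s in surnames)]

# Invented example: a given name read as nominative or accusative,
# and a surname reading that is nominative only
given = [PersonName("Anna", "kvk", "nf"), PersonName("Anna", "kvk", "þf")]
surname = [PersonName("Jónsdóttir", "kvk", "nf")]
print(combine(given, surname))   # keeps only the nominative reading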
Example #6
def convert_to_num(token):
    """ Nested helper (see also parse_phrases_1 in the next example):
        if the enclosing scope found a multiplier word, convert the
        current token into a Number token; otherwise return it unchanged. """
    if multiplier is not None:
        token = TOK.Number(token.txt, multiplier,
                           all_cases(token),
                           all_genders(token))
    return token
Example #7
def parse_phrases_1(token_stream):
    """ Parse numbers and amounts """

    with BIN_Db.get_db() as db:

        token = None
        try:

            # Maintain a one-token lookahead
            token = next(token_stream)
            while True:
                next_token = next(token_stream)

                # Logic for numbers that are partially or entirely
                # written out in words

                def number(tok):
                    """ If the token denotes a number, return that number - or None """
                    if tok.txt.lower() == "áttu":
                        # Do not accept 'áttu' (stem='átta', no kvk) as a number
                        return None
                    return match_stem_list(
                        tok,
                        MULTIPLIERS,
                        filter_func=lambda m: m.ordfl in NUMBER_CATEGORIES)

                # Check whether we have an initial number word
                multiplier = number(token) if token.kind == TOK.WORD else None

                # Check for [number] 'hundred|thousand|million|billion'
                while (token.kind == TOK.NUMBER or multiplier is not None) \
                    and next_token.kind == TOK.WORD:

                    multiplier_next = number(next_token)

                    def convert_to_num(token):
                        if multiplier is not None:
                            token = TOK.Number(token.txt, multiplier,
                                               all_cases(token),
                                               all_genders(token))
                        return token

                    if multiplier_next is not None:
                        # Retain the case of the last multiplier, except
                        # if it is genitive (eignarfall) and the previous
                        # token had a case ('hundruðum milljarða' is dative,
                        # not genitive)
                        next_case = all_cases(next_token)
                        next_gender = all_genders(next_token)
                        if "ef" in next_case:
                            # We may have something like 'hundruðum milljarða':
                            # use the case and gender of 'hundruðum', not 'milljarða'
                            next_case = all_cases(token) or next_case
                            next_gender = all_genders(token) or next_gender
                        token = convert_to_num(token)
                        token = TOK.Number(token.txt + " " + next_token.txt,
                                           token.val[0] * multiplier_next,
                                           next_case, next_gender)
                        # Eat the multiplier token
                        next_token = next(token_stream)
                    elif next_token.txt in AMOUNT_ABBREV:
                        # Abbreviations for ISK amounts
                        # For abbreviations, we do not know the case,
                        # but we try to retain the previous case information if any
                        token = convert_to_num(token)
                        token = TOK.Amount(
                            token.txt + " " + next_token.txt,
                            "ISK",
                            token.val[0] *
                            AMOUNT_ABBREV[next_token.txt],  # Number
                            token.val[1],
                            token.val[2])  # Cases and gender
                        next_token = next(token_stream)
                    else:
                        # Check for [number] 'percent'
                        percentage = match_stem_list(next_token, PERCENTAGES)
                        if percentage is not None:
                            token = convert_to_num(token)
                            token = TOK.Percent(
                                token.txt + " " + next_token.txt, token.val[0],
                                all_cases(next_token), all_genders(next_token))
                            # Eat the percentage token
                            next_token = next(token_stream)
                        else:
                            break

                    multiplier = None

                # Check for currency name doublets, for example
                # 'danish krona' or 'british pound'
                if token.kind == TOK.WORD and next_token.kind == TOK.WORD:
                    nat = match_stem_list(token, NATIONALITIES)
                    if nat is not None:
                        cur = match_stem_list(next_token, CURRENCIES)
                        if cur is not None:
                            if (nat, cur) in ISO_CURRENCIES:
                                # Match: accumulate the possible cases
                                iso_code = ISO_CURRENCIES[(nat, cur)]
                                # Filter the possible cases by considering adjectives
                                # having a strong declination (indefinite form) only
                                token = TOK.Currency(
                                    token.txt + " " + next_token.txt, iso_code,
                                    all_common_cases(
                                        token, next_token, lambda m:
                                        (m.ordfl == "lo" and "SB" in m.beyging
                                         )), [CURRENCY_GENDERS[cur]])
                                next_token = next(token_stream)

                # Check for composites:
                # 'stjórnskipunar- og eftirlitsnefnd'
                # 'viðskipta- og iðnaðarráðherra'
                # 'marg-ítrekaðri'
                if token.kind == TOK.WORD and \
                    next_token.kind == TOK.PUNCTUATION and next_token.txt == COMPOSITE_HYPHEN:

                    og_token = next(token_stream)
                    if og_token.kind != TOK.WORD or (og_token.txt != "og" and
                                                     og_token.txt != "eða"):
                        # Incorrect prediction: make amends and continue
                        handled = False
                        if og_token.kind == TOK.WORD:
                            composite = token.txt + "-" + og_token.txt
                            if token.txt.lower() in ADJECTIVE_PREFIXES:
                                # hálf-opinberri, marg-ítrekaðri
                                token = TOK.Word(composite, [
                                    m for m in og_token.val
                                    if m.ordfl == "lo" or m.ordfl == "ao"
                                ])
                                next_token = next(token_stream)
                                handled = True
                            else:
                                # Check for Vestur-Þýskaland, Suður-Múlasýsla (which are in BÍN in their entirety)
                                m = db.meanings(composite)
                                if m:
                                    # Found composite in BÍN: return it as a single token
                                    token = TOK.Word(composite, m)
                                    next_token = next(token_stream)
                                    handled = True
                        if not handled:
                            yield token
                            # Put a normal hyphen instead of the composite one
                            token = TOK.Punctuation(HYPHEN)
                            next_token = og_token
                    else:
                        # We have 'viðskipta- og'
                        final_token = next(token_stream)
                        if final_token.kind != TOK.WORD:
                            # Incorrect: unwind
                            yield token
                            yield TOK.Punctuation(HYPHEN)  # Normal hyphen
                            token = og_token
                            next_token = final_token
                        else:
                            # We have 'viðskipta- og iðnaðarráðherra'
                            # Return a single token with the meanings of
                            # the last word, but an amalgamated token text.
                            # Note: there is no meaning check for the first
                            # part of the composition, so it can be an unknown word.
                            txt = token.txt + "- " + og_token.txt + \
                                " " + final_token.txt
                            token = TOK.Word(txt, final_token.val)
                            next_token = next(token_stream)

                # Yield the current token and advance to the lookahead
                yield token
                token = next_token

        except StopIteration:
            pass

        # Final token (previous lookahead)
        if token:
            yield token
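
Both phrase passes rely on the same one-token lookahead skeleton: hold the current token, peek at the next one, optionally merge them and pull a replacement lookahead from the stream, then yield and advance. A bare-bones, generic sketch of that skeleton, with a made-up merge rule (joining a digit string with a following 'kr.' marker) standing in for the real multiplier/currency logic:

def merge_pass(token_stream, should_merge, merge):
    """ Generic one-token lookahead pass: whenever should_merge(token,
        next_token) holds, replace the pair with merge(token, next_token)
        and pull a fresh lookahead from the stream. """
    token = None
    try:
        token = next(token_stream)
        while True:
            next_token = next(token_stream)
            if should_merge(token, next_token):
                token = merge(token, next_token)
                next_token = next(token_stream)   # eat the merged token
            yield token
            token = next_token
    except StopIteration:
        pass
    if token is not None:
        yield token   # final token (previous lookahead)

# Toy demonstration on plain strings (invented data)
toks = iter(["Verðið", "er", "1200", "kr.", "í", "dag"])
merged = merge_pass(
    toks,
    should_merge=lambda a, b: a.isdigit() and b == "kr.",
    merge=lambda a, b: a + " " + b)
print(list(merged))   # ['Verðið', 'er', '1200 kr.', 'í', 'dag']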
Example #8
                                found_name = True
                                break

                # If this is not a "strong" name, backtrack from recognizing it.
                # A "weak" name is (1) at the start of a sentence; (2) only one
                # word; (3) that word has a meaning that is not a name;
                # (4) the name has not been seen in a full form before;
                # (5) not on a 'well known name' list.

                weak = at_sentence_start and (' ' not in w) and not patronym and \
                    not found_name and (has_other_meaning(token, "ism") and w not in NamePreferences.SET)

                if not weak:
                    # Return a person token with the accumulated name
                    # and the intersected set of possible cases
                    token = TOK.Person(w, gn)

            # Yield the current token and advance to the lookahead
            yield token

            if token.kind == TOK.S_BEGIN or (token.kind == TOK.PUNCTUATION
                                             and token.txt == ':'):
                at_sentence_start = True
            elif token.kind != TOK.PUNCTUATION and token.kind != TOK.ORDINAL:
                at_sentence_start = False
            token = next_token

    except StopIteration:
        pass

    # Final token (previous lookahead)
    if token:
        yield token
Example #9
def analyze():
    """ Find word categories in the submitted text """

    txt = request.form.get("txt", "").strip()

    # Tokenize the text entered as-is and return the token list
    toklist = list(tokenize(txt))

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0

    with Fast_Parser(verbose=False) as bp:  # Don't emit diagnostic messages

        rdc = Reducer(bp.grammar)

        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0  # Number of tree combinations in forest
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)

                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                            assert Fast_Parser.num_combinations(forest) == 1

                        # Mark the token list with the identified word categories
                        mark_categories(forest, toklist, sent_begin + 1)

                    except ParseError as e:
                        # Obtain the index of the offending token
                        err_index = e.token_index
                    print(
                        "Parsed sentence of length {0} with {1} combinations{2}"
                        .format(
                            slen, num, "\n" +
                            (" ".join(s[1]
                                      for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num**(1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(
                        num_parses=num, err_index=err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(tokens=toklist,
                  tok_num=len(toklist),
                  num_sent=num_sent,
                  num_parsed_sent=num_parsed_sent,
                  avg_ambig_factor=(total_ambig /
                                    total_tokens) if total_tokens > 0 else 1.0)

    # Return the tokens as a JSON structure to the client
    return jsonify(result=result)
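
A hedged client-side sketch for exercising this endpoint, assuming it is registered under a route such as /analyze on a local development server; the URL path, host and sample text are placeholders, not taken from the code above:

# Hypothetical client call: route path and host are assumptions
import requests

resp = requests.post(
    "http://localhost:5000/analyze",
    data={"txt": "Hún keypti bókina í gær."})
result = resp.json()["result"]
print(result["num_sent"], result["num_parsed_sent"],
      result["avg_ambig_factor"])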
Example #10
def analyze():
    """ Analyze text from a given URL """

    url = request.form.get("url", "").strip()
    t0 = time.time()

    if url.startswith("http:") or url.startswith("https:"):
        # Scrape the URL, tokenize the text content and return the token list
        toklist = list(process_url(url))
    else:
        # Tokenize the text entered as-is and return the token list
        toklist = list(tokenize(url))

    tok_time = time.time() - t0

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0

    sent_begin = 0
    bp = BIN_Parser()

    t0 = time.time()

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            # Parse the accumulated sentence
            err_index = None
            try:
                forest = bp.go(sent)
            except ParseError as e:
                forest = None
                # Obtain the index of the offending token
                err_index = e.token_index()
            num = 0 if forest is None else Parser.num_combinations(forest)
            print("Parsed sentence of length {0} with {1} combinations{2}".
                  format(
                      slen, num,
                      "\n" + " ".join(s[1]
                                      for s in sent) if num >= 100 else ""))
            if num > 0:
                num_parsed_sent += 1
                # Calculate the 'ambiguity factor'
                ambig_factor = num**(1 / slen)
                # Do a weighted average on sentence length
                total_ambig += ambig_factor * slen
                total_tokens += slen
            # Mark the sentence beginning with the number of parses
            # and the index of the offending token, if an error occurred
            toklist[sent_begin] = TOK.Begin_Sentence(num_parses=num,
                                                     err_index=err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)

    parse_time = time.time() - t0

    result = dict(tokens=toklist,
                  tok_time=tok_time,
                  tok_num=len(toklist),
                  parse_time=parse_time,
                  num_sent=num_sent,
                  num_parsed_sent=num_parsed_sent,
                  avg_ambig_factor=(total_ambig /
                                    total_tokens) if total_tokens > 0 else 1.0)

    # Dump the tokens to a text file for inspection
    # dump_tokens_to_file("txt", toklist)

    # Return the tokens as a JSON structure to the client
    return jsonify(result=result)