Beispiel #1
0
    def _raw_conditional_interval(self, token, context, backed_off):
        """
        Internal version of the conditional probability interval method.

        Scans the tokens yielded by `self._iter_matching_tokens(context,
        backed_off)`. If `token` is among them, its interval is returned
        directly. Otherwise, if the back-off token (`self.backoff`) is among
        them, the method recurses with the first element of `context` removed
        (that element is passed on as `backed_off`) and the recursive result is
        placed inside the back-off token's interval.

        Raises `Exception` when neither `token` nor the back-off token is
        yielded for this context.
        """

        found = None
        fallback = None

        for candidate in self._iter_matching_tokens(context, backed_off):
            # Overwritten on every pass, so after the loop this holds the
            # value computed from the last token yielded.
            total = candidate.b + candidate.l

            if candidate.token == token:
                found = candidate
            if candidate.token == self.backoff:
                fallback = candidate

        # Direct hit: the token itself was listed for this context
        if found is not None:
            return create_interval(found.b, found.l, total)

        # Neither the token nor a back-off entry is available
        if fallback is None:
            raise Exception('Impossible sentence.')

        # Back off: resolve the token with a shortened context and nest that
        # result inside the interval assigned to the back-off token
        outer = create_interval(fallback.b, fallback.l, total)
        inner = self._raw_conditional_interval(token, context[1:], context[0])
        return select_subinterval(outer, inner)
Beispiel #2
0
def encode(conditional_interval, sequence, verbose=False):
    """
    Encode a sequence into an exact interval.

    Starts from `create_interval(0, 1)` and, for every symbol in `sequence`,
    replaces the running interval with the subinterval selected by
    `conditional_interval(symbol, preceding_symbols)`, where
    `preceding_symbols` is the slice of `sequence` before that symbol.

    When `verbose` is true, each symbol is printed before it is processed.
    Returns the interval obtained after the last symbol.
    """

    result = create_interval(0, 1)

    for position, symbol in enumerate(sequence):
        if verbose:
            print(symbol)

        step = conditional_interval(symbol, sequence[:position])
        result = select_subinterval(result, step)

    return result
Beispiel #3
0
    def _raw_next(self, search_interval, context, backed_off):
        """
        Internal version of the next token method.

        Collects the tokens yielded by `self._iter_matching_tokens(context,
        backed_off)`, scales `search_interval` to their count range and looks
        for a single token whose count range contains the scaled interval.

        Returns:
            None if no token contains the scaled search interval.
            Otherwise a `NextSymbolSearchResult` holding the token and the
            search interval rescaled relative to that token's interval. If
            the token found is the back-off token (`self.backoff`), the search
            is repeated recursively with the first element of `context`
            removed (that element is passed on as `backed_off`).

        Raises:
            IndexError: if no tokens are yielded for this context, or if the
                back-off token is selected while `context` is empty.
        """

        tokens = tuple(self._iter_matching_tokens(context, backed_off))

        # The total count is read off the end of the last token's range
        full_count = tokens[-1].b + tokens[-1].l

        # Find correct scaled interval, rounded outwards to whole counts
        base = sympy.floor(search_interval.b * full_count)
        end = sympy.ceiling(
            (search_interval.b + search_interval.l) * full_count)

        def interval_bs(tokens, base, end):
            """
            Find using binary search the index of a token whose counts are a
            superinterval of [base, end]. Return None if there is no such
            token.

            NOTE(review): the search assumes `tokens` is ordered by ascending
            `b` with non-overlapping ranges -- confirm against
            `_iter_matching_tokens`.
            """
            imin = 0
            imax = len(tokens) - 1

            while imin <= imax:
                # Integer midpoint: avoids float division and the
                # round-half-to-even behaviour of round()
                imid = (imin + imax) // 2
                token_start = tokens[imid].b
                token_end = token_start + tokens[imid].l

                if token_start <= base and token_end >= end:
                    return imid
                elif token_end <= base:
                    # Token lies entirely before the searched range
                    imin = imid + 1
                else:
                    imax = imid - 1

            return None

        i = interval_bs(tokens, base, end)

        if i is None:
            # No token can be found
            return None

        # We have found a token -- standard or back-off
        token = tokens[i]
        token_interval = create_interval(token.b, token.l, full_count)
        scaled_search_interval = find_ratio(search_interval, token_interval)

        if token.token == self.backoff:
            return self._raw_next(scaled_search_interval, context[1:],
                                  context[0])

        return NextSymbolSearchResult(token.token, scaled_search_interval)
Beispiel #4
0
# Invent a sentence, tokenise it and map the tokens to their indices
text = "Hey!  What the f**k is going on here?"
token_strings = normalise_and_explode_tokens(text2token_strings(text))
token_indices = tuple(map(index.s2i, token_strings))

print(text)
print()
print(" ".join(token_strings))
print()
print(token_indices)
print()

# Query the model for the next token, using the first nine token indices as
# context (per the original note, this is the text up to "is") and a handful
# of sample intervals
context = token_indices[:9]

intervals = (
    create_interval(0, 1, 1000000),
    create_interval(345246, 56, 1000000),
    create_interval(5465477, 322, 10000000),
    create_interval(23432566, 21, 100000000),
    create_interval(10000000000 - 1000000, 1, 10000000000),
)

# Only the last n - 1 context tokens are displayed. The n == 1 case is
# handled explicitly: context[-(n - 1):] would then be context[-0:], which is
# the whole context rather than an empty one.
shown_context = context[-(n - 1):] if n > 1 else ()
context_str = " ".join(map(index.i2s, shown_context))

for interval in intervals:
    print(
        "Next token given the context \"{context_str}\" and interval {interval}:"
        .format(context_str=context_str, interval=interval))

    # Named `result` so that the builtin next() is not shadowed
    result = lm.next(interval, context)
    print(result)

    if result is not None:
        # The first element of the result is the token index
        print(index.i2s(result[0]))
Beispiel #5
0
offset = 0

# Echo the model parameters, one "name: value" line each
print("\n".join(
    "{}: {}".format(label, value)
    for label, value in (
        ("n", n),
        ("start", start),
        ("end", end),
        ("beta", beta),
        ("gamma", gamma),
        ("offset", offset),
    )))
print()

# Load language model
lm = bindb.BinDBLM(
    "/Users/kkom/Desktop/bindb-normalised/counts-consistent-tables",
    n, start, end, beta, gamma, offset)

# Create an interval from the rationals 3/7 and 1/10**200
interval = create_interval(
    sympy.Rational(3, 7),
    sympy.Rational(1, sympy.Pow(10, 200)))

print("Decoding:", interval)
print()

# Decode the interval into a sequence using the model's next-token method
sequence = decode(lm.next, interval)
print()

# Load index
with open("/Users/kkom/Desktop/bindb-normalised/index", "r") as f:
    index = bindb.BinDBIndex(f)

# Convert every decoded item with index.i2s and print them space-separated
print("Decoded to:", " ".join(index.i2s(item) for item in sequence))