Example #1
def rh_encoder ( encodingSpec, modelName, *args ):

    tokenize, hash, bits = parseEncodingSpec( encodingSpec )

    model = GLOBALS.get( modelName )
    assert model in MODELS, "model must be one of %s, got %s" % ( formatGlobalNames( MODELS ), modelName )

    encode = model( tokenize, hash, bits, *args )
    
    return toBytes | cmap(ord) | changeWordSize(8, bits) | cmap(encode)
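
The pipeline above repacks each 8-bit input byte into bits-wide words before handing them to the model's encoder. As a rough illustration of that changeWordSize(8, bits) step, here is a plain-generator sketch of the bit-regrouping arithmetic; it is an assumption about the behaviour, not the library's coroutine implementation, and the padding of a final partial word is guessed.

# Sketch only: repack a stream of inWidth-bit words into outWidth-bit words.
def change_word_size(words, inWidth, outWidth):
    buffer = 0
    bufferedBits = 0
    for word in words:
        buffer = (buffer << inWidth) | word
        bufferedBits += inWidth
        while bufferedBits >= outWidth:
            bufferedBits -= outWidth
            yield buffer >> bufferedBits
            buffer &= (1 << bufferedBits) - 1
    if bufferedBits:
        yield buffer << (outWidth - bufferedBits)   # assumed left-padding of the last word

# Two bytes become eight 2-bit words:
assert list(change_word_size([0b11100100, 0b00011011], 8, 2)) == [3, 2, 1, 0, 0, 1, 2, 3]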
Example #2
def ngramPlusOne(n):
    """
    >>> list( range(5) > ngramPlusOne( 1 ) )
    [((0,), 1), ((1,), 2), ((2,), 3), ((3,), 4)]
    >>> list( range(5) > ngramPlusOne( 2 ) )
    [((0, 1), 2), ((1, 2), 3), ((2, 3), 4)]
    """
    return ngram(n + 1) | cmap(lambda gram: (gram[:-1], gram[-1]))
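
The doctests pin down the behaviour: each (n+1)-gram is split into its leading n-gram and the token that follows it. A standalone generator producing the same output, shown here only to make the ngram/cmap pipeline's semantics concrete:

# Plain-generator equivalent of the doctest output above (a sketch, not the library's code).
def ngram_plus_one(iterable, n):
    window = []
    for item in iterable:
        window.append(item)
        if len(window) == n + 1:
            yield (tuple(window[:-1]), window[-1])
            window.pop(0)

assert list(ngram_plus_one(range(5), 1)) == [((0,), 1), ((1,), 2), ((2,), 3), ((3,), 4)]
assert list(ngram_plus_one(range(5), 2)) == [((0, 1), 2), ((1, 2), 3), ((2, 3), 4)]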
Example #3
def ngramPlusOne ( n ):
    """
    >>> list( range(5) > ngramPlusOne( 1 ) )
    [((0,), 1), ((1,), 2), ((2,), 3), ((3,), 4)]
    >>> list( range(5) > ngramPlusOne( 2 ) )
    [((0, 1), 2), ((1, 2), 3), ((2, 3), 4)]
    """
    return ngram( n + 1 ) | cmap( lambda gram: ( gram[ :-1 ], gram[ -1 ] ) )
Example #4
def rh_decoder(encodingSpec):
    tokenize, hash, bits = parseEncodingSpec(encodingSpec)
    return toBytes | tokenize | cmap(truncateHash(
        hash, bits)) | changeWordSize(bits, 8) | cmap(chr)
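
rh_decoder tokenizes the incoming byte stream, maps each token to a small bits-wide value via truncateHash(hash, bits), and regroups those values back into 8-bit bytes. A minimal sketch of what truncateHash might look like, assuming it keeps only the low bits of the token's digest; the exact reduction is not confirmed by these snippets:

import hashlib

# Hypothetical truncateHash: hash the token and keep the low `bits` bits.
def truncateHash(hash, bits):
    def truncatedHash(token):
        digest = hash(token.encode("utf-8")).hexdigest()
        return int(digest, 16) % (2 ** bits)
    return truncatedHash

value = truncateHash(hashlib.sha1, 4)("example ")
assert 0 <= value < 16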
Example #5
def streamTokenizer(stopBytes):
    "Return a tokenizer coroutine which accumulates bytes until it sees a byte in stopBytes"
    @coroutine
    def tokenizer(target):
        value = []
        while True:
            byte = yield
            value.append(byte)
            if byte in stopBytes:
                target.send("".join(value))
                value = []

    return tokenizer


words = streamTokenizer( " \n" ) \
        | cfilter( lambda token: token not in ( " ", "\n" ) ) \
        | cmap( lambda token: token.strip() + ' ' )
appendTo(TOKENIZERS)(words)

lines = appendTo(TOKENIZERS)(streamTokenizer("\n"))
words2 = appendTo(TOKENIZERS)(streamTokenizer(" \n.,;?!"))
words3 = appendTo(TOKENIZERS)(
    streamTokenizer(" \n.,;?!")
    | cmap(lambda word: word[:-1] + ' '
           if word.endswith("\n") and not word == "\n" else word))
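
The tokenizer above is a push-style coroutine: single bytes are sent in, and whole tokens are pushed to a downstream target whenever a stop byte arrives. A self-contained sketch of how such a coroutine can be primed and driven, assuming coroutine is the usual auto-priming decorator (the library's real decorator presumably also provides the "|" pipeline composition, which is omitted here):

import functools

# Assumed: standard priming decorator that advances a generator to its first yield.
def coroutine(func):
    @functools.wraps(func)
    def primed(*args, **kwargs):
        gen = func(*args, **kwargs)
        next(gen)
        return gen
    return primed

def stream_tokenizer(stopBytes):
    # Same accumulate-until-stop-byte logic as the snippet above.
    @coroutine
    def tokenizer(target):
        value = []
        while True:
            byte = yield
            value.append(byte)
            if byte in stopBytes:
                target.send("".join(value))
                value = []
    return tokenizer

@coroutine
def collect(results):
    # Trivial sink used only for this demo.
    while True:
        results.append((yield))

tokens = []
tok = stream_tokenizer(" \n")(collect(tokens))
for ch in "hello world\n":
    tok.send(ch)
assert tokens == ["hello ", "world\n"]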

Example #8
def rh_decoder ( encodingSpec ):
    tokenize, hash, bits = parseEncodingSpec( encodingSpec )
    return toBytes | tokenize | cmap( truncateHash( hash, bits ) ) | changeWordSize( bits, 8 ) | cmap( chr )
Example #9
def streamTokenizer ( stopBytes ):
    "Return a tokenizer coroutine which accumulates bytes until it sees a byte in stopBytes"
    @coroutine
    def tokenizer ( target ):
        value = []
        while True:
            byte = yield
            value.append( byte )
            if byte in stopBytes:
                target.send( "".join( value ) )
                value = []
    return tokenizer

words = streamTokenizer( " \n" ) \
        | cfilter( lambda token: token not in ( " ", "\n" ) ) \
        | cmap( lambda token: token.strip() + ' ' )
appendTo(TOKENIZERS)( words )

asciiwords = streamTokenizer( " \n" ) \
        | cmap( lambda token: "".join(b for b in token if 0x20 <= ord(b) <= 0x7e) ) \
        | cfilter( lambda token: token not in ( " ", "\n" ) ) \
        | cmap( lambda token: token.strip() + ' ' )
appendTo(TOKENIZERS)( asciiwords )

lines  = appendTo(TOKENIZERS)( streamTokenizer( "\n" ) )
words2 = appendTo(TOKENIZERS)( streamTokenizer( " \n.,;?!" ) )
words3 = appendTo(TOKENIZERS)( streamTokenizer( " \n.,;?!" )
                               | cmap( lambda word: word[:-1]+' ' if word.endswith("\n") and not word == "\n" else word ) )


MODELS = []
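
appendTo is used throughout these snippets both as a decorator (@appendTo(MODELS)) and as a plain call (appendTo(TOKENIZERS)(words)), which suggests a small registry helper that appends an object to a list and returns it unchanged. A sketch under that assumption; the actual implementation is not shown in these examples:

# Hypothetical registry helper.
def appendTo(registry):
    def register(obj):
        registry.append(obj)
        return obj
    return register

TOKENIZERS = []          # stand-in for the registry used above

@appendTo(TOKENIZERS)
def demoTokenizer():
    pass

assert demoTokenizer in TOKENIZERS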
Example #10
    "Return a tokenizer coroutine which accumulates bytes until it sees a byte in stopBytes"
    @coroutine
    def tokenizer ( target ):
        value = []
        while True:
            byte = yield
            value.append( byte )
            if byte in stopBytes:
                target.send( "".join( value ) )
                value = []
    return tokenizer


words = streamTokenizer( " \n" ) \
        | cfilter( lambda token: token not in ( " ", "\n" ) ) \
        | cmap( lambda token: token.strip() + ' ' )
appendTo(TOKENIZERS)( words )


lines  = appendTo(TOKENIZERS)( streamTokenizer( "\n" ) )
words2 = appendTo(TOKENIZERS)( streamTokenizer( " \n.,;?!" ) )
words3 = appendTo(TOKENIZERS)( streamTokenizer( " \n.,;?!" )
                               | cmap( lambda word: word[:-1]+' ' if word.endswith("\n") and not word == "\n" else word ) )


MODELS = []

@appendTo(MODELS)
def markov ( tokenize, hash, bits, corpusFilename, order=1, abridged=None ):
    truncatedHash = truncateHash( hash, bits )
    corpusTokens  = list( tokenize < readTextFile( corpusFilename ) )
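
The markov model hashes corpus tokens so that, at encode time, each bits-wide value can be emitted as some token whose truncated hash equals that value. A conceptual sketch of that bucketing step (names are hypothetical; the real model also restricts candidates to tokens that can follow the current order-length n-gram, which this ignores):

import hashlib
from collections import defaultdict

# Group corpus tokens by their truncated hash value (sketch only).
def buildHashBuckets(corpusTokens, bits, hash=hashlib.sha1):
    buckets = defaultdict(list)
    for token in corpusTokens:
        value = int(hash(token.encode("utf-8")).hexdigest(), 16) % (2 ** bits)
        buckets[value].append(token)
    return buckets

buckets = buildHashBuckets(["the ", "cat ", "sat ", "on ", "the ", "mat "], bits=2)
# Any value that occurs as a key can now be represented by emitting one of its tokens.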