def rh_encoder(encodingSpec, modelName, *args):
    tokenize, hash, bits = parseEncodingSpec(encodingSpec)
    model = GLOBALS.get(modelName)
    assert model in MODELS, "model must be one of %s, got %s" % (
        formatGlobalNames(MODELS), modelName)
    encode = model(tokenize, hash, bits, *args)
    return toBytes | cmap(ord) | changeWordSize(8, bits) | cmap(encode)
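# Aside: changeWordSize is defined elsewhere in this module. As a plain
# generator, the repacking it presumably performs looks like this sketch
# (name and signature are illustrative, not the real combinator; a real
# implementation must also decide how to flush leftover bits):
def changeWordSizeSketch(words, inBits, outBits):
    buf, bufBits = 0, 0
    for word in words:
        buf = (buf << inBits) | word   # append inBits of payload
        bufBits += inBits
        while bufBits >= outBits:      # emit full outBits-wide words
            bufBits -= outBits
            yield (buf >> bufBits) & ((1 << outBits) - 1)
# e.g. list(changeWordSizeSketch([0xff], 8, 2)) == [3, 3, 3, 3]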
def ngramPlusOne(n):
    """
    >>> list( range(5) > ngramPlusOne( 1 ) )
    [((0,), 1), ((1,), 2), ((2,), 3), ((3,), 4)]
    >>> list( range(5) > ngramPlusOne( 2 ) )
    [((0, 1), 2), ((1, 2), 3), ((2, 3), 4)]
    """
    return ngram(n + 1) | cmap(lambda gram: (gram[:-1], gram[-1]))
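# Aside: the (context, nextToken) pairs produced above are the natural feed
# for a Markov transition table. A minimal sketch with plain dicts (the
# helper name is illustrative, not part of this module):
from collections import defaultdict

def markovTableSketch(pairs):
    table = defaultdict(list)
    for context, succ in pairs:
        table[context].append(succ)
    return table
# markovTableSketch([((0,), 1), ((1,), 2)]) maps (0,) -> [1] and (1,) -> [2]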
def rh_decoder(encodingSpec):
    tokenize, hash, bits = parseEncodingSpec(encodingSpec)
    return toBytes | tokenize | cmap(truncateHash(hash, bits)) \
         | changeWordSize(bits, 8) | cmap(chr)
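# Aside: truncateHash is defined elsewhere in this module; judging by its use
# here, it maps a token to the low `bits` bits of the token's digest. A
# plausible stand-in, assuming `hash` is a hashlib-style constructor
# (hypothetical helper, Python 2 string semantics matching the byte handling
# in this file):
import hashlib

def truncateHashSketch(hash, bits):
    def truncated(token):
        return int(hash(token).hexdigest(), 16) & ((1 << bits) - 1)
    return truncated
# truncateHashSketch(hashlib.md5, 4)("hello ") is a value in range(16)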
def streamTokenizer(stopBytes):
    "Return a tokenizer coroutine which accumulates bytes until it sees a byte in stopBytes"
    @coroutine
    def tokenizer(target):
        value = []
        while True:
            byte = yield
            value.append(byte)
            if byte in stopBytes:
                target.send("".join(value))
                value = []
    return tokenizer

words = streamTokenizer(" \n") \
    | cfilter(lambda token: token not in (" ", "\n")) \
    | cmap(lambda token: token.strip() + ' ')
appendTo(TOKENIZERS)(words)

asciiwords = streamTokenizer(" \n") \
    | cmap(lambda token: "".join(b for b in token if 0x20 <= ord(b) <= 0x7e)) \
    | cfilter(lambda token: token not in (" ", "\n")) \
    | cmap(lambda token: token.strip() + ' ')
appendTo(TOKENIZERS)(asciiwords)

lines = appendTo(TOKENIZERS)(streamTokenizer("\n"))
words2 = appendTo(TOKENIZERS)(streamTokenizer(" \n.,;?!"))
words3 = appendTo(TOKENIZERS)(
    streamTokenizer(" \n.,;?!")
    | cmap(lambda word: word[:-1] + ' ' if word.endswith("\n") and not word == "\n" else word))

MODELS = []
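# Aside: a minimal driver sketch for the tokenizers above, assuming the `|`
# combinator composes coroutine stages so that words(sink) returns a
# byte-consuming coroutine. `collectInto` is a hypothetical sink, not part
# of this module:
@coroutine
def collectInto(out):
    while True:
        out.append((yield))

# tokens = []
# t = words(collectInto(tokens))
# for byte in "hello world\n":
#     t.send(byte)
# # tokens is now ['hello ', 'world ']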
"Return a tokenizer coroutine which accumulates bytes until it sees a byte in stopBytes" @coroutine def tokenizer ( target ): value = [] while True: byte = yield value.append( byte ) if byte in stopBytes: target.send( "".join( value ) ) value = [] return tokenizer words = streamTokenizer( " \n" ) \ | cfilter( lambda token: token not in ( " ", "\n" ) ) \ | cmap( lambda token: token.strip() + ' ' ) appendTo(TOKENIZERS)( words ) lines = appendTo(TOKENIZERS)( streamTokenizer( "\n" ) ) words2 = appendTo(TOKENIZERS)( streamTokenizer( " \n.,;?!" ) ) words3 = appendTo(TOKENIZERS)( streamTokenizer( " \n.,;?!" ) | cmap( lambda word: word[:-1]+' ' if word.endswith("\n") and not word == "\n" else word ) ) MODELS = [] @appendTo(MODELS) def markov ( tokenize, hash, bits, corpusFilename, order=1, abridged=None ): truncatedHash = truncateHash( hash, bits ) corpusTokens = list( tokenize < readTextFile( corpusFilename ) )