        '''Raise ValueError if any of the output tags already exist. Note that
           there's a race condition here, so it's nothing more than a sanity
           check.'''
        for tag in self.out_tags:
            if self.fs.exists(tag):
                raise ValueError('tag %s already exists' % tag)

    def run(self, **kw):
        if 'required_modules' in kw:
            raise ValueError('required_modules is not supported by this'
                             ' subclass')
        kw['required_modules'] = [('mr_path',
                                   os.path.abspath(mr_path.__file__))]
        super(Job, self).run(**kw)


# Some mixins for various handy behavior.

class TSV_Reader_Job(object):

    @staticmethod
    def map_reader(fp, size, url, params):
        # Note: I can't find this in the docs or the source, but fp is
        # apparently a regular old open file, at least in my one-node tests.
        # I suspect this may be different when nodes start sending each other
        # data over HTTP, but we'll cross that bridge when we come to it.
        fp_unicode = io.open(fp.fileno(), encoding='utf8')
        for line in fp_unicode:
            yield line.split('\t')


testable.manualonly_register('')
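
# A minimal usage sketch, added for illustration and not part of the
# original module: map_reader appears to need nothing beyond an ordinary
# open file, so it can be exercised by hand with a temporary file (in a
# real job, Disco supplies fp). Note that the io wrapper created inside
# map_reader takes over the underlying file descriptor and closes it when
# collected, so the caller should not reuse fp afterward.
def _demo_map_reader():
    import tempfile
    fp = tempfile.TemporaryFile()
    fp.write(u'a\tb\tc\n'.encode('utf8'))
    fp.seek(0)
    rows = list(TSV_Reader_Job.map_reader(fp, None, None, None))
    # split('\t') keeps the trailing newline on the last field
    assert rows == [[u'a', u'b', u'c\n']]
    return rows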
class Tiny_ICU(base.Tzer):

    u'''Splits on whitespace, then uses ICU for Latin, Tiny for Japanese.
        Ignores everything else. E.g.:

        >>> (Tiny_ICU(1).tokenize(base.T_JP + ' ' + base.T_FR)
        ...  == base.T_JP_TOKS + base.T_FR_TOKS)
        True
        >>> Tiny_ICU(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
        True'''

    def __init__(self, ngram):
        base.Tzer.__init__(self, ngram)
        self.tiny = tiny.Tzer(ngram)
        self.icu = ICU(ngram)

    def tokenize_real(self, text):
        ws_tokens = text.split()
        tokens = []
        for ws_token in ws_tokens:
            if is_latin(ws_token):
                tokens.extend(self.icu.tokenize(ws_token))
            elif is_japanese(ws_token):
                tokens.extend(self.tiny.tokenize(ws_token))
        return tokens


testable.manualonly_register(u'''
>>> Tiny_ICU(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
True
''')
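
# A self-contained sketch of the dispatch pattern Tiny_ICU uses: split on
# whitespace, route each piece to a per-script tokenizer, and silently drop
# everything else. The predicate and the lowercasing "tokenizer" below are
# hypothetical stand-ins for is_latin()/is_japanese() and the ICU/Tiny
# tokenizers, not the module's real implementations.
import unicodedata

def _demo_is_latin(s):
    # a word counts as Latin here if every character has LATIN in its
    # Unicode name
    return all('LATIN' in unicodedata.name(c, '') for c in s)

def _demo_dispatch_tokenize(text):
    tokens = []
    for ws_token in text.split():
        if _demo_is_latin(ws_token):
            tokens.append(ws_token.lower())  # stand-in for ICU tokenization
        # as in tokenize_real(), tokens of unrecognized scripts are ignored
    return tokens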
                                  for gmm in self.all_gmms])
        if self.verbose:
            for (fv, fi) in self.feature_alphabet.items():
                l.debug('feature weight %s=%g' % (fv, res.x[fi]))
            for (t, w) in di.items():
                l.debug('token weight %s=%s' % (t, str(w)))
        # clean up
        for g in self.all_gmms:
            g.feature_vector = None
        return di


# test that self.all_gmms has stable order; disabled for now (see issue #100)
testable.manualonly_register('''
>>> import random
>>> from . import gmm
>>> def test_random():
...     u.rand = random.Random(123)
...     gmm.Token.parms_init({})
...     mp = geos.MultiPoint(geos.Point(1,2), geos.Point(3,4), srid=4326)
...     m1 = gmm.Geo_GMM.from_fit(mp, 1, 'a')
...     m2 = gmm.Geo_GMM.from_fit(mp, 2, 'b')
...     m3 = gmm.Geo_GMM.from_fit(mp, 1, 'c')
...     m = Weight([[m1, m2], [m2, m3], [m1, m3]],
...                [[100, 50], [50, 200], [80, 400]], identity_feature=True,
...                misc_feature=False)
...     return list(m.all_gmms)
>>> all(test_random()[0].tokens == test_random()[0].tokens
...     for i in range(100))
True
''')
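
# A hypothetical sketch, not the module's real code, of one way to provide
# the stable self.all_gmms ordering the disabled doctest above checks for:
# collect the GMMs in first-seen order instead of relying on set iteration
# order, which can vary between processes. Weight's actual construction of
# all_gmms may differ.
def _stable_all_gmms(gmm_rows):
    ordered = []
    seen = set()
    for row in gmm_rows:
        for g in row:
            if id(g) not in seen:  # de-duplicate by object identity
                seen.add(id(g))
                ordered.append(g)
    return ordered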