def do_test(self, m_class, db, args, i):
    self.i = i
    # create tokenizer
    tzer = u.class_by_name(args.tokenizer)(args.ngram)
    # load training & testing tweets from database
    exu = None if args.dup_users else set()
    (tr_tweets, tr_users) = self.fetch(db, args.srid, 'training', tzer,
                                       args.fields, args.unify_fields, exu)
    exu = None if args.dup_users else tr_users
    (te_tweets, _) = self.fetch(db, args.srid, 'testing', tzer,
                                args.fields, args.unify_fields, exu)
    if (not args.skip_small_tests
            or self.enough_data_p(len(tr_tweets), len(te_tweets))):
        self.attempted = True
    else:
        l.info('insufficient data, skipping test %s' % (self))
        self.attempted = False
        self.results = []
        return
    # tokenize training tweets
    tr_tokens = self.group_tokens(tr_tweets, args.trim_head,
                                  args.min_instances)
    self.train_tweet_ct = len(tr_tweets)
    self.train_token_ct = len(tr_tokens)
    # downsample test tweets
    if (len(te_tweets) > args.test_tweet_limit):
        te_tweets = u.rand.sample(te_tweets, args.test_tweet_limit)
        l.info('sampled %d test tweets per --test-tweet-limit'
               % (args.test_tweet_limit))
    self.test_tweet_ct = len(te_tweets)
    # build model
    self.model = m_class(tr_tokens, args.srid, tr_tweets)
    l.debug('starting model build')
    t_start = time.time()
    self.model.build()
    l.info('built model in %s' % (u.fmt_seconds(time.time() - t_start)))
    t_start = time.time()
    # test 'em
    self.results = multicore.do(test_tweet, (self.model, args.fields),
                                te_tweets)
    l.info('tested tweets in %s' % (u.fmt_seconds(time.time() - t_start)))
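# Editor's sketch (an assumption, not part of the original module): the calls
# to multicore.do() above are read as a parallel map whose contract is
# multicore.do(f, extra, items) == [f(*extra, item) for item in items], spread
# across worker processes. A single-process stand-in with that contract,
# handy when debugging do_test() without multiprocessing:
def _serial_do(f, extra, items):
    '''Serial stand-in for the assumed multicore.do() contract; extra must be
       a tuple of leading arguments passed to f before each item.'''
    return [f(*(extra + (item,))) for item in items]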
def wt_inv_error(tms, tweets, tokenpts, errattr):
    '''Weight of token T is min(1, |1/(1+E^x)|), where E is the mean error
       between T and each tweet in tweets having that token, using measure
       errattr ('sae' or 'cae'), and x is model parm wt_inv_error_exponent.
       The number of samples used in computing CAE is model parm
       wt_inv_sample_ct. If the number of tweets with the token is less than
       model parm wt_inv_min_tweets, the weight is 0.'''
    l.debug('computing inverse errors')
    t1 = time.time()
    # We work in chunks to keep memory use down. The chunk size is currently
    # not configurable, though we could make it so if needed.
    models = tms.values()
    weights = dict()
    x = model_parms['wt_inv_error_exponent']
    for chunk in u.groupn(models, 20000):
        weights.update((tok, min(1, abs(1 / (1 + err**x))))
                       for (tok, err)
                       in multicore.do(model_error, (errattr, tokenpts),
                                       chunk))
        l.debug('inverse error chunk completed')
    dur = time.time() - t1
    l.debug('computed inverse errors in %s (%.2gs per token)'
            % (u.fmt_seconds(dur), dur / len(models)))
    return weights
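# Worked example (editor's sketch; x = 2 is a hypothetical exponent, not a
# value taken from model_parms): the weight min(1, abs(1/(1+E**x))) maps mean
# error E onto (0, 1], so low-error tokens get weights near 1 and high-error
# tokens are nearly ignored.
def _wt_inv_error_example(x=2):
    '''Print the weight curve for a few sample mean errors E.'''
    for err in (0.0, 0.5, 1.0, 10.0):
        print('E=%5.1f  weight=%.4f' % (err, min(1, abs(1 / (1 + err**x)))))
    # E=  0.0  weight=1.0000
    # E=  0.5  weight=0.8000
    # E=  1.0  weight=0.5000
    # E= 10.0  weight=0.0099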
def build(self):
    self.token_gmms = dict(multicore.do(gmm_fit_tokenpoints, (),
                                        self.tokens.items()))
    self.token_weights = model_parms['weight_f'](self.token_gmms,
                                                 self.tweets, self.tokens)
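# Editor's sketch (hypothetical; the module's gmm_fit_tokenpoints is defined
# elsewhere): build() assumes the worker maps one (token, points) item from
# self.tokens.items() to a (token, fitted_gmm) pair, so dict() yields
# {token: gmm}. A rough serial stand-in using scikit-learn's GaussianMixture,
# which is NOT what the module itself uses:
def _gmm_fit_tokenpoints_sketch(item, n_components=1):
    '''Fit a GMM to one token's geotagged points; points must be an
       array-like of shape (n_points, 2).'''
    from sklearn.mixture import GaussianMixture
    (token, points) = item
    return (token, GaussianMixture(n_components=n_components).fit(points))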