Example 1
def do_test(self, m_class, db, args, i):
    self.i = i
    # create tokenizer
    tzer = u.class_by_name(args.tokenizer)(args.ngram)
    # load training & testing tweets from database
    exu = None if args.dup_users else set()
    (tr_tweets, tr_users) = self.fetch(db, args.srid, 'training', tzer,
                                       args.fields, args.unify_fields, exu)
    exu = None if args.dup_users else tr_users
    (te_tweets, _) = self.fetch(db, args.srid, 'testing', tzer,
                                args.fields, args.unify_fields, exu)
    if (not args.skip_small_tests or
         self.enough_data_p(len(tr_tweets), len(te_tweets))):
       self.attempted = True
    else:
       l.info('insufficient data, skipping test %s' % self)
       self.attempted = False
       self.results = []
       return
    # tokenize training tweets
    tr_tokens = self.group_tokens(tr_tweets,
                                  args.trim_head, args.min_instances)
    self.train_tweet_ct = len(tr_tweets)
    self.train_token_ct = len(tr_tokens)
    # downsample test tweets
    if (len(te_tweets) > args.test_tweet_limit):
       te_tweets = u.rand.sample(te_tweets, args.test_tweet_limit)
       l.info('sampled %d test tweets per --test-tweet-limit'
              % (args.test_tweet_limit))
    self.test_tweet_ct = len(te_tweets)
    # build model
    self.model = m_class(tr_tokens, args.srid, tr_tweets)
    l.debug('starting model build')
    t_start = time.time()
    self.model.build()
    l.info('built model in %s' % (u.fmt_seconds(time.time() - t_start)))
    t_start = time.time()
    # test 'em
    self.results = multicore.do(test_tweet,
                                (self.model, args.fields), te_tweets)
    l.info('tested tweets in %s' % (u.fmt_seconds(time.time() - t_start)))
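The final step hands the built model to multicore.do, which evidently maps a worker function over a sequence, passing the extra tuple as fixed leading arguments. A minimal sketch of that contract, assuming an ordinary process pool (quac's actual multicore module is more elaborate; this is an illustration only):

from functools import partial
from multiprocessing import Pool

def do(fn, static_args, items, processes=None):
   # Evaluate fn(*static_args, item) for every item, in parallel worker
   # processes, preserving input order -- matching the call above, which
   # computes test_tweet(model, fields, tweet) for each test tweet.
   with Pool(processes) as pool:
      return pool.map(partial(fn, *static_args), items)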
Example 2
File: gmm.py Project: aronwc/quac
def wt_inv_error(tms, tweets, tokenpts, errattr):
   '''Weight of token T is |1/E^x|, where E is the mean error between T and
      each tweet in tweets having that token, using measure errattr ('sae' or
      'cae'), and x is model parm wt_inv_error_exponent. The number of samples
      used in computing CAE is model parm wt_inv_sample_ct. If the number of
      tweets with the token is less than model parm wt_inv_min_tweets, the
      weight is 0.'''
   l.debug('computing inverse errors')
   t1 = time.time()
   # We work in chunks to keep memory use down. The chunk size is currently
   # not configurable, though we could make it so if needed.
   models = tms.values()
   weights = dict()
   x = model_parms['wt_inv_error_exponent']
   for chunk in u.groupn(models, 20000):
      weights.update((tok, min(1, abs(1/(1+err**x))))
                     for (tok, err)
                     in multicore.do(model_error, (errattr, tokenpts), chunk))
      l.debug('inverse error chunk completed')
   dur = time.time() - t1
   l.debug('computed inverse errors in %s (%.2gs per token)'
           % (u.fmt_seconds(dur), dur / len(models)))
   return weights
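Note that the expression as coded, min(1, abs(1 / (1 + err**x))), adds 1 to the denominator, which keeps the weight finite when the mean error is zero and bounds it in (0, 1]; the docstring's |1/E^x| is the idealized form. A quick worked check of the coded expression (an illustration, not part of quac; x = 0.5 is an assumed exponent value):

x = 0.5
for E in (0.0, 1.0, 4.0, 100.0):
   # Weight shrinks as mean error grows.
   w = min(1, abs(1 / (1 + E**x)))
   print(E, w)   # 1.0, 0.5, 0.333..., 0.0909...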
Example 3
File: gmm.py Project: aronwc/quac
def build(self):
    self.token_gmms = dict(multicore.do(gmm_fit_tokenpoints,
                                        (), self.tokens.items()))
    self.token_weights = model_parms['weight_f'](self.token_gmms,
                                                 self.tweets, self.tokens)
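model_parms['weight_f'] suggests the token-weighting scheme is pluggable. Since build() calls it with three arguments while wt_inv_error above takes four, the configured entry presumably binds errattr in advance. A hypothetical sketch of such wiring (not quac's actual configuration; 'sae' is one of the errattr values named in wt_inv_error's docstring, and the exponent value is made up):

import functools

model_parms = {
   # Partial application fixes errattr, leaving the (tms, tweets, tokenpts)
   # signature that build() supplies.
   'weight_f': functools.partial(wt_inv_error, errattr='sae'),
   'wt_inv_error_exponent': 0.5,
}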