Example #1
def CLI(parser, callback, log):
    #standard arguments for logging
    parser.add_argument("--silent",
                        help="Turn off logging",
                        action='store_true',
                        default=False)
    parser.add_argument("--log-file",
                        help="Output log file (default: standard output)",
                        default=None)

    args = parser.parse_args()

    #set up logging
    log.silent = args.silent
    if args.log_file:
        log.stream = open(args.log_file, 'w')

    start = datetime.now()
    log.info("Started at: %s\n" % str(start))

    #run script
    callback(args)

    end = datetime.now()
    time_taken = format_seconds((end - start).seconds)

    log.info("Completed at: %s\n" % str(end))
    log.info("- Total time: %s\n" % str(time_taken))
Example #2
    def cmd_uptime(self, args, tojid, typ="chat"):
        """
        Calculate the uptime, format it in a human-readable way, and echo it back.
        """
        uptime = datetime.datetime.today() - self.birthday
        # python >= 2.7: http://docs.python.org/library/datetime.html#datetime.timedelta.total_seconds
        #uptimestr = utils.format_seconds(uptime.total_seconds())
        # for compatibility with python < 2.7
        uptimestr = utils.format_seconds((uptime.microseconds + (uptime.seconds + uptime.days * 24 * 3600) * 10 ** 6) / 10 ** 6)
        self.send(tojid, uptimestr, typ)
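The compatibility comment above can be checked directly: on Python 2.7+ the commented-out total_seconds() call returns the same duration that the manual microseconds expression computes (on Python 2 the expression floor-divides to whole seconds, since / is integer division there).

import datetime

uptime = datetime.timedelta(days=1, seconds=5025, microseconds=250000)

manual = (uptime.microseconds + (uptime.seconds + uptime.days * 24 * 3600) * 10 ** 6) / 10 ** 6
print(manual)                  # 91425.25 on Python 3 (91425 on Python 2)
print(uptime.total_seconds())  # 91425.25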
Example #3
    def eval_bleu(self):
        self.logger.info('Evaluate dev BLEU')
        start = time.time()
        self.model.eval()
        avg_bleus = []
        dump_dir = self.args.dump_dir
        with torch.no_grad():
            for pair in self.pairs:
                self.logger.info('--> {}'.format(pair))
                src_lang, tgt_lang = pair.split('2')
                src_lang_idx = self.data_manager.lang_vocab[src_lang]
                tgt_lang_idx = self.data_manager.lang_vocab[tgt_lang]
                logit_mask = self.data_manager.logit_masks[tgt_lang]
                data = self.data_manager.translate_data[pair]
                src_batches = data['src_batches']
                sorted_idxs = data['sorted_idxs']
                ref_file = data['ref_file']

                all_best_trans, all_beam_trans = self._translate(src_batches, sorted_idxs, src_lang_idx, tgt_lang_idx, logit_mask)

                all_best_trans = ''.join(all_best_trans)
                best_trans_file = join(dump_dir, '{}_val_trans.txt.bpe'.format(pair))
                open(best_trans_file, 'w').close()
                with open(best_trans_file, 'w') as fout:
                    fout.write(all_best_trans)

                all_beam_trans = ''.join(all_beam_trans)
                beam_trans_file = join(dump_dir, '{}_beam_trans.txt.bpe'.format(pair))
                open(beam_trans_file, 'w').close()
                with open(beam_trans_file, 'w') as fout:
                    fout.write(all_beam_trans)

                # merge BPE
                nobpe_best_trans_file = join(dump_dir, '{}_val_trans.txt'.format(pair))
                ut.remove_bpe(best_trans_file, nobpe_best_trans_file)
                nobpe_beam_trans_file = join(dump_dir, '{}_beam_trans.txt'.format(pair))
                ut.remove_bpe(beam_trans_file, nobpe_beam_trans_file)

                # calculate BLEU
                bleu, msg = ut.calc_bleu(self.args.bleu_script, nobpe_best_trans_file, ref_file)
                self.logger.info(msg)
                avg_bleus.append(bleu)
                self.stats[pair]['dev_bleus'].append(bleu)

                # save translation with BLEU score for future reference
                trans_file = '{}-{}'.format(nobpe_best_trans_file, bleu)
                shutil.copyfile(nobpe_best_trans_file, trans_file)
                beam_file = '{}-{}'.format(nobpe_beam_trans_file, bleu)
                shutil.copyfile(nobpe_beam_trans_file, beam_file)

        avg_bleu = sum(avg_bleus) / len(avg_bleus)
        self.stats['avg_bleus'].append(avg_bleu)
        self.logger.info('avg_bleu = {}'.format(avg_bleu))
        self.logger.info('Done evaluating dev BLEU, it takes {} seconds'.format(ut.format_seconds(time.time() - start)))
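ut.remove_bpe is project-specific and not shown. If the BPE output uses the common '@@ ' continuation marker (an assumption about this project's tokenization, not its actual code), merging subwords back into words can be as simple as:

def remove_bpe(bpe_file, out_file, marker='@@ '):
    # 'un@@ translat@@ able' -> 'untranslatable'
    with open(bpe_file, encoding='utf-8') as fin, open(out_file, 'w', encoding='utf-8') as fout:
        for line in fin:
            fout.write(line.replace(marker, ''))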
Example #4
def trim_short_tweets(cutoff=20):
    """
    utility function for deleting short tweets from our database
    cutoff represents the rough percentage of tweets to be deleted
    """
    load_time = time.time()
    db = lite.connect(TWEET_DB_PATH)
    cursor = db.cursor()
    cursor.execute("SELECT hash FROM tweets")
    hashes = cursor.fetchall()
    hashes = set([str(h) for (h,) in hashes])
    print('extracted %i hashes in %s' % (len(hashes), utils.format_seconds(time.time()-load_time)))
    short_hashes = [h for h in hashes if len(h) < cutoff]
    print("found %i of %i hashes below %i character cutoff" % (len(short_hashes), len(hashes), cutoff))
    load_time = time.time()
    hashvals = ["'%s'" % h for h in short_hashes]
    db.execute("DELETE FROM tweets WHERE hash IN (%s)" % ",".join(hashvals))
    # self.cache.executemany("DELETE FROM tweets WHERE hash=(?)", iter(short_hashes))
    db.commit()
    print('deleted %i hashes in %s' % (len(short_hashes), utils.format_seconds(time.time()-load_time)))
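The commented-out executemany call above hints at a parameterized alternative to building the IN (...) clause by hand; a minimal standalone sketch (table and column names taken from the example, the rest assumed):

import sqlite3 as lite

def delete_short_hashes(db_path, short_hashes):
    # Let the sqlite3 driver do the quoting instead of wrapping each hash in '...'.
    db = lite.connect(db_path)
    db.executemany("DELETE FROM tweets WHERE hash = ?", ((h,) for h in short_hashes))
    db.commit()
    db.close()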
Example #5
    def train(self):
        env = self.env
        agent = self.agent
        args = self.args

        # pdb.set_trace()
        t = tqdm.tqdm()
        while not env.done():
            self.step += 1
            state = env.state()
            action = agent.get_action(state)
            _, log = env.step(action)
            self.print("> Environment step logging:\tstep %d\taction: %r\t%s" %
                       (self.step, env.wrap_action(action), log_str(log)))
            # pdb.set_trace()
            if self.step % args.rl_search_reward_interval == 0:
                reward, student_metric, log = env.reward()
                self.print(
                    "> Environment reward logging:\tstep %d\treward: %.4f, student metric: %.4f\t%s"
                    % (self.step, reward, student_metric, log_str(log)))
                agent.update_reward(reward)
                if self.step % args.rl_search_learn_interval == 0:
                    agent.learn()
                    agent.reset()
                    assert self.step % args.rl_search_save_interval == 0
                    self.save(student_metric)
            elif self.step % args.rl_search_save_interval == 0:
                student_metric, log = env.validate()
                self.print(
                    "> Environment validate logging:\tstep %d\tstudent metric: %.4f\t%s"
                    % (self.step, student_metric, log_str(log)))
                self.save(student_metric)
            t.update(1)
        self.clear_print_buffer()
        print("Avg time: %s" % format_seconds(t.avg_time))
        print("Total: %d steps" % t.n)
        print("Total time: %s" % format_seconds(t.avg_time * t.n))
        t.close()
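t.avg_time is an attribute of some tqdm releases and may not exist in the version you have installed; a version-independent way to get the same summary numbers is to time the loop yourself. A sketch, not the author's code:

import time
import tqdm

start = time.perf_counter()
t = tqdm.tqdm()
for _ in range(1000):              # stand-in for the "while not env.done()" loop
    t.update(1)
t.close()

elapsed = time.perf_counter() - start
print("Avg time: %s" % format_seconds(elapsed / max(t.n, 1)))   # format_seconds: the project helper
print("Total: %d steps" % t.n)
print("Total time: %s" % format_seconds(elapsed))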
Example #6
def archive_old_tweets(cutoff=0.2):
    """cutoff represents the rough fraction of tweets to be archived"""
    load_time = time.time()
    db = lite.connect(TWEET_DB_PATH)
    cursor = db.cursor()
    cursor.execute("SELECT id FROM tweets")
    ids = cursor.fetchall()
    ids = [str(h) for (h,) in ids]
    print('extracted %i ids in %s' % (len(ids), utils.format_seconds(time.time()-load_time)))
    ids = sorted(ids)
    tocull = int(len(ids) * cutoff)
    ids = ids[:tocull]
    print('found %i old tweets' % len(ids))
    load_time = time.time()
    ids = ["'%s'" % i for i in ids]
    # todo we actually want to archive this stuff tho
    cursor.execute("SELECT * FROM tweets WHERE id IN (%s)" % ",".join(ids))
    results = cursor.fetchall()
    db.execute("DELETE FROM tweets WHERE id IN (%s)" % ",".join(ids))
    db.commit()
    filename = "data/culled_%s.p" % time.strftime("%b%d%H%M")
    with open(filename, 'wb') as archive:
        pickle.dump(results, archive)
    print('archived %i hashes in %s' % (len(ids), utils.format_seconds(time.time()-load_time)))
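The same SELECT/DELETE pair can also be written with generated placeholders so sqlite3 handles the quoting; a sketch under the same table layout (note that SQLite caps the number of ? parameters, 999 by default, so very large id lists need batching):

import sqlite3 as lite

def fetch_and_delete(db_path, ids):
    db = lite.connect(db_path)
    placeholders = ",".join("?" * len(ids))   # one ? per id
    rows = db.execute("SELECT * FROM tweets WHERE id IN (%s)" % placeholders, ids).fetchall()
    db.execute("DELETE FROM tweets WHERE id IN (%s)" % placeholders, ids)
    db.commit()
    db.close()
    return rows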
Example #7
def get_beacon_ratio(ts):
    """
    Calculate ratio of records that were beaconing to all records.
    Detect beaconing by checking for consistent inter-arrival times.

    Assumptions:
     - beaconing is at least 50-80% of the overall activity
     - beacons don't span multiple rows over multiple seconds
    """
    ts = ts.sort_values().drop_duplicates()
    timedelta = (ts - ts.shift(1))
    ts_beacon = ts[timedelta.between(timedelta.median() - timedelta.std(),
                                     timedelta.median() + timedelta.std())]
    ratio = len(ts_beacon) / float(len(ts))
    return pd.DataFrame([{'ratio':       ratio,
                          'timedelta_s': timedelta.median().total_seconds(),
                          'timedelta_f': format_seconds(timedelta.median().total_seconds())}])
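A quick way to exercise the heuristic is a synthetic timestamp series with a regular 60-second beacon plus a bit of jitter and noise (ts is a pandas Series of timestamps, as the function above assumes):

import numpy as np
import pandas as pd

# Synthetic series: an event every 60 s with a couple of seconds of jitter,
# plus a handful of randomly timed extra events.
rng = np.random.default_rng(0)
base = pd.Timestamp('2021-01-01')
beacon = [base + pd.Timedelta(seconds=60 * i + int(rng.integers(-2, 3))) for i in range(200)]
noise = [base + pd.Timedelta(seconds=int(rng.integers(0, 12000))) for _ in range(20)]

print(get_beacon_ratio(pd.Series(beacon + noise)))
# expect a high ratio, with timedelta_s close to 60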
Example #8
    def update_console(self):
        """
        prints various bits of status information to the console.
        """
        # what all do we want to have, here? let's blueprint:
        # tweets seen: $IN_HAS_TEXT passed filter: $PASSED_F% Hits: $HITS
        seen_percent = int(100*(float(
            self.stream_handler.passed_filter)/self.stream_handler.tweets_seen))
        runtime = time.time()-self.stats.start_time

        status = (
            'tweets seen: ' + str(self.stream_handler.tweets_seen) +
            " passed filter: " + str(self.stream_handler.passed_filter) +
            " ({0}%)".format(seen_percent) +
            " hits " + str(self.stats.possible_hits) +
            " agrams: " + str(self.stats.hits) +
            " buffer: " + str(self.stream_handler.bufferlength()) +
            " runtime: " + utils.format_seconds(runtime)
        )
        sys.stdout.write(status + '\r')
        sys.stdout.flush()
Example #9
    def action_get_usage(self, **kwargs):
        endpoint = '/user/usage'
        response = self._make_request(endpoint)

        # base_bandwidth = response['baseBandwidth']
        # offer_bonus_earned = response['offerBonusEarned']
        total_limit = response['totalLimit']
        percent_used = response['percentUsed'] * 100
        balance_remaining = response['balanceRemaining']
        #
        used = total_limit - balance_remaining

        msg = 'Data plan usage information:\n\n'
        msg += (
            'You have used %d MB (%.1f%%) out of your plan of %d MB.\n\n' %
            (utils.b_to_mb(used), percent_used, utils.b_to_mb(total_limit)))

        end_time = response['endTime'] / 1000
        time_remaining = end_time - time.time()
        msg += 'Time until quota reset: '
        msg += utils.format_seconds(time_remaining)

        return msg
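utils.b_to_mb isn't shown; since the API reports byte counts and the message speaks in MB, a plausible sketch is a plain bytes-to-megabytes division (whether the project means decimal MB or binary MiB is an assumption):

def b_to_mb(num_bytes):
    # Decimal megabytes; the real helper might equally use 1024 * 1024 (MiB).
    return num_bytes / (1000.0 * 1000.0)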
Example #10
 def process_results(self, store):
     for user, edits in store.query.rawresults.user_edits.items():
         newedits = []
         for edit in edits:
             edit = [
                 s.decode("utf8") if isinstance(s, bytes) else s
                 for s in edit
             ]
             t = ''
             if store.namespaces[edit[0]]:
                 t = store.namespaces[edit[0]] + ":"
             newedits.append((t + edit[1], self.format_date(edit[2])))
         newedits.sort(key=lambda x: x[1].timestamp)
         store.query.rawresults.user_edits[user] = newedits
     common_pages = set()
     if store.allusers:
         allpages = []
         for user in store.users:
             allpages.append(
                 {p[0]
                  for p in store.query.rawresults.user_edits[user]})
         common_pages = set.intersection(*allpages)
     else:
         for user in store.users:
             unionset = set()
             for user2 in store.users:
                 if user == user2:
                     continue
                 unionset.update(
                     edit[0]
                     for edit in store.query.rawresults.user_edits[user2])
                 common_pages.update({
                     edit[0]
                     for edit in store.query.rawresults.user_edits[user]
                 } & unionset)
     # Find minimum time between edits by different users on the same page
     # make dict {page: [edit, edit, edit...]} and get the smallest
     # diff between 2 timestamps by different users
     page_scores = {}
     page_hist = {}
     for page in common_pages:
         page_edits = []
         diffs = []
         for user, edits in store.query.rawresults.user_edits.items():
             page_edits.extend((user, e[1]) for e in edits if e[0] == page)
         page_edits.sort(key=lambda x: x[1])
         page_hist[page] = page_edits
         page_edits = self.snip_between(page_edits, key=lambda ed: ed[0])
         for i in range(len(page_edits) - 1, 0, -1):
             if page_edits[i][0] != page_edits[i - 1][0]:
                 # I wish I could use the diff function I wrote :(
                 diffs.append(page_edits[i][1] - page_edits[i - 1][1])
         if diffs:
             smallest = min(diffs)
             fuzziness = timedelta(days=365)
             if smallest < fuzziness or 1 == 1:  # TODO: GET param this shit
                 page_scores[page] = smallest
     table = OrderedDict()
     for page in sorted(page_scores, key=page_scores.get):
         table[page] = []
         table[page].append(page)  # title stuff - index 0
         duration = format_seconds(page_scores[page])
         timeline_url = '/sigma/timeline.py'
         fragment = {
             "page": page,
             "users": tuple(store.users),
             "server": store.server
         }
         if store.startdate:
             fragment['startdate'] = store.startdate
         if store.enddate:
             fragment['enddate'] = store.enddate
         timeline_url += "?" + urlencode(fragment, doseq=True)
         table[page].append([duration, timeline_url
                             ])  # min time between edits - index 1
         for user in store.users:
             p_hist = page_hist[page]
             user_is_first = p_hist[0][0] == user
             editcount = len([e for e in p_hist if e[0] == user])
             table[page].append(
                 (user_is_first,
                  editcount))  # edit count and blue-ness for the user
     store.table = table
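The heart of the scoring step is: sort one page's edits by time, then keep the smallest gap between adjacent edits made by different users. The same logic on toy data, with the store objects and the snip_between helper left out:

from datetime import datetime

page_edits = [
    ("Alice", datetime(2021, 5, 1, 12, 0)),
    ("Alice", datetime(2021, 5, 1, 12, 5)),
    ("Bob",   datetime(2021, 5, 1, 12, 7)),
    ("Alice", datetime(2021, 5, 1, 14, 0)),
]
page_edits.sort(key=lambda x: x[1])

diffs = []
for i in range(len(page_edits) - 1, 0, -1):
    if page_edits[i][0] != page_edits[i - 1][0]:
        diffs.append(page_edits[i][1] - page_edits[i - 1][1])

print(min(diffs))   # 0:02:00 -- Bob edited two minutes after Alice's 12:05 edit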
Example #11
    def parse_log_file(self):
        self.runtime = None
        self.sent = None
        self.received = None
        self.speed = None

        logger.debug('Parsing log file for snapshot. %r' % {
            'volume_id': self.volume.id,
            'snapshot_id': self.id,
        })

        try:
            with open(self.log_path) as log:
                start_time = None
                line = log.readline()

                try:
                    line_split = shlex.split(line)

                    if len(line_split) >= 2:
                        # Get epoch time
                        epoch = line_split[0] + 'T' + line_split[1]
                        epoch = datetime.datetime.strptime(epoch,
                            '%Y/%m/%dT%H:%M:%S')
                        start_time = int(time.mktime(epoch.timetuple()))
                    else:
                        logger.warning('Failed to get snapshot start ' + \
                            'time from log, line split length invalid. %r' % {
                                'volume_id': self.volume.id,
                                'snapshot_id': self.id,
                                'log_line': line,
                            })

                except ValueError:
                    logger.warning('Failed to get snapshot start ' + \
                        'time from log, value error. %r' % {
                            'volume_id': self.volume.id,
                            'snapshot_id': self.id,
                            'log_line': line,
                        })

                # Get last kilobyte of file
                log.seek(0, os.SEEK_END)
                file_size = log.tell()
                log.seek(max(file_size - 1024, 0))
                lines = log.readlines()

                # Find rsync sent command line output
                for line in lines:
                    try:
                        line_split = shlex.split(line)
                    except ValueError:
                        continue

                    if len(line_split) < 10:
                        continue

                    # Get rsync command
                    command = line_split[3]
                    if command == 'sent':
                        if start_time:
                            # Get runtime
                            epoch = line_split[0] + 'T' + line_split[1]
                            epoch = datetime.datetime.strptime(epoch,
                                '%Y/%m/%dT%H:%M:%S')
                            epoch = int(time.mktime(epoch.timetuple()))
                            self.runtime = utils.format_seconds(
                                epoch - start_time)

                        # Get snapshot info
                        try:
                            self.sent = utils.format_bytes(line_split[4])
                        except ValueError:
                            logger.warning('Failed to get sent bytes ' + \
                                'from snapshot log, value error. %r' % {
                                    'volume_id': self.volume.id,
                                    'snapshot_id': self.id,
                                    'log_line': line,
                                })

                        try:
                            self.received = utils.format_bytes(line_split[7])
                        except ValueError:
                            logger.warning('Failed to get received bytes ' + \
                                'from snapshot log, value error. %r' % {
                                    'volume_id': self.volume.id,
                                    'snapshot_id': self.id,
                                    'log_line': line,
                                })

                        try:
                            self.speed = utils.format_bytes(
                                line_split[9]) + '/sec'
                        except ValueError:
                            logger.warning('Failed to get transfer speed ' + \
                                'from snapshot log, value error. %r' % {
                                    'volume_id': self.volume.id,
                                    'snapshot_id': self.id,
                                    'log_line': line,
                                })
        except IOError:
            logger.debug('Failed to read log file for ' + \
                'snapshot, IOError. %r' % {
                    'volume_id': self.volume.id,
                    'snapshot_id': self.id,
                })
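The magic indices 3, 4, 7 and 9 line up with rsync's end-of-run summary when every --log-file line starts with a date, a time, and a PID. Assuming a summary line shaped like the one below (the exact shape depends on rsync's settings), shlex.split yields:

import shlex

line = "2020/01/02 03:04:05 [1234] sent 10240 bytes  received 204800 bytes  430080.00 bytes/sec"
parts = shlex.split(line)

print(parts[3])   # 'sent'       -> command check
print(parts[4])   # '10240'      -> bytes sent
print(parts[7])   # '204800'     -> bytes received
print(parts[9])   # '430080.00'  -> transfer speed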
Example #12
def main(args):
    mini_str = '/mini' if args.mini else ''  # path to mini dataset
    version_suffix = '_v2.0' if args.squad_version == 2.0 else ''  # gets proper dataset version (1.1 or 2.0)

    # Prepare output directory under ./weights/ to store model-specific data including weights
    out_dir = 'weights/%s' % args.experiment
    if os.path.exists(out_dir):
        print(
            'Warning - you are overwriting previous experiment %s. Hit Ctrl Z to abort.\n'
            % args.experiment)
        shutil.rmtree(out_dir)
    os.mkdir(out_dir)
    logger = open(os.path.join(out_dir, 'log.txt'), 'w')
    print_and_log(
        'Timestamp = %s for %s\n' %
        (datetime.strftime(datetime.now(), '%m/%d/%Y %H:%M'), args.experiment),
        logger)

    # Load Dev Data and save it to this model's weights dir
    print_and_log('Loading v%s Dev Data...' % args.squad_version, logger)
    dev_data = load_pk('preprocess/data%s/squad_dev_trees%s.npy' %
                       (mini_str, version_suffix))
    dev_batcher = Batcher(dev_data,
                          is_train=False,
                          target_batch_size=args.batch_size)
    save_as_pk(dev_batcher, os.path.join(out_dir, 'dev_batcher.npy'))
    print_and_log('Loaded Dev Data...', logger)

    # Load Train Data and save it to this model's weights dir
    print_and_log('Loading v%s Train Data...' % args.squad_version, logger)
    train_data = load_pk('preprocess/data%s/squad_train_trees%s.npy' %
                         (mini_str, version_suffix))
    train_batcher = Batcher(train_data,
                            is_train=True,
                            target_batch_size=args.batch_size)
    print_and_log('Loaded Train Data...', logger)

    # Create models and optimizers
    span_extractor = TreeLSTM(use_cuda=args.cuda)
    answer_verifier = AnswerVerifier(use_cuda=args.cuda)

    if args.cuda:
        span_extractor.cuda()
        answer_verifier.cuda()

    span_extractor_grad_params = filter(lambda p: p.requires_grad,
                                        span_extractor.parameters())
    span_extractor_optimizer = optim.Adam(span_extractor_grad_params,
                                          args.span_extractor_lr)

    answer_verifier_grad_params = filter(lambda p: p.requires_grad,
                                         answer_verifier.parameters())
    answer_verifier_optimizer = optim.Adam(answer_verifier_grad_params,
                                           args.answer_verifier_lr)

    # Determines if question is answerable or not
    answer_verifier_logistic_loss = BCEWithLogitsLoss(
        pos_weight=span_extractor.cudify(torch.FloatTensor([0.5])))

    best_span_f1 = -1  # Keep track of which epoch model achieves highest span level F1 on the dev set
    best_answer_verifier_accuracy = -1
    best_span_epoch = -1
    best_answer_verifier_epoch = -1
    for epoch_idx in range(args.epochs):
        print_and_log('Starting Epoch %d...' % (epoch_idx + 1), logger)

        train_evaluator = Evaluator(
            'train'
        )  # Stores predictions and returns evaluation string at the end of epoch
        dev_evaluator = Evaluator('dev')

        start_time = time()

        span_extractor.train()
        answer_verifier.train()
        while train_batcher.has_next():
            # Clear gradients and get next batch
            span_extractor_optimizer.zero_grad()
            answer_verifier_optimizer.zero_grad()

            joint_loss = _run_batch(
                batch=train_batcher.next(),
                span_extractor=span_extractor,
                span_extractor_optimizer=span_extractor_optimizer,
                answer_verifier=answer_verifier,
                answer_verifier_optimizer=answer_verifier_optimizer,
                answer_verifier_logistic_loss=answer_verifier_logistic_loss,
                evaluator=train_evaluator)

            joint_loss.backward()

            # Make a gradient step
            span_extractor_optimizer.step()
            answer_verifier_optimizer.step()
        print_and_log('Took %s.' % format_seconds(time() - start_time), logger)
        print_and_log('\t' + train_evaluator.eval_string(), logger)

        span_extractor.eval()
        answer_verifier.eval()
        while dev_batcher.has_next():
            _run_batch(
                batch=dev_batcher.next(),
                span_extractor=span_extractor,
                span_extractor_optimizer=span_extractor_optimizer,
                answer_verifier=answer_verifier,
                answer_verifier_optimizer=answer_verifier_optimizer,
                answer_verifier_logistic_loss=answer_verifier_logistic_loss,
                evaluator=dev_evaluator)

        print_and_log('\t' + dev_evaluator.eval_string(), logger)
        dev_f1 = dev_evaluator.span_f1()
        if dev_f1 > best_span_f1:
            best_span_f1 = dev_f1
            best_span_epoch = epoch_idx + 1
            torch.save(span_extractor,
                       os.path.join(out_dir, 'best_span_extractor.tar'))

        dev_answer_verifier_accuracy = dev_evaluator.avg_answer_accuracy()
        if dev_answer_verifier_accuracy > best_answer_verifier_accuracy:
            best_answer_verifier_accuracy = dev_answer_verifier_accuracy
            best_answer_verifier_epoch = epoch_idx + 1
            torch.save(answer_verifier,
                       os.path.join(out_dir, 'best_answer_verifier.tar'))

    print_and_log(
        '\nBest span = %.4f F1 at %d epoch' % (best_span_f1, best_span_epoch),
        logger)
    print_and_log(
        '\nBest answer verifier = %.4f accuracy at %d epoch' %
        (best_answer_verifier_accuracy, best_answer_verifier_epoch), logger)
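print_and_log is a small project helper that isn't shown; from the way it is called (a message plus the open log file), a plausible sketch is:

def print_and_log(message, logger):
    # Assumed reconstruction: echo to stdout and append to the experiment log.
    print(message)
    logger.write(message + '\n')
    logger.flush()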
Example #13
hasrecord = (len(records) > 0)

if hasrecord:
  totalprice = float(bytesin + bytesout) / 1000 / 1000 / 1000 * price_per_gb
  starttime = records[-1].time
  starthash = records[-1].hashstr
  endtime = records[0].time
  endhash = records[0].hashstr
  log = add_log(title="BillCalc", params=[
      ("user",username),
      ("starttime",starttime.strftime(timefmt)),
      ("starthash",starthash),
      ("endtime",endtime.strftime(timefmt)),
      ("endhash",endhash),
      ("bytesin",bytesin),
      ("bytesout",bytesout),
      ("duration",duration),
      ("totalprice",totalprice)
    ])

if hasrecord:
  print("Usage for user [%s]:" % username)
  print("  From <%s> to <%s>" % (starttime.strftime("%Y-%m-%d %H:%M:%S"), endtime.strftime("%Y-%m-%d %H:%M:%S")))
  print("    Total upload: " + format_bytes(bytesin))
  print("    Total download: " + format_bytes(bytesout))
  print("    Total usage: " + format_seconds(duration))
  print("    Total price: " + str(totalprice))
  print("  Hash code: " + log.hashstr)
else:
  print("No usage for user [%s] since <%s>" % (username, datetime.datetime.strptime(starttime,timefmt).strftime("%Y-%m-%d %H:%M:%S") if starttime else "ever"))