def _feed_queue(self, first_call=False):
        """
            Read the next inqueued e-mails.

            Polls the IMAP inbox until at least one non-failed message has been
            queued; refreshes self._queue / self._uids and resets the cursor.

            :param bool first_call: Whether this is the first time this method is called (default False)
        """
        if not first_call:
            self._delete_messages()

        self._queue = []
        self._uids = []
        self._current = 0

        while not self._queue:
            _, data = self._imap.search(None, 'ALL')
            uids = data[0].split()
            # Only consider the first 10 messages, skipping known failures.
            for num in uids[:10]:
                if num in self._failed_uids:
                    continue

                _, raw_msg = self._imap.fetch(num, '(RFC822)')
                self._queue.append(raw_msg[0][1])
                self._uids.append(num)

            if not self._queue:
                LOGGER.debug('No email retrieved. Waiting before retrying.')
                time.sleep(10)
    def _create_collection(self, name):
        """
            Create a single database collection, logging the operation.

            :param str name: Name of the collection to create
        """
        LOGGER.info('Creating collection [%s]...', name)
        self._db.create_collection(name)
Example #3
0
def send_buffer(data_buffer, token):
    """Upload a buffered data pack (stamped with the system token) to the server."""
    payload = {
        'system_token': token,
        'send_timestamp': time.time(),
        'data': data_buffer,
    }
    response = requests.put(CONFIG.SERVER_ADDR + MEASURES_RES, json=payload)
    if response.ok:
        LOGGER.info('Data pack sent')
Example #4
0
    def _send_mail(self, subject, body):
        """
            Send a simple text e-mail. Settings are used to get the recipient.

            :param str subject: Subject of the email
            :param str body: Body content of the email
        """
        try:
            msg = MIMEText(body)

            msg['Subject'] = subject
            msg['From'] = settings.SCORING_EMAIL['reporting']['from']
            msg['To'] = settings.SCORING_EMAIL['reporting']['to']

            smtp = smtplib.SMTP_SSL(settings.SCORING_EMAIL['host'])
            try:
                smtp.sendmail(msg['From'], msg['To'], msg.as_string())
            finally:
                # Always close the SMTP connection, even when sendmail() raises;
                # the original leaked the connection on failure.
                smtp.quit()
        except Exception as ex:
            # Best-effort reporting: log and swallow, as before.
            LOGGER.error('Something went wrong when sending the email: %s', ex)
Example #5
0
def main():
    """
        Spamhaus blacklisted ip extracting tool entry point.
    """
    LOGGER.info("Started...")

    # Collect the raw HTML fed through stdin.
    lines = [line for line in sys.stdin]
    content = '\n'.join(lines)

    LOGGER.info("Parsing html (%d bytes)", len(content))
    documents = parse_html(content)
    LOGGER.info("%d spamhaus entries found.", len(documents))

    LOGGER.info("Updating database.")
    update_db(documents)

    LOGGER.info("Done.")
Example #6
0
    def run(self):
        """
            Run the parser.

            Iterates entries from self.next() until exhausted; skips entries
            without a managed IP or older than YESTERDAY, pushes the rest into
            the database, and logs (without aborting) any per-entry error.
        """
        with mongo.Mongo() as database:
            current = self.next()
            while current:
                try:
                    addr = self.get_ip(current)
                    if not addr:
                        LOGGER.info('Entry skipped because no specified IP.')
                        current = self.next()
                        continue

                    if not utils.is_managed_ip(addr):
                        LOGGER.debug('Not a managed IP [%s].', addr)
                        current = self.next()
                        continue

                    doc_ts = int(time.mktime(self.get_date(current).timetuple()))
                    if doc_ts < YESTERDAY:
                        LOGGER.debug('This entry is too old [%s].', self.get_date(current))
                        current = self.next()
                        continue

                    document = {
                        'ip': addr,
                        'timestamp': doc_ts,
                        'weight': self.compute_weight(current),
                        'source': self.get_source(current),
                        'raw': self.get_raw(current)
                    }
                    database.push_ip_document(document)
                except Exception as exc:
                    # `exc.message` was removed in Python 3 and would raise
                    # AttributeError inside this handler; use str(exc) instead.
                    LOGGER.error('Unexpected error: %s [%s]', type(exc), str(exc))
                    LOGGER.error(traceback.format_exc())

                current = self.next()
            self.close()
    def push_ip_document(self, input_dict):
        """
            Push a new document regarding an IP or update existing document to
            append new data.

            :param dict input_dict: Expect a dictionnary having at least those
                fields: [IP, filename, weight, source, timestamp, raw]
        """
        file_doc = self._build_file_document(input_dict)
        input_dict['filename'] = file_doc['filename']

        ip = input_dict['ip']
        if not self.does_ip_exist(ip):
            LOGGER.debug('Brand new IP [%s]. Insert...', ip)
            self._ip_collection.save(self._build_full_document(input_dict))
            self._ip_cache.append(ip)
        else:
            LOGGER.debug('IP [%s] already exists. Update...', ip)
            event = self._build_event_document(input_dict)
            self._ip_collection.update({'ip': ip},
                                       {'$push': {'events': event}})

        # The raw file document is stored in every case.
        self._raw_collection.save(file_doc)
    def get_reader_for_mail(raw):
        """
            Automatically detect the appropriate reader that will be able to
            read the passed e-mail. This method is static.

            :param str raw: The raw e-mail content
            :rtype: Object
            :return: An instance of :py:class:`parsing.mails.mailreader.AbstractMailReader`
        """
        header = settings.SCORING_EMAIL['partner_header']
        match = re.search(r'{}:\s(.*)'.format(header), raw)
        if match is None:
            raise Exception('Malformed input mail :: missing header [{}]'.format(header))

        source = match.group(1).strip()
        LOGGER.debug('Mail from %s', source)

        if source == "SpamCop":
            return spamcop.SpamcopReader(raw)
        if source in ("AOL", "SignalSpam"):
            return arf.ArfReader(raw, source)

        raise Exception(
            'Malformed input mail :: unknown value [{}] for header [{}]'.format(source, header)
        )
Example #9
0
def validate_videoQA(model, val_loader, split, task="tvqa", save_logits=False):
    """
    Run video-QA validation (or pure inference when the loader has no GT).

    Returns (val_log, results, logits): metrics dict (empty when no ground
    truth), qid -> predicted answer index, and qid -> raw logits (filled only
    when save_logits is True).
    """
    LOGGER.info(f"start running validation on {task} {split} split...")
    model.eval()
    val_loss = 0
    n_ex = 0
    tot_score = 0
    results = {}
    logits = {}
    val_log = {}
    st = time()
    has_gt_target = True
    for i, batch in enumerate(val_loader):
        targets = batch['targets']
        # A negative target marks missing annotations; once seen, stop scoring.
        if has_gt_target and targets.min() < 0:
            has_gt_target = False
            LOGGER.info(
                "No GT annotations provided, only generate predictions")
        # NOTE(review): assumes every batch carries 'qids'; otherwise `qids`
        # below is unbound (NameError) -- confirm with the dataloader.
        if 'qids' in batch:
            qids = batch['qids']
            del batch['qids']

        scores = model(batch, task, compute_loss=False)
        # Predicted answer index per example.
        answers = [
            i for i in scores.max(dim=-1, keepdim=False)[1].cpu().tolist()
        ]
        for qid, answer in zip(qids, answers):
            results[str(qid)] = answer
        if save_logits:
            # NOTE(review): this rebinds `scores` to a Python list; with GT
            # present, the cross_entropy call below would then fail -- confirm
            # save_logits is only used for inference-only runs.
            scores = scores.cpu().tolist()
            for qid, logit in zip(qids, scores):
                logits[str(qid)] = logit

        if has_gt_target:
            loss = F.cross_entropy(scores,
                                   targets.squeeze(-1),
                                   reduction='sum')
            val_loss += loss.item()
            tot_score += compute_accuracies(scores, targets)
            n_ex += len(qids)

    if has_gt_target:
        # Aggregate metrics across distributed workers before averaging.
        val_loss = sum(all_gather_list(val_loss))
        tot_score = sum(all_gather_list(tot_score))
        n_ex = sum(all_gather_list(n_ex))
        tot_time = time() - st
        val_loss /= n_ex
        val_acc = tot_score / n_ex
        val_log = {
            'valid/loss': val_loss,
            'valid/acc': val_acc,
            'valid/ex_per_s': n_ex / tot_time
        }
        LOGGER.info(f"validation finished in {int(tot_time)} seconds, "
                    f"loss:{val_loss:.2f}, score: {val_acc*100:.2f}")
    model.train()
    return val_log, results, logits
Example #10
0
def validate_mlm(model, val_loader):
    """Evaluate masked-language-model loss and accuracy over `val_loader`."""
    LOGGER.info("start running MLM validation...")
    val_loss = 0
    n_correct = 0
    n_word = 0
    st = time()
    for batch in val_loader:
        scores = model(batch, task='mlm', compute_loss=False)
        # Keep only the positions that were actually masked (label != -1).
        labels = batch['txt_labels']
        labels = labels[labels != -1]
        val_loss += F.cross_entropy(scores, labels, reduction='sum').item()
        n_correct += (scores.max(dim=-1)[1] == labels).sum().item()
        n_word += labels.numel()
    # Aggregate counters across distributed workers.
    val_loss = sum(all_gather_list(val_loss))
    n_correct = sum(all_gather_list(n_correct))
    n_word = sum(all_gather_list(n_word))
    tot_time = time() - st
    val_loss /= n_word
    acc = n_correct / n_word
    val_log = {'loss': val_loss, 'acc': acc, 'tok_per_s': n_word / tot_time}
    LOGGER.info(f"validation finished in {int(tot_time)} seconds, "
                f"acc: {acc*100:.2f}")
    return val_log
    def export_val_predictions(self, test=False, test_idx=0, threshold=0.5):
        """Export model predictions (ids, probabilities, thresholded labels) to CSV."""
        if test:
            batch_loader = self.config['test_loader'][test_idx]
        else:
            batch_loader = self.config['val_loader']
        test_name = batch_loader.dataset.name
        LOGGER.info("Exporting %s predictions..." % (test_name))
        self.model.eval()

        # Run evaluation first to populate probs/labels/id lists on self.
        _, _ = self.eval_model(test=test, test_idx=test_idx)
        val_probs = torch.tensor(self.probs_list)
        val_labels = torch.tensor(self.labels_list)
        if self.id_list:
            val_ids = torch.tensor(self.id_list)
        else:
            # No ids recorded: fall back to -1 placeholders of matching shape.
            val_ids = torch.zeros_like(val_labels) - 1
        val_preds = (val_probs > threshold).long()

        self._export_preds(val_ids,
                           val_probs,
                           val_preds,
                           labels=val_labels,
                           file_postfix="_%s_preds.csv" % test_name)

        LOGGER.info("Finished export of %s predictions" % test_name)
Example #12
0
def create_dataloaders(datasets, is_train, opts, all_img_dbs=None):
    """
    Build one dataloader per (task, dataset) pair for pretraining.

    Returns (dataloaders, all_img_dbs); for training each entry is a
    (loader, mix_ratio) tuple, for validation a PrefetchLoader.
    """
    # opts.conf_th : 0.2
    # opts.min_bb : 10
    # opts.num_bb 36
    if all_img_dbs is None:
        all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb,
                                     opts.num_bb, opts.compressed_db)
    dataloaders = {}
    for dset in datasets:
        # NOTE(review): dset is assumed to be a config dict with keys
        # 'name', 'tasks', 'mix_ratio', 'db', 'img' -- confirm schema.
        if is_train:
            assert len(dset['db']) == len(dset['img'])
            assert len(dset['tasks']) == len(dset['mix_ratio'])
            img_db = [all_img_dbs[path] for path in dset['img']]
        else:
            # Validation expects exactly one txt db / img db pair.
            assert len(dset['db']) == len(dset['img']) == 1
            img_db = all_img_dbs[dset['img'][0]]

        for i, t in enumerate(dset['tasks']):
            task = f'{t}_{dset["name"]}'

            if is_train:
                LOGGER.info(f"Loading {task} train dataset "
                            f"{dset['db']}, {[img.img_dir for img in img_db]}")
                txt_db = [
                    TxtTokLmdb(path, opts.max_txt_len) for path in dset['db']
                ]
            else:
                LOGGER.info(f"Loading {task} validation dataset, "
                            f"{dset['db']}, {img_db.img_dir}")
                txt_db = TxtTokLmdb(dset['db'][0], -1)

            # Dispatch on the task-name prefix to the matching dataset builder.
            if task.startswith('mlm'):
                dataset = build_mlm_dataset(txt_db, img_db, is_train, opts)
            elif task.startswith('mrfr'):
                dataset = build_mrfr_dataset(txt_db, img_db, is_train, opts)
            elif task.startswith('mrc'):
                dataset = build_mrc_dataset(txt_db, img_db, is_train, opts)
            elif task.startswith('itm'):
                dataset = build_itm_dataset(txt_db, img_db, is_train, opts)
            else:
                raise ValueError(f'Undefined task {task}')

            LOGGER.info(f"{len(dataset[0])*hvd.size()} samples loaded")
            if task.startswith('itm'):
                # itm handles distributed training in dset not sampler
                loader = build_dataloader_itm(*dataset, is_train, opts)
            else:
                loader = build_dataloader(*dataset, is_train, opts)
            if is_train:
                ratio = dset['mix_ratio'][i]
                dataloaders[task] = (loader, ratio)
            else:
                dataloaders[task] = PrefetchLoader(loader)
    return dataloaders, all_img_dbs
Example #13
0
def obtain_system_token():
    """
    Return the system token, reading it from CONFIG.TOKEN_FILE when possible.

    Falls back to aquire_token() when the file does not contain the expected
    TOKEN_START / token / TOKEN_END three-line layout.
    """
    LOGGER.info('Trying to read token from file: %s', CONFIG.TOKEN_FILE)
    # Use a context manager so the file handle is always closed (the original
    # leaked it). NOTE(review): a missing file still raises -- confirm callers
    # guarantee the file exists.
    with open(CONFIG.TOKEN_FILE, 'r') as token_file:
        file_content = token_file.read().splitlines()

    # Need at least 3 lines: the original `>= 2` guard still allowed the
    # `file_content[2]` access to raise IndexError on a 2-line file.
    if len(file_content) >= 3 and file_content[0] == TOKEN_START and file_content[2] == TOKEN_END:
        LOGGER.info('Reading token from file succeeded')
        return file_content[1]
    else:
        LOGGER.warn('Reading token from file failed')
        return aquire_token()
Example #14
0
def create_dataloaders(datasets, is_train, opts, all_img_dbs=None):
    """
    Build one dataloader per (task, dataset) pair.

    :param datasets: list of dataset config dicts with keys 'name', 'tasks',
        'mix_ratio' (schema assumed from usage -- confirm).
    :param bool is_train: build training loaders (with mix ratios) when True.
    :param opts: experiment options namespace (db paths, bb thresholds, ...).
    :param all_img_dbs: optional pre-built ImageLmdbGroup to reuse.
    :return: (dataloaders dict, all_img_dbs)
    """
    # Only build the image-db group when none was supplied: the parameter was
    # previously ignored and unconditionally overwritten, unlike the sibling
    # create_dataloaders implementation above.
    if all_img_dbs is None:
        all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb,
                                     opts.num_bb, opts.compressed_db)
    dataloaders = {}
    for dset in datasets:
        if is_train:
            txt_path = opts.train_txt_dbs
            img_path = opts.train_img_dbs
        else:
            txt_path = opts.val_txt_dbs
            img_path = opts.val_img_dbs

        for i, t in enumerate(dset['tasks']):
            task = f'{t}_{dset["name"]}'
            if is_train:
                LOGGER.info(f"Loading {task} train dataset "
                            f"{dset['db']}, {dset['img']}")
            else:
                LOGGER.info(f"Loading {task} validation dataset, "
                            f"{dset['db']}, {dset['img']}")

            # Dispatch on the task-name prefix to the matching dataset builder.
            if task.startswith('mlm'):
                dataset = build_mlm_dataset(txt_path, img_path, all_img_dbs,
                                            is_train, opts)
            elif task.startswith('mrfr'):
                dataset = build_mrfr_dataset(txt_path, img_path, all_img_dbs,
                                             is_train, opts)
            elif task.startswith('mrckl'):
                dataset = build_mrc_dataset(txt_path, img_path, all_img_dbs,
                                            is_train, opts)
            elif task.startswith('itm'):
                dataset = build_itm_dataset(txt_path, img_path, all_img_dbs,
                                            is_train, opts)
            elif task.startswith('itkm'):
                dataset = build_itkm_dataset(txt_path, img_path, all_img_dbs,
                                             is_train, opts)
            elif task.startswith('mkm'):
                dataset = build_mkm_dataset(txt_path, img_path, all_img_dbs,
                                            is_train, opts)
            else:
                raise ValueError(f'Undefined task {task}')

            LOGGER.info(f"{len(dataset[0])*hvd.size()} samples loaded")
            if task.startswith('itm'):
                # itm handles distributed training in dset not sampler
                loader = build_dataloader_itm(*dataset, is_train, opts)
            else:
                loader = build_dataloader(*dataset, is_train, opts)
            if is_train:
                ratio = dset['mix_ratio'][i]
                dataloaders[task] = (loader, ratio)
            else:
                dataloaders[task] = PrefetchLoader(loader)
    return dataloaders, all_img_dbs
Example #15
0
def aquire_token():
    """
    Request a fresh token from the server and persist it to CONFIG.TOKEN_FILE.

    :return: the newly generated token string
    :raises TokenNotFoundException: when the server refuses to issue a token
    """
    LOGGER.info('Trying to aquire token from server')
    req = requests.put(CONFIG.SERVER_ADDR + TOKENS_RES, json={'system_name': CONFIG.SYSTEM_NAME})
    if not req.ok:
        LOGGER.error('Failed to aquire token from server')
        raise TokenNotFoundException()

    LOGGER.info('Aquiring token from server succeeded')
    generated_token = req.json()['generated_token']
    # Only open (and truncate) the token file once a token is actually in hand:
    # the original truncated the existing file before the request and leaked
    # the handle when the request failed.
    with open(CONFIG.TOKEN_FILE, 'w') as token_file:
        token_file.write('\n'.join([TOKEN_START, generated_token, TOKEN_END]))
    return generated_token
Example #16
0
def main():
    """Process every dataset that has a concept map, in parallel workers."""
    args = get_args()
    helper.print_script_args_and_info(args)
    os.makedirs(args.embeddings_result_folder, exist_ok=True)

    LOGGER.info('Loading pre-trained embedding')

    LOGGER.info('Starting to process datasets')
    dataset_names = dataset_helper.get_dataset_names_with_concept_map(
        limit_datasets=args.limit_dataset)
    Parallel(n_jobs=args.n_jobs)(delayed(process_dataset)(name, args)
                                 for name in dataset_names)
    LOGGER.info('Finished')
Example #17
0
    def purge_old_documents(self):
        """
            Archive IP sub-documents older than a month ago. These documents are moved
            into a dedicated archiving collection.
        """
        total_count = 0
        # Matches IP documents having at least one event older than a month.
        request = {
            'events.timestamp': {
                '$lt': A_MONTH_AGO
            }
        }

        LOGGER.debug("Archiving events older than %d...", A_MONTH_AGO)
        for doc in self._ip_collection.find(request):
            archives_bulk = []

            for event in doc['events']:
                # All documents having at least 1 timestamp < A_MONTH_AGO are retrieved.
                # This condition removes subdocuments that do not match.
                if event['timestamp'] < A_MONTH_AGO:
                    archives_bulk.append({
                        'ip': doc['ip'],
                        'filename': event['filename'],
                        'source': event['source'],
                        'weight': event['weight'],
                        'timestamp': event['timestamp']
                    })

            # insert() returns the list of inserted ids; its length is the
            # number of archived events for this IP.
            result = self._archive_collection.insert(archives_bulk)
            total_count = total_count + len(result)

        # Strip the archived events from the live documents.
        # NOTE(review): this update's return value is unused (rebound below) --
        # presumably intentional, confirm.
        result = self._ip_collection.update(request, {
            '$pull': {
                'events': {
                    'timestamp': {
                        '$lt': A_MONTH_AGO
                    }
                }
            }
        }, multi=True)
        LOGGER.info('%d documents archived.', total_count)

        # Remove single entries
        result = self._ip_collection.remove({
            'events.timestamp': {
                '$exists': False
            }
        }, multi=True)

        LOGGER.info('%d single entries have been removed.', result['n'])
Example #18
0
    def purge_old_documents(self):
        """
            Archive IP sub-documents older than a month ago. These documents are moved
            into a dedicated archiving collection.
        """
        total_count = 0
        request = {'events.timestamp': {'$lt': A_MONTH_AGO}}

        LOGGER.debug("Archiving events older than %d...", A_MONTH_AGO)
        for doc in self._ip_collection.find(request):
            # Copy every expired event of this IP into the archive collection.
            expired_events = [
                {
                    'ip': doc['ip'],
                    'filename': event['filename'],
                    'source': event['source'],
                    'weight': event['weight'],
                    'timestamp': event['timestamp'],
                }
                for event in doc['events']
                if event['timestamp'] < A_MONTH_AGO
            ]

            inserted = self._archive_collection.insert(expired_events)
            total_count += len(inserted)

        # Strip the archived events from the live documents.
        self._ip_collection.update(
            request,
            {'$pull': {'events': {'timestamp': {'$lt': A_MONTH_AGO}}}},
            multi=True)
        LOGGER.info('%d documents archived.', total_count)

        # Remove single entries
        result = self._ip_collection.remove(
            {'events.timestamp': {'$exists': False}}, multi=True)

        LOGGER.info('%d single entries have been removed.', result['n'])
Example #19
0
def get_response(req_type="POST",
                 url=None,
                 data=None,
                 headers=DEFAULT_HEADER,
                 cookies=COOKIES):
    """
    Perform an HTTP request (POST or GET) and return the response body text.

    Returns None on connection error or on a non-OK status code. On success,
    merges any cookies from the response (and redirect history) into the
    module-level COOKIES dict, which is deliberately shared across calls.
    """
    LOGGER.info(url + " " + str(data) + " " + str(COOKIES))
    try:
        if req_type.upper() == "POST":

            r = requests.post(url=url,
                              data=data,
                              headers=headers,
                              allow_redirects=True,
                              cookies=cookies)

        elif req_type.upper() == "GET":
            # NOTE(review): params are joined without URL-encoding; assumes
            # callers pass pre-encoded string values -- confirm.
            param_list = []
            for key, value in data.items():
                param_list.append(key + "=" + value)
            r = requests.get(url=url + "?" + "&".join(param_list),
                             data={},
                             headers=headers,
                             allow_redirects=True,
                             cookies=cookies)
        else:
            raise TypeError("http method error")
    except (requests.exceptions.ConnectionError, TypeError) as e:
        LOGGER.error("send request fail " + str(e))
        return None

    if r.status_code == requests.codes.ok:
        # LOGGER.info(r.text)
        # Update the shared COOKIES with any cookies set by the response.
        if len(r.cookies) != 0:
            COOKIES.update(r.cookies)
        # Also pick up cookies set along the redirect chain.
        for res in r.history:
            if len(res.cookies) != 0:
                COOKIES.update(res.cookies)

        return r.text
    else:
        LOGGER.error("status code " + str(r.status_code))
        return None
Example #20
0
def evaluate(model, eval_loader):
    """
    Evaluate referring-expression grounding: pick the highest-scoring box per
    sentence and count IoU > 0.5 hits against the target box.

    Returns (val_log, results) where results holds per-sentence predicted and
    target boxes.
    """
    LOGGER.info("start running evaluation...")
    model.eval()
    tot_score = 0
    n_ex = 0
    st = time()
    predictions = []
    for i, batch in enumerate(eval_loader):
        (tgt_box_list, obj_boxes_list, sent_ids) = (
            batch['tgt_box'], batch['obj_boxes'], batch['sent_ids'])
        # scores (n, max_num_bb)
        scores = model(batch, compute_loss=False)
        ixs = torch.argmax(scores, 1).cpu().detach().numpy()  # (n, )

        # pred_boxes
        for ix, obj_boxes, tgt_box, sent_id in \
                zip(ixs, obj_boxes_list, tgt_box_list, sent_ids):
            pred_box = obj_boxes[ix]
            predictions.append({'sent_id': int(sent_id),
                                'pred_box': pred_box.tolist(),
                                'tgt_box': tgt_box.tolist()})
            # A prediction counts as correct when IoU with the target > 0.5.
            if eval_loader.loader.dataset.computeIoU(pred_box, tgt_box) > .5:
                tot_score += 1
            n_ex += 1
        if i % 100 == 0 and hvd.rank() == 0:
            n_results = len(predictions)
            n_results *= hvd.size()   # an approximation to avoid hangs
            LOGGER.info(f'{n_results}/{len(eval_loader.dataset)} '
                        'answers predicted')
    # Aggregate counts across distributed workers.
    n_ex = sum(all_gather_list(n_ex))
    tot_time = time()-st
    tot_score = sum(all_gather_list(tot_score))
    val_acc = tot_score / n_ex
    val_log = {'valid/acc': val_acc, 'valid/ex_per_s': n_ex/tot_time}
    model.train()
    LOGGER.info(f"validation ({n_ex} sents) finished in"
                f" {int(tot_time)} seconds"
                f", accuracy: {val_acc*100:.2f}%")
    # summarize
    results = {'acc': val_acc, 'predictions': predictions}
    return val_log, results
Example #21
0
File: views.py Project: wohui/pxiv
 def get_hito_data(request):
     """Fetch up to 10 hitokoto quotes from the public API; return a JsonResponse."""
     if request.method == 'POST':
         # Fetch hitokoto ("one sentence") quotes from the API.
         hitoko_url = "https://api.imjad.cn/hitokoto"
         hitoko_data = {
             "c": "c",
             "encode": "json",
             "charset": "utf-8",
             "length": 50,
         }
         quotes = []
         for _ in range(0, 10):
             api_res = requests.get(hitoko_url,
                                    hitoko_data,
                                    verify=False)
             json_obj = json.loads(str(api_res.text))
             if json_obj:
                 hito_text = json_obj['hitokoto']
                 LOGGER.info(hito_text)
                 quotes.append(hito_text)
         res = {}
         if quotes:
             res['msg'] = "获取一言完成"
             res['status'] = True
             LOGGER.info("获取一言完成")
         else:
             res['msg'] = "获取一言失败"
             res['status'] = False
             LOGGER.error("获取一言失败")
         return JsonResponse({
             "msg": res['msg'],
             "status": res['status'],
             "data": quotes,
         })
Example #22
0
def validate(loader, generator, tokenizer, evaluator):
    """Greedy-decode captions for every clip in `loader` and score them (rank 0)."""
    st = time()
    generator.model.eval()
    results = []
    for batch in loader:
        decoded = generator.greedy_decode(batch)
        meta = zip(batch['vid_names'], batch['clip_ids'],
                   batch['all_ts'], decoded)
        for vid, cid, ts, out_ids in meta:
            tokens = tokenizer.convert_ids_to_tokens(out_ids)
            output = tokenizer.convert_tokens_to_string(tokens)
            results.append({'vid_name': vid, 'clip_id': cid, 'ts': ts,
                            'descs': [{'desc': output}]})
    # Gather every worker's results into one flat list.
    results = [r for rs in all_gather_list(results) for r in rs]
    LOGGER.info(f'decoding finished in {int(time() - st)} seconds')
    if hvd.rank() == 0:
        val_log = evaluator(results)
        LOGGER.info(f'Validation finished in {int(time() - st)} seconds')
        LOGGER.info(f'CIDEr: {val_log["CIDEr"]}')
    else:
        val_log = {}
    generator.model.train()
    return val_log, results
Example #23
0
    parser.add_argument('--race_gender_hidden_size',
                        type=int,
                        default=0,
                        help='Hidden size for race and gender')

    args, unparsed = parser.parse_known_args()
    config = args.__dict__
    wandb.config.update(config)
    config['device'] = get_device()
    config['n_classes'] = 2 if config['loss_func'] == 'ce' else 1

    # Check all provided paths:
    if not os.path.exists(config['data_path']):
        raise ValueError("[!] ERROR: Dataset path does not exist")
    else:
        LOGGER.info("Data path checked..")
    if not os.path.exists(config['model_path']):
        LOGGER.warning(
            "Creating checkpoint path for saved models at:  {}\n".format(
                config['model_path']))
        os.makedirs(config['model_path'])
    else:
        LOGGER.info("Model save path checked..")
    if 'config' in config:
        if not os.path.isfile(config['config']):
            raise ValueError("[!] ERROR: config JSON path does not exist")
        else:
            LOGGER.info("config JSON path checked..")
    if not os.path.exists(config['vis_path']):
        LOGGER.warning(
            "Creating checkpoint path for Tensorboard visualizations at:  {}\n"
Example #24
0
def main(opts):
    """
    VQA evaluation entry point: load DBs and a checkpoint, run evaluation,
    and dump gathered results (and optionally logits) from rank 0.
    """
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))

    # Hyper-parameters of the trained model are restored from its output dir.
    hps_file = f"{opts.output_dir}/log/hps.json"
    model_opts = Struct(json.load(open(hps_file)))

    # train_examples = None
    ans2label_file = f"{opts.output_dir}/ckpt/ans2label.json"
    ans2label = json.load(open(ans2label_file))
    label2ans = {label: ans for ans, label in ans2label.items()}

    # load DBs and image dirs
    eval_img_db = DetectFeatLmdb(
        opts.img_db,
        model_opts.conf_th,
        model_opts.max_bb,
        model_opts.min_bb,
        model_opts.num_bb,
        opts.compressed_db,
    )
    eval_txt_db = TxtTokLmdb(opts.txt_db, -1)
    eval_dataset = VqaEvalDataset(len(ans2label), eval_txt_db, eval_img_db)

    # Prepare model
    # opts.checkpoint may be a path or a step number; resolve accordingly.
    if exists(opts.checkpoint):
        ckpt_file = opts.checkpoint
    else:
        ckpt_file = f"{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt"
    checkpoint = torch.load(ckpt_file)
    model = UniterForVisualQuestionAnswering.from_pretrained(
        f"{opts.output_dir}/log/model.json",
        checkpoint,
        img_dim=IMG_DIM,
        num_answer=len(ans2label),
    )
    model.to(device)
    if opts.fp16:
        model = amp.initialize(model, enabled=True, opt_level="O2")

    # Token-bucket batching groups similar-length examples together.
    sampler = TokenBucketSampler(
        eval_dataset.lens,
        bucket_size=BUCKET_SIZE,
        batch_size=opts.batch_size,
        droplast=False,
    )
    eval_dataloader = DataLoader(
        eval_dataset,
        batch_sampler=sampler,
        num_workers=opts.n_workers,
        pin_memory=opts.pin_mem,
        collate_fn=vqa_eval_collate,
    )
    eval_dataloader = PrefetchLoader(eval_dataloader)

    val_log, results, logits = evaluate(model, eval_dataloader, label2ans,
                                        opts.save_logits)
    result_dir = f"{opts.output_dir}/results_test"
    if not exists(result_dir) and rank == 0:
        os.makedirs(result_dir)

    # Gather per-worker results; only rank 0 writes the output files.
    all_results = list(concat(all_gather_list(results)))
    if opts.save_logits:
        all_logits = {}
        for id2logit in all_gather_list(logits):
            all_logits.update(id2logit)
    if hvd.rank() == 0:
        with open(f"{result_dir}/"
                  f"results_{opts.checkpoint}_all.json", "w") as f:
            json.dump(all_results, f)
        if opts.save_logits:
            np.savez(f"{result_dir}/logits_{opts.checkpoint}_all.npz",
                     **all_logits)
Example #25
0
def main(opts):
    """Train HeroForVcmr on a video-corpus-moment-retrieval (VCMR) task.

    Initializes Horovod and the per-rank CUDA device, loads the video
    (+subtitle) DBs and query text DBs, builds train/val/test loaders,
    restores an optional checkpoint, then runs the step-based training
    loop with apex amp mixed precision, gradient accumulation, and
    periodic full-VCMR validation; final checkpoints/results are written
    under ``opts.output_dir``.

    :param opts: argparse-style namespace of training options (DB paths,
        task name, loss weights, batch sizes, fp16, num_train_steps, ...).
    """
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    opts.n_gpu = n_gpu
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))

    # Only rank 0 logs; all other ranks are silenced.
    if hvd.rank() != 0:
        LOGGER.disabled = True
    set_random_seed(opts.seed)

    # train_examples = None
    LOGGER.info(f"Loading the whole video dataset {opts.sub_txt_db}, "
                f"{opts.vfeat_db}")
    # Video-only variant of didemo has no subtitle DB; it needs the query
    # meta.json instead to build the video dataset.
    if opts.task != "didemo_video_only":
        video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                          opts.vfeat_interval, opts)
    else:
        txt_meta = load_json(join(opts.train_query_txt_db, "meta.json"))
        video_db = load_video_only_dataset(opts.vfeat_db, txt_meta,
                                           opts.vfeat_interval, opts)

    # data loaders
    # train
    video_ids = get_video_ids(opts.train_query_txt_db)
    train_q_txt_db = QueryTokLmdb(opts.train_query_txt_db, opts.max_txt_len)
    train_dataloaders = build_downstream_dataloaders([opts.task],
                                                     video_db,
                                                     video_ids,
                                                     True,
                                                     opts,
                                                     shuffle=True,
                                                     q_txt_db=train_q_txt_db)
    # MetaLoader multiplexes the per-task loaders and handles accumulation
    # across distributed workers.
    meta_loader = MetaLoader(train_dataloaders,
                             accum_steps=opts.gradient_accumulation_steps,
                             distributed=n_gpu > 1)
    meta_loader = PrefetchLoader(meta_loader)

    # val
    # -1 max_txt_len = no truncation for validation queries.
    video_ids = get_video_ids(opts.val_query_txt_db)
    val_q_txt_db = QueryTokLmdb(opts.val_query_txt_db, -1)
    val_dataloaders = build_downstream_dataloaders([opts.task],
                                                   video_db,
                                                   video_ids,
                                                   False,
                                                   opts,
                                                   q_txt_db=val_q_txt_db)

    if opts.task != "didemo_video_only":
        inf_dataset = VcmrFullEvalDataset
    else:
        inf_dataset = VcmrVideoOnlyFullEvalDataset
    LOGGER.info(f"Loading Inference Dataset {opts.val_query_txt_db} (val)")
    val_dset = inf_dataset(video_ids,
                           video_db,
                           val_q_txt_db,
                           distributed=opts.distributed_eval)
    inf_loader_val = DataLoader(val_dset,
                                batch_size=opts.vcmr_eval_q_batch_size,
                                num_workers=opts.n_workers,
                                pin_memory=opts.pin_mem,
                                collate_fn=vcmr_full_eval_collate)
    inf_loader_val = PrefetchLoader(inf_loader_val)
    # Optional test split: only built when a test query DB is configured.
    if opts.test_query_txt_db:
        LOGGER.info(
            f"Loading Inference Dataset {opts.test_query_txt_db} (test)")
        video_ids = get_video_ids(opts.test_query_txt_db)
        test_q_txt_db = QueryTokLmdb(opts.test_query_txt_db, -1)
        test_dset = inf_dataset(video_ids,
                                video_db,
                                test_q_txt_db,
                                distributed=opts.distributed_eval)
        inf_loader_test = DataLoader(test_dset,
                                     batch_size=opts.vcmr_eval_q_batch_size,
                                     num_workers=opts.n_workers,
                                     pin_memory=opts.pin_mem,
                                     collate_fn=vcmr_full_eval_collate)
        inf_loader_test = PrefetchLoader(inf_loader_test)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    # If the checkpoint carries image position embeddings, derive the max
    # frame sequence length from their count; otherwise fall back to the
    # module-level default.
    img_pos_embed_weight_key = "v_encoder.f_encoder.img_embeddings" +\
        ".position_embeddings.weight"
    if img_pos_embed_weight_key in checkpoint:
        max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])
    else:
        max_frm_seq_len = MAX_FRM_SEQ_LEN

    # lw_st_ed starts at 0; span loss is switched on later via
    # set_train_st_ed once train_span_start_step is reached.
    model = HeroForVcmr.from_pretrained(
        opts.model_config,
        state_dict=checkpoint,
        vfeat_dim=VFEAT_DIM,
        max_frm_seq_len=max_frm_seq_len,
        lw_neg_ctx=opts.lw_neg_ctx,
        lw_neg_q=opts.lw_neg_q,
        lw_st_ed=0,
        ranking_loss_type=opts.ranking_loss_type,
        use_hard_negative=False,
        hard_pool_size=opts.hard_pool_size,
        margin=opts.margin,
        use_all_neg=opts.use_all_neg,
        drop_svmr_prob=opts.drop_svmr_prob)

    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    # One amp loss scaler per task so scales are tracked independently.
    task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())}
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      num_losses=len(task2scaler),
                                      enabled=opts.fp16,
                                      opt_level='O2')
    restorer = TrainingRestorer(opts, model, optimizer)
    global_step = restorer.global_step
    TB_LOGGER.global_step = global_step
    if hvd.rank() == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        if not exists(join(opts.output_dir, 'results')):
            # store tvr predictions
            os.makedirs(join(opts.output_dir, 'results'))
        if opts.nms_thd != -1:
            # store tvr-nms predictions
            if not exists(join(opts.output_dir, 'results_nms')):
                os.makedirs(join(opts.output_dir, 'results_nms'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        # Non-zero ranks use no-op stand-ins so the loop body stays
        # rank-agnostic.  NOTE(review): the real TrainingRestorer created
        # above is discarded here -- presumably only rank 0 checkpoints;
        # confirm against TrainingRestorer's semantics.
        pbar = NoOp()
        model_saver = NoOp()
        restorer = NoOp()

    # When resuming, fast-forward the progress bar to the restored step.
    if global_step > 0:
        pbar.update(global_step)
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    # Running loss meters: one per task, plus the three VCMR components.
    task2loss = {
        task: RunningMeter(f'loss/{task}')
        for task in train_dataloaders.keys()
    }

    for obj in (f'{opts.task}_st_ed', f'{opts.task}_neg_ctx',
                f'{opts.task}_neg_q'):
        task2loss[obj] = RunningMeter(f'loss/{obj}')
    model.train()
    n_examples = defaultdict(int)
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    if global_step == 0:
        optimizer.step()
    for step, (task, batch) in enumerate(meta_loader):
        # Enable hard-negative mining once the configured step is reached.
        # NOTE(review): "negtiave" is a typo carried by the config option
        # name; renaming it would break existing configs.
        if len(opts.hard_negtiave_start_step) > 0:
            for i, hn_step in enumerate(opts.hard_negtiave_start_step):
                if global_step >= hn_step and hn_step != -1:
                    model.set_hard_negative(True, opts.hard_pool_size[i],
                                            opts.hard_neg_weights[i])
        # Turn on span (start/end) loss after its warm-up delay.
        if opts.train_span_start_step != -1 and\
                global_step >= opts.train_span_start_step:
            model.set_train_st_ed(opts.lw_st_ed)

        n_examples[task] += opts.train_batch_size

        loss = model(batch, task=task, compute_loss=True)

        loss_st_ed, loss_neg_ctx, loss_neg_q = loss
        loss = loss_st_ed + loss_neg_ctx + loss_neg_q
        # Log each component un-weighted (divide the weight back out).
        for n, ls, w in (('st_ed', loss_st_ed, opts.lw_st_ed),
                         ('neg_ctx', loss_neg_ctx, opts.lw_neg_ctx),
                         ('neg_q', loss_neg_q, opts.lw_neg_q)):
            ls = ls.item()
            if w:
                ls /= w
            task2loss[f'{task}_{n}'](ls)

        loss = loss.mean()
        task2loss[task](loss.item())

        # Keep the loss scaled (delay unscale) until the final micro-batch
        # of the accumulation window.
        delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
        with amp.scale_loss(loss,
                            optimizer,
                            delay_unscale=delay_unscale,
                            loss_id=task2scaler[task]) as scaled_loss:
            scaled_loss.backward()
            if not delay_unscale:
                # gather gradients from every processes
                # do this before unscaling to make sure every process uses
                # the same gradient scale
                grads = [
                    p.grad.data for p in model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                all_reduce_and_rescale_tensors(grads, float(1))

        if (step + 1) % opts.gradient_accumulation_steps == 0:
            global_step += 1

            # learning rate scheduling
            lr_this_step = get_lr_sched(global_step, opts)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

            # log loss
            TB_LOGGER.log_scaler_dict({
                temp_loss.name: temp_loss.val
                for temp_loss in task2loss.values()
                if temp_loss.val is not None
            })
            TB_LOGGER.step()

            # update model params
            if opts.grad_norm != -1:
                grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                            opts.grad_norm)
                TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
            optimizer.step()
            optimizer.zero_grad()
            pbar.update(1)

            if global_step % 100 == 0:
                # monitor training throughput
                LOGGER.info('-------------------------------------------')
                LOGGER.info(f'Step {global_step}:')
                for t in train_dataloaders.keys():
                    tot_ex = sum(all_gather_list(n_examples[t]))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f'{t}: {tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec,
                                         global_step)

            if global_step % opts.valid_steps == 0:
                LOGGER.info('===========================================')
                LOGGER.info(f"Step {global_step}: start running validation")
                validate(model, val_dataloaders, opts)
                # Full VCMR eval: every rank participates when distributed
                # eval is enabled, otherwise only rank 0.
                if hvd.rank() == 0 or opts.distributed_eval:
                    log, results = validate_full_vcmr(model,
                                                      inf_loader_val,
                                                      'val',
                                                      opts,
                                                      model_opts=opts)
                    save_json(
                        results, f'{opts.output_dir}/results/'
                        f'val_results_{global_step}_rank{hvd.rank()}.json')
                    TB_LOGGER.log_scaler_dict(log)
                    if opts.test_query_txt_db:
                        log, results = validate_full_vcmr(model,
                                                          inf_loader_test,
                                                          'test',
                                                          opts,
                                                          model_opts=opts)
                        save_json(
                            results, f'{opts.output_dir}/results/'
                            f'test_results_{global_step}_rank{hvd.rank()}.json'
                        )
                        TB_LOGGER.log_scaler_dict(log)
                LOGGER.info('===========================================')
                model_saver.save(model, global_step)

            # step restorer in the end to prevent missing validation checkpoint
            restorer.step()
        if global_step >= opts.num_train_steps:
            break

    LOGGER.info('===========================================')
    # Final evaluation, unless the loop happened to end exactly on a
    # validation step (already evaluated above).
    if global_step % opts.valid_steps != 0:
        if hvd.rank() == 0 or opts.distributed_eval:
            log, results = validate_full_vcmr(model,
                                              inf_loader_val,
                                              'val',
                                              opts,
                                              model_opts=opts)
            save_json(
                results, f'{opts.output_dir}/results/'
                f'val_results_{global_step}'
                f'_rank{hvd.rank()}_final.json')
            TB_LOGGER.log_scaler_dict(log)
            if opts.test_query_txt_db:
                log, results = validate_full_vcmr(model,
                                                  inf_loader_test,
                                                  'test',
                                                  opts,
                                                  model_opts=opts)
                save_json(
                    results, f'{opts.output_dir}/results/'
                    f'test_results_{global_step}_rank{hvd.rank()}.json')
                TB_LOGGER.log_scaler_dict(log)
    model_saver.save(model, f'{global_step}_final')
Example #26
0
def main(opts):
    """Train the Meme binary classifier.

    Initializes Horovod and the per-rank CUDA device, builds train/val
    dataloaders over the meme dataset (hard-coded data locations), loads
    an optional checkpoint into the ``Meme`` model, then trains for
    ``opts.epoch`` epochs with apex amp mixed precision and gradient
    accumulation.  Per epoch it logs BCE loss, ROC-AUC and accuracy, runs
    validation, and saves a checkpoint.

    :param opts: argparse-style namespace of training options (paths,
        batch sizes, fp16, gradient accumulation, epoch count, ...).
    """
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank

    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), opts.fp16))
    # BUG FIX: removed a leftover `device = torch.device("cuda:1")` debug
    # override here -- it unconditionally clobbered the per-rank device
    # selected above, breaking multi-GPU runs and crashing on hosts with
    # a single GPU.
    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                            opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)

    if hvd.rank() == 0:
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        # exist_ok=True so restarted runs do not crash on pre-existing
        # directories (consistent with the other training scripts).
        os.makedirs(join(opts.output_dir, 'ckpt'), exist_ok=True)
        save_training_meta(opts)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
        # store ITM predictions
        os.makedirs(join(opts.output_dir, 'results_val'), exist_ok=True)
        os.makedirs(join(opts.output_dir, 'results_test'), exist_ok=True)
        os.makedirs(join(opts.output_dir, 'results_train'), exist_ok=True)
    else:
        LOGGER.disabled = True
        model_saver = NoOp()

    # load DBs and image dirs
    all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb,
                                 opts.num_bb, opts.compressed_db)
    # train loader (data locations are hard-coded in the original setup)
    train_dataset = MemeAIDataset(
        json_path='/home/data/meme_json/train.json',
        npz_folder='/home/data/faster_cnn_feature/',
        mode='train')
    train_loader = DataLoader(train_dataset,
                              batch_size=opts.train_batch_size,
                              shuffle=True,
                              num_workers=opts.n_workers,
                              collate_fn=collate_fn)
    train_loader = PrefetchLoader(train_loader)

    # val loader
    val_dataset = MemeAIDataset(
        json_path='/home/data/meme_json/dev.json',
        npz_folder='/home/data/faster_cnn_feature/',
        mode='val')
    val_loader = DataLoader(val_dataset,
                            batch_size=opts.inf_minibatch_size,
                            shuffle=False,
                            num_workers=opts.n_workers,
                            collate_fn=collate_fn)
    val_loader = PrefetchLoader(val_loader)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}

    model = Meme.from_pretrained(
        opts.model_config, state_dict=checkpoint,
        img_dim=IMG_DIM)
    model.init_output()  # pretrain ITM head is different from ranking head
    model.to(device)

    # make sure every process has same model parameters in the beginning
    # broadcast_tensors([p.data for p in model.parameters()], 0)
    # set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    model, optimizer = amp.initialize(model, optimizer,
                                      enabled=opts.fp16, opt_level='O2')

    global_step = 0
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    running_loss = RunningMeter('loss')
    model.train()

    n_examples = 0
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    for epoch in range(opts.epoch):
        print('epoch {}/ {}'.format(epoch, opts.epoch))
        pbar = tqdm(total=len(train_loader))

        model.train()
        preds = None
        gt = None

        for step, batch in enumerate(train_loader):
            x = batch[0]
            y = batch[1]
            n_examples += x['input_ids'].size(0)

            pred = model(x)

            # BUG FIX: detach stored probabilities.  Accumulating sigmoid
            # outputs that still carry autograd history retained the full
            # computation graph of every batch for the whole epoch
            # (unbounded memory growth).
            prob = torch.sigmoid(pred).detach()
            if preds is None:
                preds = prob
                gt = y
            else:
                preds = torch.cat((preds, prob), dim=0)
                gt = torch.cat((gt, y), dim=0)

            loss = F.binary_cross_entropy(torch.sigmoid(pred), y)

            # Keep the loss scaled (delay unscale) until the last
            # micro-batch of the accumulation window.
            delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
            with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale
                                ) as scaled_loss:
                scaled_loss.backward()
                if not delay_unscale:
                    # gather gradients from every processes
                    # do this before unscaling to make sure every process uses
                    # the same gradient scale
                    grads = [p.grad.data for p in model.parameters()
                             if p.requires_grad and p.grad is not None]
                    all_reduce_and_rescale_tensors(grads, float(1))

            running_loss(loss.item())

            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1

                # learning rate scheduling
                lr_this_step = get_lr_sched(global_step, opts)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

                # log loss
                # NOTE: not gathered across GPUs for efficiency
                TB_LOGGER.add_scalar('loss', running_loss.val, global_step)
                TB_LOGGER.step()

                # update model params
                if opts.grad_norm != -1:
                    grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
                optimizer.step()
                optimizer.zero_grad()

        # End-of-epoch flush: applies any gradients still pending when the
        # epoch length is not a multiple of the accumulation window.
        global_step += 1

        # learning rate scheduling
        lr_this_step = get_lr_sched(global_step, opts)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr_this_step
        TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

        # log loss
        # NOTE: not gathered across GPUs for efficiency
        TB_LOGGER.add_scalar('loss', running_loss.val, global_step)
        TB_LOGGER.step()

        # update model params
        if opts.grad_norm != -1:
            grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                        opts.grad_norm)
            TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
        optimizer.step()
        optimizer.zero_grad()

        # Epoch-level training metrics on the accumulated predictions.
        with torch.no_grad():
            preds = preds.cpu().numpy().reshape(len(preds), )
            gt = gt.cpu().numpy()
            roc = roc_auc_score(gt, preds)
            acc = accuracy_score(gt, np.around(preds))
        # BUG FIX: keys were previously pre-prefixed with 'train/' and then
        # prefixed again below, producing 'train/train/roc' tags.
        train_log = {'roc': roc, 'acc': acc}
        TB_LOGGER.log_scaler_dict({f"train/{k}": v
                                   for k, v in train_log.items()})

        val_log = validate(model, val_loader)
        TB_LOGGER.log_scaler_dict({f"valid/{k}": v
                                   for k, v in val_log.items()})

        LOGGER.info(train_log)
        LOGGER.info(val_log)

        model_saver.save(model, global_step)

        pbar.close()
Example #27
0
def main(opts):
    """Run multi-task UNITER pretraining.

    Initializes Horovod and the per-rank CUDA device, validates that all
    text DBs share the same tokenizer, builds the multi-task meta loader,
    restores an optional checkpoint into ``UniterForPretraining``, then
    runs the step-based training loop with apex amp mixed precision
    (one loss scaler per task), gradient accumulation, throughput
    logging, and periodic validation + checkpointing.

    :param opts: argparse-style namespace of pretraining options
        (datasets, model config, batch sizes, fp16, num_train_steps, ...).
    """
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), opts.fp16))

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                            opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)

    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        # BUG FIX: was `args.output_dir` -- `args` is undefined in this
        # scope and raised NameError on rank 0; the namespace is `opts`.
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        # Non-zero ranks get no-op stand-ins so the loop stays rank-agnostic.
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    # Every text DB (train and val) must have been built with the same
    # tokenizer; mixing tokenizers would silently corrupt training.
    all_dbs = [db for datasets in [opts.train_datasets, opts.val_datasets]
               for dset in datasets for db in dset['db']]

    tokenizer = json.load(open(f'{all_dbs[0]}/meta.json'))['bert']
    assert all(tokenizer == json.load(open(f'{db}/meta.json'))['bert']
               for db in all_dbs)

    # build data loaders
    train_dataloaders, all_img_dbs = create_dataloaders(
        opts.train_datasets, True, opts)
    val_dataloaders, _ = create_dataloaders(
        opts.val_datasets, False, opts, all_img_dbs)
    meta_loader = MetaLoader(train_dataloaders,
                             accum_steps=opts.gradient_accumulation_steps,
                             distributed=n_gpu > 1)
    meta_loader = PrefetchLoader(meta_loader)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    model = UniterForPretraining.from_pretrained(
        opts.model_config, checkpoint,
        img_dim=IMG_DIM, img_label_dim=IMG_LABEL_DIM)
    model.to(device)
    model.train()
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer; one amp loss scaler per task so loss scales are
    # tracked independently.
    optimizer = build_optimizer(model, opts)
    task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())}
    model, optimizer = amp.initialize(model, optimizer,
                                      num_losses=len(task2scaler),
                                      enabled=opts.fp16, opt_level='O2')

    global_step = 0
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    # to compute training statistics
    task2loss = {task: RunningMeter(f'loss/{task}')
                 for task in train_dataloaders.keys()}
    # ITM w/ OT: extra meters for the cross-entropy and optimal-transport
    # components of the ITM loss.
    if opts.itm_ot_lambda > 0:
        for task in train_dataloaders.keys():
            if task.startswith('itm'):
                task2loss[f'{task}_xe'] = RunningMeter(f'loss/{task}_xe')
                task2loss[f'{task}_ot'] = RunningMeter(f'loss/{task}_ot')
                task2loss[f'{task}_ot_pos'] = RunningMeter(
                    f'loss/{task}_ot_pos')
                task2loss[f'{task}_ot_neg'] = RunningMeter(
                    f'loss/{task}_ot_neg')

    n_examples = defaultdict(int)
    n_in_units = defaultdict(int)
    n_loss_units = defaultdict(int)
    grad_norm = 0

    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    for step, (name, batch) in enumerate(meta_loader):
        # forward pass
        n_examples[name] += batch['input_ids'].size(0)
        n_in_units[name] += (batch['attn_masks'] == 1).sum().item()
        # Loader names may carry a dataset suffix (e.g. 'itm_coco');
        # the model dispatches on the bare task prefix.
        task = name.split('_')[0]
        loss = model(batch, task=task, compute_loss=True)
        if task.startswith('itm'):
            # OT
            itm_loss, ot_loss = loss
            n_loss_units[name] += itm_loss.size(0)
            itm_loss = itm_loss.mean()
            if ot_loss is not None:
                ot_pos, ot_neg = ot_loss
                ot_loss = (ot_pos.sum() - ot_neg.sum()
                           ) / (ot_pos.size(0) + ot_neg.size(0))

                # NOTE: be ware of empty tensor
                ot_pos = ot_pos.mean().item()
                if not math.isnan(ot_pos):
                    task2loss[f'{name}_ot_pos'](ot_pos)
                ot_neg = ot_neg.mean().item()
                if not math.isnan(ot_neg):
                    task2loss[f'{name}_ot_neg'](ot_neg)

                loss = itm_loss + opts.itm_ot_lambda * ot_loss
                task2loss[f'{name}_xe'](itm_loss.item())
                task2loss[f'{name}_ot'](ot_loss.item())
            else:
                loss = itm_loss
        else:
            n_loss_units[name] += loss.size(0)
            loss = loss.mean()  # loss is not normalized in model

        # backward pass; keep the loss scaled (delay unscale) until the
        # last micro-batch of the accumulation window
        delay_unscale = (step+1) % opts.gradient_accumulation_steps != 0
        with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale,
                            loss_id=task2scaler[name]) as scaled_loss:
            scaled_loss.backward()
            if not delay_unscale:
                # gather gradients from every processes
                # do this before unscaling to make sure every process uses
                # the same gradient scale
                grads = [p.grad.data for p in model.parameters()
                         if p.requires_grad and p.grad is not None]
                all_reduce_and_rescale_tensors(grads, float(1))
        task2loss[name](loss.item())

        # optimizer update and logging
        if (step + 1) % opts.gradient_accumulation_steps == 0:
            global_step += 1

            # learning rate scheduling
            lr_this_step = get_lr_sched(global_step, opts)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

            # log loss
            # NOTE: not gathered across GPUs for efficiency
            TB_LOGGER.log_scaler_dict({ll.name: ll.val
                                       for ll in task2loss.values()
                                       if ll.val is not None})
            TB_LOGGER.step()

            # update model params
            if opts.grad_norm != -1:
                grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                            opts.grad_norm)
                TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
            optimizer.step()
            optimizer.zero_grad()
            pbar.update(1)

            if global_step % 100 == 0:
                # monitor training throughput
                LOGGER.info(f'==============Step {global_step}===============')
                for t in train_dataloaders.keys():
                    # sanity check: all ranks must be on the same task
                    assert all(tt == t for tt in all_gather_list(t))
                    tot_ex = sum(all_gather_list(n_examples[t]))
                    ex_per_sec = int(tot_ex / (time()-start))
                    tot_in = sum(all_gather_list(n_in_units[t]))
                    in_per_sec = int(tot_in / (time()-start))
                    tot_l = sum(all_gather_list(n_loss_units[t]))
                    l_per_sec = int(tot_l / (time()-start))
                    LOGGER.info(f'{t}: {tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec,
                                         global_step)
                    TB_LOGGER.add_scalar(f'perf/{t}_in_per_s', in_per_sec,
                                         global_step)
                    TB_LOGGER.add_scalar(f'perf/{t}_loss_per_s', l_per_sec,
                                         global_step)
                LOGGER.info('===============================================')

            if global_step % opts.valid_steps == 0:
                LOGGER.info(f'Step {global_step}: start validation')
                validate(model, val_dataloaders)
                model_saver.save(model, global_step)
        if global_step >= opts.num_train_steps:
            break
    # Final validation/checkpoint unless the loop ended exactly on a
    # validation step (already handled above).
    if global_step % opts.valid_steps != 0:
        LOGGER.info(f'Step {global_step}: start validation')
        validate(model, val_dataloaders)
        model_saver.save(model, global_step)
Example #28
0
File: itm.py Project: zmykevin/UC2
def main(opts):
    """Train an image-text retrieval (ITM) ranking model and run final
    retrieval evaluation on the test splits.

    Runs under Horovod: every process executes this function; only rank 0
    performs filesystem/TensorBoard side effects (other ranks get NoOp
    stand-ins for pbar/model_saver and have logging disabled).

    :param opts: parsed training options (DB paths, batch sizes, lr
        schedule, fp16/amp, checkpointing and validation settings)
    """
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)

    # rank 0 owns all output dirs, logs and checkpoints
    if hvd.rank() == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        os.makedirs(join(opts.output_dir, 'ckpt'), exist_ok=True)

        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
        # store ITM predictions
        os.makedirs(join(opts.output_dir, 'results_val'), exist_ok=True)
        os.makedirs(join(opts.output_dir, 'results_test'), exist_ok=True)
        os.makedirs(join(opts.output_dir, 'results_train'), exist_ok=True)
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    # train_examples = None
    LOGGER.info(f"Loading Train Dataset {opts.train_txt_dbs}, "
                f"{opts.train_img_dbs}")
    # check multiple DBs
    assert len(opts.train_txt_dbs) == len(opts.train_img_dbs), \
        "train txt_db and img_db have different length"

    # load DBs and image dirs
    all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb,
                                 opts.num_bb, opts.compressed_db)
    # train
    LOGGER.info(f"Loading Train Dataset "
                f"{opts.train_txt_dbs}, {opts.train_img_dbs}")
    train_datasets = []
    for txt_path, img_path in zip(opts.train_txt_dbs, opts.train_img_dbs):
        if "itm_coco_zh" not in txt_path:
            img_db = all_img_dbs[img_path]
            txt_db = TxtTokLmdb(txt_path, opts.max_txt_len)
            if opts.hard_neg_size > 0:
                train_datasets.append(
                    ItmRankDatasetHardNeg(txt_db, img_db, opts.negative_size,
                                          opts.hard_neg_size))
            else:
                train_datasets.append(
                    ItmRankDataset(txt_db, img_db, opts.negative_size))
        else:
            # COCO-CN style entry: img_path is a pair of (train, val) img DBs
            img_train_db = all_img_dbs[img_path[0]]
            img_val_db = all_img_dbs[img_path[1]]
            txt_db = TxtTokLmdb(txt_path, opts.max_txt_len)
            if opts.hard_neg_size > 0:
                # NOTE(review): `img_db` is not defined in this branch (only
                # img_train_db/img_val_db are) — with hard_neg_size > 0 and an
                # itm_coco_zh DB first, this raises NameError; otherwise it
                # silently reuses a stale img_db from a previous iteration.
                # Confirm hard negatives are never used with itm_coco_zh.
                train_datasets.append(
                    ItmRankDatasetHardNeg(txt_db, img_db, opts.negative_size,
                                          opts.hard_neg_size))
            else:
                train_datasets.append(
                    ItmRankDataset_COCO_CN(txt_db, img_train_db, img_val_db,
                                           opts.negative_size))
    train_dataset = ConcatDataset(train_datasets)

    # hard negative
    # hn_datasets = []
    # for txt_path, img_path in zip(opts.train_txt_dbs, opts.train_img_dbs):
    #     img_db = all_img_dbs[img_path]
    #     txt_db = TxtTokLmdb(txt_path, opts.max_txt_len)
    #     hn_datasets.append(ItmHardNegDataset(txt_db, img_db,
    #                                          opts.inf_minibatch_size))
    # hn_dataset = ConcatDataset(hn_datasets)
    # hn_dataloader = build_dataloader(hn_dataset, itm_hn_collate, False, opts)
    # hard_neg_dir = f'{opts.output_dir}/results_train/'

    # val
    LOGGER.info(f"Loading Val Dataset {opts.val_txt_db}, {opts.val_img_db}")
    val_img_db = all_img_dbs[opts.val_img_db[0]]
    val_txt_db = TxtTokLmdb(opts.val_txt_db[0], -1)
    val_dataset = ItmValDataset(val_txt_db, val_img_db,
                                opts.inf_minibatch_size)
    val_dataloader = build_dataloader(val_dataset, itm_val_collate, False,
                                      opts)
    # eval
    LOGGER.info(f"Loading val, test Dataset for full evaluation: "
                f"{opts.val_txt_db}, {opts.val_img_db}"
                f"{opts.test_txt_db}, {opts.test_img_db}")
    eval_dataset_val = ItmEvalDataset(val_txt_db, val_img_db,
                                      opts.inf_minibatch_size)
    eval_loader_val = build_dataloader(eval_dataset_val, itm_eval_collate,
                                       False, opts)

    # one eval loader per test DB (COCO-CN test DBs carry two img DBs)
    eval_loader_list = []
    assert len(opts.test_img_db) == len(opts.test_txt_db)
    for test_img_db_path, test_txt_db_path in zip(opts.test_img_db,
                                                  opts.test_txt_db):
        if "itm_coco_zh" not in test_txt_db_path:
            test_img_db = all_img_dbs[test_img_db_path]
            test_txt_db = TxtTokLmdb(test_txt_db_path, -1)
            eval_dataset_test = ItmEvalDataset(test_txt_db, test_img_db,
                                               opts.inf_minibatch_size)
        else:
            test_img_train_db = all_img_dbs[test_img_db_path[0]]
            test_img_val_db = all_img_dbs[test_img_db_path[1]]
            test_txt_db = TxtTokLmdb(test_txt_db_path, -1)
            eval_dataset_test = ItmEvalDataset_COCO_CN(test_txt_db,
                                                       test_img_train_db,
                                                       test_img_val_db,
                                                       opts.inf_minibatch_size)
        eval_loader_test = build_dataloader(eval_dataset_test,
                                            itm_eval_collate, False, opts)
        eval_loader_list.append(eval_loader_test)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}

    #Rename the key if specified
    if opts.rename_checkpoints:
        rename_checkpoint(checkpoint)

    model = VLXLMRForImageTextRetrieval.from_pretrained(
        opts.model_config,
        state_dict=checkpoint,
        load_embedding_only=opts.load_embedding_only,
        load_layer=opts.load_layer,
        img_dim=IMG_DIM,
        margin=opts.margin)
    model.init_output()  # pretrain ITM head is different from ranking head
    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer (separate lr groups for the XLM-R encoder if asked)
    if opts.separate_lr:
        optimizer = build_xlmr_optimizer(model, opts)
    else:
        optimizer = build_optimizer(model, opts)
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      enabled=opts.fp16,
                                      opt_level='O2')

    #global_step = 0
    LOGGER.info(f"***** Running training on {n_gpu} GPUs *****")
    LOGGER.info("  Num examples = %d", len(train_dataset) * hvd.size())
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    running_loss = RunningMeter('loss')
    model.train()

    if opts.steps_per_hard_neg != -1:
        # NOTE(review): hn_dataloader and hard_neg_dir are only created in
        # the commented-out "hard negative" block above, so this raises
        # NameError whenever steps_per_hard_neg != -1 — confirm before
        # enabling hard-negative mining.
        compute_hard_neg(model, hn_dataloader, train_dataset,
                         opts.hard_neg_pool_size, hard_neg_dir)

    #Initialize the TrainingRestorer
    restorer = TrainingRestorer(opts, model, optimizer)
    global_step = restorer.global_step
    TB_LOGGER._global_step = global_step
    if hvd.rank() != 0:
        restorer = NoOp()  #Added for Restoring the Checkpoints

    if global_step > 0:
        pbar.update(global_step)

    n_examples = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    while True:
        # rebuild the loader each epoch (shuffling / worker reset)
        train_dataloader = build_dataloader(train_dataset,
                                            xlmr_itm_rank_collate, True, opts)
        for step, batch in enumerate(train_dataloader):
            #print(batch['input_ids'])
            n_examples += batch['input_ids'].size(0)
            loss = model(batch, compute_loss=True)
            loss = loss.mean()
            # only sync/unscale grads on the last accumulation micro-step
            delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
            with amp.scale_loss(loss, optimizer,
                                delay_unscale=delay_unscale) as scaled_loss:
                scaled_loss.backward()
                if not delay_unscale:
                    # gather gradients from every processes
                    # do this before unscaling to make sure every process uses
                    # the same gradient scale
                    grads = [
                        p.grad.data for p in model.parameters()
                        if p.requires_grad and p.grad is not None
                    ]
                    all_reduce_and_rescale_tensors(grads, float(1))

            running_loss(loss.item())
            # print("run the loss")
            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1

                # learning rate scheduling
                lr_this_step = get_lr_sched(global_step, opts)
                if opts.separate_lr:
                    #added by Mingyang
                    # first two param groups are the XLM-R encoder
                    xlmr_lr_this_step = get_xlmr_lr_sched(global_step, opts)
                    for i, param_group in enumerate(optimizer.param_groups):
                        if i < 2:
                            param_group['lr'] = xlmr_lr_this_step
                        else:
                            param_group['lr'] = lr_this_step
                    TB_LOGGER.add_scalar('xlmr_lr', xlmr_lr_this_step,
                                         global_step)
                else:
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step

                TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

                # log loss averaged across all ranks
                losses = all_gather_list(running_loss)
                running_loss = RunningMeter(
                    'loss',
                    sum(l.val for l in losses) / len(losses))
                TB_LOGGER.add_scalar('loss', running_loss.val, global_step)
                TB_LOGGER.step()

                # update model params
                if opts.grad_norm != -1:
                    grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
                optimizer.step()
                optimizer.zero_grad()
                pbar.update(1)

                if global_step % 100 == 0:
                    # monitor training throughput
                    LOGGER.info(f'============Step {global_step}=============')
                    tot_ex = sum(all_gather_list(n_examples))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f'{tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar('perf/ex_per_s', ex_per_sec,
                                         global_step)
                    LOGGER.info(f'===========================================')

                if global_step % opts.valid_steps == 0 and global_step > 0:
                    # if global_step > 7000:
                    # full_val runs the (slow) full retrieval evaluation
                    if opts.full_val:
                        val_log = evaluate(model, eval_loader_val)
                        TB_LOGGER.log_scaler_dict(
                            {f"valid/{k}": v
                             for k, v in val_log.items()})
                        #Log the information
                        # LOGGER.info(
                        #         f"========================= {split} ===========================\n"
                        #         f"image retrieval R1: {eval_log['img_r1']*100:.2f},\n"
                        #         f"image retrieval R5: {eval_log['img_r5']*100:.2f},\n"
                        #         f"image retrieval R10: {eval_log['img_r10']*100:.2f}\n"
                        #         f"text retrieval R1: {eval_log['txt_r1']*100:.2f},\n"
                        #         f"text retrieval R5: {eval_log['txt_r5']*100:.2f},\n"
                        #         f"text retrieval R10: {eval_log['txt_r10']*100:.2f}")
                        # LOGGER.info("=========================================================")
                    else:
                        val_log = validate(model, val_dataloader)
                        TB_LOGGER.log_scaler_dict(val_log)

                    model_saver.save(model, global_step)
                restorer.step()
                if (opts.steps_per_hard_neg != -1
                        and global_step % opts.steps_per_hard_neg == 0):
                    # sample hard negatives for training
                    # NOTE(review): same undefined hn_dataloader/hard_neg_dir
                    # issue as above.
                    compute_hard_neg(model, hn_dataloader, train_dataset,
                                     opts.hard_neg_pool_size, hard_neg_dir)
                    # break to reconstruct loader
                    # for potential multi-worker issue (not sure)
                    break

            if global_step >= opts.num_train_steps:
                break

        if global_step >= opts.num_train_steps:
            break
        # NOTE can no longer count epochs

    pbar.close()
    # final validation
    # val_log = validate(model, val_dataloader)
    # TB_LOGGER.log_scaler_dict(val_log)
    model_saver.save(model, f'{global_step}_final')

    # final retrieval evaluation on every test split (all ranks participate
    # in evaluate(); only rank 0 prints the recall summary)
    for i, loader in enumerate(eval_loader_list):
        split = "test_{}".format(i)
        eval_log = evaluate(model, loader)
        TB_LOGGER.log_scaler_dict(
            {f"eval/{split}_{k}": v
             for k, v in eval_log.items()})
        if hvd.rank() != 0:
            continue
        LOGGER.info(
            f"========================= {split} ===========================\n"
            f"image retrieval R1: {eval_log['img_r1']*100:.2f},\n"
            f"image retrieval R5: {eval_log['img_r5']*100:.2f},\n"
            f"image retrieval R10: {eval_log['img_r10']*100:.2f}\n"
            f"text retrieval R1: {eval_log['txt_r1']*100:.2f},\n"
            f"text retrieval R5: {eval_log['txt_r5']*100:.2f},\n"
            f"text retrieval R10: {eval_log['txt_r10']*100:.2f}")
    LOGGER.info("=========================================================")
def main(opts):
    """Evaluate a HERO model on the VIOLIN task for one data split.

    Restores model options from the training run's hps.json, rebuilds the
    video+subtitle dataset, loads the checkpoint, runs validation, gathers
    per-example results (and optionally logits) across all Horovod ranks,
    and writes them to disk on rank 0.

    :param opts: evaluation options (output_dir, checkpoint, split,
        query/sub/vfeat DB paths, batch size, fp16, ...)
    """
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))
    if hvd.rank() != 0:
        LOGGER.disabled = True
    # reuse the hyper-parameters saved by the training run
    hps_file = f'{opts.output_dir}/log/hps.json'
    model_opts = Struct(json.load(open(hps_file)))
    model_config = f'{opts.output_dir}/log/model_config.json'

    # load DBs and image dirs
    video_ids = get_video_ids(opts.query_txt_db)
    video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                      model_opts.vfeat_interval, model_opts)
    assert opts.split in opts.query_txt_db
    q_txt_db = QaQueryTokLmdb(opts.query_txt_db, -1)
    eval_dataset = ViolinEvalDataset(video_ids,
                                     video_db,
                                     q_txt_db,
                                     sampled_by_q=model_opts.sampled_by_q)
    collate_fn = violin_eval_collate

    # Prepare model
    # opts.checkpoint is either a direct file path or a step number under
    # the training run's ckpt dir
    if exists(opts.checkpoint):
        ckpt_file = opts.checkpoint
    else:
        ckpt_file = f'{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt'
    checkpoint = torch.load(ckpt_file)
    # infer the max frame sequence length from the checkpoint's learned
    # image position embeddings
    img_pos_embed_weight_key = "v_encoder.f_encoder.img_embeddings" +\
        ".position_embeddings.weight"
    assert img_pos_embed_weight_key in checkpoint
    max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])

    model = HeroForViolin.from_pretrained(model_config,
                                          state_dict=checkpoint,
                                          vfeat_dim=VFEAT_DIM,
                                          max_frm_seq_len=max_frm_seq_len)
    model.to(device)
    if opts.fp16:
        model = amp.initialize(model, enabled=opts.fp16, opt_level='O2')

    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=opts.batch_size,
                                 num_workers=opts.n_workers,
                                 pin_memory=opts.pin_mem,
                                 collate_fn=collate_fn)
    eval_dataloader = PrefetchLoader(eval_dataloader)

    _, results, logits = validate_violin(model, eval_dataloader, opts.split,
                                         opts.save_logits)
    result_dir = f'{opts.output_dir}/results_{opts.split}'
    if opts.save_logits:
        result_dir += '_w_logit'
    if not exists(result_dir) and hvd.rank() == 0:
        os.makedirs(result_dir)

    # merge per-rank result dicts; all ranks must join the gather even
    # though only rank 0 writes the output
    all_results = {}
    for id2res in all_gather_list(results):
        all_results.update(id2res)
    if opts.save_logits:
        all_logits = {}
        for id2logit in all_gather_list(logits):
            all_logits.update(id2logit)
    if hvd.rank() == 0:
        save_json(all_results,
                  f'{result_dir}/results_{opts.checkpoint}_all.json')
        LOGGER.info('All results written......')
        if opts.save_logits:
            save_pickle(all_logits,
                        f'{result_dir}/logits_{opts.checkpoint}_all.pkl')
            LOGGER.info('All logits written......')
Example #30
0
    def __register_impl(self, base, class_obj):
        """Register ``class_obj`` as the custom implementation for ``base``.

        :param base: Base class whose ``__name__`` keys the registry
        :param class_obj: Implementation class to store under that key
        """
        key = base.__name__
        self._registered_implementations[key] = class_obj
        LOGGER.debug("Custom implementation [%s] registered.", class_obj)
Example #31
0
File: itm.py Project: zmykevin/UC2
def get_hard_negs(model, loader, hard_negative_num=20):
    """Mine hard negatives for ITM ranking training.

    For every text query, keep the ``hard_negative_num`` highest-scoring
    negative images; then, gathering scores across all Horovod ranks, keep
    the hardest negative texts for every image.

    :param model: ranking model; called with compute_loss=False to score
        text-image pairs
    :param loader: dataloader whose batches carry 'gt_txt_id' and
        'neg_img_ids'; its dataset is a ConcatDataset of sub-datasets
        exposing ``all_img_ids``
    :param int hard_negative_num: number of hard negatives kept per item
    :return: (txt2hardimgs, img2hardtxts); img2hardtxts is only fully
        populated on rank 0
    """
    LOGGER.info("start running hard negative extraction")
    st = time()
    if hvd.rank() == 0:
        pbar = tqdm(total=len(loader))
    else:
        pbar = NoOp()
    model.eval()

    txt2hardimgs = {}
    img_to_score_txts = defaultdict(list)
    for batch in loader:
        scores = model(batch, compute_loss=False).squeeze(-1)
        txt = batch['gt_txt_id']
        imgs = batch['neg_img_ids']
        # record hard images
        hard_indices = scores.topk(hard_negative_num, sorted=False)[1].tolist()
        txt2hardimgs[txt] = [imgs[i] for i in hard_indices]
        # record img2txts
        for i, img in enumerate(imgs):
            img_to_score_txts[img].append((scores[i].item(), txt))
        pbar.update(1)
    pbar.close()

    LOGGER.info("start computing hard texts from images...")
    n_less_neg = 0
    tot_text = 0
    img2hardtxts = {}
    # need to gather hard texts from all GPUs
    all_img_ids = [
        i for dset in loader.dataset.datasets for i in dset.all_img_ids
    ]
    # broadcast rank 0's image-id order so every rank iterates identically
    all_img_ids = any_broadcast(all_img_ids, 0)
    for img in all_img_ids:
        score_txts = img_to_score_txts[img]
        # all_gather_list is a collective call: every rank must reach it on
        # every iteration, which is why the rank!=0 'continue' comes after it
        scores, txts = map(
            list,
            unzip(pair for pairs in all_gather_list(score_txts)
                  for pair in pairs))
        if hvd.rank() != 0:
            # only rank 0 needs to compute
            continue
        tot_text += len(txts)
        if len(txts) < hard_negative_num:
            # not enough negatives
            hard_indices = range(len(txts))
            n_less_neg += 1
        else:
            hard_indices = torch.tensor(scores).topk(hard_negative_num,
                                                     sorted=False)[1].tolist()
        img2hardtxts[img] = [txts[i] for i in hard_indices]

    n_less_neg = sum(all_gather_list(n_less_neg))
    if n_less_neg:
        LOGGER.info(f"Warning: {n_less_neg} images did not "
                    f"sample enough negatives")
    LOGGER.info(f"hard negative extraction finished "
                f"in {int(time() - st)} seconds "
                f"({tot_text//len(img_to_score_txts)} texts per images)")

    # restore training mode before returning to the train loop
    model.train()
    return txt2hardimgs, img2hardtxts
Example #32
0
    def end_training(self):
        """Finish a training run: report why training stopped, reload the
        best checkpoint, evaluate/export predictions on the test set(s),
        export metrics, and clean up.

        Side effects: logs summary stats, writes prediction and metric
        files, closes the summary writer, and optionally deletes the model
        checkpoint file.

        :raises ValueError: if checkpointing was enabled but no saved model
            state_dict exists at ``self.model_file``
        """
        # Termination message
        print("\n" + "-" * 100)
        if self.terminate_training:
            LOGGER.info(
                "Training terminated early because the Validation {} did not improve for  {}  epochs"
                .format(self.config['optimize_for'], self.config['patience']))
        else:
            LOGGER.info(
                "Maximum epochs of {} reached. Finished training !!".format(
                    self.config['max_epoch']))

        print_test_stats(self.best_val_metrics, test=False)

        print("-" * 50 + "\n\t\tEvaluating on test set\n" + "-" * 50)
        if not self.config["no_model_checkpoints"]:
            if os.path.isfile(self.model_file):
                self.load_model()
                self.model.to(self.device)
            else:
                # Fixed: the original message had no '{}' placeholder, so
                # .format(model_name) silently dropped the model name.
                raise ValueError(
                    "No Saved model state_dict found for the chosen model [{}]...!!! \nAborting evaluation on test set..."
                    .format(self.config['model_name']))

            self.export_val_predictions(
            )  # Runs evaluation, no need to run it again here
            val_probs = torch.tensor(self.probs_list)
            val_labels = torch.tensor(self.labels_list)
            threshold = 0.5  # the default threshold for binary classification
            # Uncomment below line if you have implemented this optional feature
            # threshold = find_optimal_threshold(val_probs, val_labels, metric="accuracy")
            best_val_metrics = standard_metrics(val_probs,
                                                val_labels,
                                                threshold=threshold,
                                                add_aucroc=False)
            LOGGER.info(
                "Optimal threshold on validation dataset: %.4f (accuracy=%4.2f%%)"
                % (threshold, 100.0 * best_val_metrics["accuracy"]))

            # Testing is in the standard form not possible, as we do not have any labels (gives an error in standard_metrics)
            # Instead, we should write out the predictions in the form of the leaderboard
            self.test_metrics = dict()
            for test_idx in range(len(self.config['test_loader'])):
                test_name = self.config['test_loader'][test_idx].dataset.name
                LOGGER.info("Export and testing on %s..." % test_name)
                # Unlabeled leaderboard split (all labels == -1): only export
                # predictions; metrics cannot be computed without labels.
                if hasattr(self.config['test_loader'][test_idx].dataset, "data") and \
                   hasattr(self.config['test_loader'][test_idx].dataset.data, "labels") and \
                   self.config['test_loader'][test_idx].dataset.data.labels[0] == -1:
                    self.export_test_predictions(test_idx=test_idx,
                                                 threshold=threshold)
                    self.test_metrics[test_name] = dict()
                else:
                    # Labeled split: evaluate, record metrics, and export
                    # the per-example predictions.
                    test_idx_metrics, _ = self.eval_model(test=True,
                                                          test_idx=test_idx)
                    self.test_metrics[test_name] = test_idx_metrics
                    print_test_stats(test_idx_metrics, test=True)
                    self.export_val_predictions(test=True,
                                                test_idx=test_idx,
                                                threshold=threshold)
        else:
            LOGGER.info(
                "No model checkpoints were saved. Hence, testing will be skipped."
            )
            self.test_metrics = dict()

        self.export_metrics()

        self.config['writer'].close()

        if self.config['remove_checkpoints']:
            LOGGER.info("Removing checkpoint %s..." % self.model_file)
            os.remove(self.model_file)
Example #33
0
def main(opts):
    """Train a HERO captioning model on TVC and periodically validate with
    caption generation.

    Runs under Horovod: all ranks train; rank 0 owns logging, TensorBoard,
    checkpointing and result files. Validation decodes captions with
    TvcGenerator and scores them with TVCEval on rank 0.

    :param opts: parsed training options (DB paths, batch sizes, lr
        schedule, fp16/amp, checkpointing and validation settings)
    """
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))

    if hvd.rank() != 0:
        LOGGER.disabled = True

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)
    opts.task = 'tvc'

    # train_examples = None
    LOGGER.info(f"Loading the whole video dataset {opts.sub_txt_db}, "
                f"{opts.vfeat_db}")
    video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                      opts.vfeat_interval, opts)

    # data loaders
    # train
    LOGGER.info(f"Loading train dataset {opts.train_db}")
    train_cap = CaptionTokLmdb(opts.train_db, opts.max_txt_len)
    train_dset = TvcTrainDataset(video_db, train_cap, opts.max_cap_per_vid)
    LOGGER.info(f"{sum(all_gather_list(len(train_dset)))} samples loaded")
    train_loader = build_dataloader(train_dset, opts.train_batch_size,
                                    TvcTrainDataset.collate, True, opts)

    # val
    LOGGER.info(f"Loading val dataset {opts.val_db}")
    val_cap = CaptionTokLmdb(opts.val_db, -1)
    val_dset = TvcValDataset(video_db, val_cap, -1)
    val_loader = build_dataloader(val_dset, opts.val_batch_size,
                                  TvcValDataset.collate, False, opts)
    # only rank 0 scores generated captions against the references
    if hvd.rank() == 0:
        evaluator = TVCEval(opts.val_ref)
    else:
        evaluator = NoOp()

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}

    # infer max frame sequence length from the checkpoint's image position
    # embeddings, falling back to the project default
    img_pos_embed_weight_key = "v_encoder.f_encoder.img_embeddings" +\
        ".position_embeddings.weight"
    if img_pos_embed_weight_key in checkpoint:
        max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])
    else:
        max_frm_seq_len = MAX_FRM_SEQ_LEN

    model = HeroForTvc.from_pretrained(opts.model_config,
                                       state_dict=checkpoint,
                                       vfeat_dim=VFEAT_DIM,
                                       max_frm_seq_len=max_frm_seq_len,
                                       lsr=opts.lsr)

    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      enabled=opts.fp16,
                                      opt_level='O2')

    # assumes roberta tokenizer only
    if hvd.local_rank() == 0:
        # quick hack to prevent multi-process download collision
        toker = RobertaTokenizer.from_pretrained('roberta-base')
        all_gather_list(None)
    else:
        all_gather_list(None)
        toker = RobertaTokenizer.from_pretrained('roberta-base')
    bos = toker.convert_tokens_to_ids(['<s>'])[0]
    eos = toker.convert_tokens_to_ids(['</s>'])[0]
    generator = TvcGenerator(model, opts.max_gen_step, bos, eos, opts.fp16)

    global_step = 0
    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        # NOTE(review): no exist_ok here (unlike the sibling scripts), so
        # re-running with an existing output_dir raises FileExistsError —
        # confirm whether that is intentional.
        os.makedirs(join(opts.output_dir, 'results'))  # store val predictions
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    train_loss = RunningMeter('loss')
    n_vid = 0
    n_cap = 0
    n_epoch = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    model.train()
    while True:
        for step, batch in enumerate(train_loader):
            n_vid += opts.train_batch_size
            n_cap += batch['cap_input_ids'].size(0)

            loss = model(batch, compute_loss=True)
            loss = loss.mean()
            train_loss(loss.item())

            # only sync/unscale grads on the last accumulation micro-step
            delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
            with amp.scale_loss(loss, optimizer,
                                delay_unscale=delay_unscale) as scaled_loss:
                scaled_loss.backward()
                if not delay_unscale:
                    # gather gradients from every processes
                    # do this before unscaling to make sure every process uses
                    # the same gradient scale
                    grads = [
                        p.grad.data for p in model.parameters()
                        if p.requires_grad and p.grad is not None
                    ]
                    all_reduce_and_rescale_tensors(grads, float(1))

            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1

                # learning rate scheduling
                # param groups 0/1 get lr * lr_mul, groups 2/3 get the base
                # lr; any other group count is a configuration error
                lr_this_step = get_lr_sched(global_step, opts)
                for i, param_group in enumerate(optimizer.param_groups):
                    if i == 0 or i == 1:
                        param_group['lr'] = lr_this_step * opts.lr_mul
                    elif i == 2 or i == 3:
                        param_group['lr'] = lr_this_step
                    else:
                        raise ValueError()
                TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

                # log loss
                TB_LOGGER.add_scalar(train_loss.name, train_loss.val,
                                     global_step)
                TB_LOGGER.step()

                # update model params
                if opts.grad_norm != -1:
                    grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
                optimizer.step()
                optimizer.zero_grad()
                pbar.update(1)

                if global_step % 100 == 0:
                    # monitor training throughput
                    LOGGER.info('-------------------------------------------')
                    LOGGER.info(f'Step {global_step}:')
                    tot_vid = sum(all_gather_list(n_vid))
                    vid_per_sec = int(tot_vid / (time() - start))
                    LOGGER.info(f'{tot_vid} videos trained at '
                                f'{vid_per_sec} vid/s')
                    tot_cap = sum(all_gather_list(n_cap))
                    cap_per_sec = int(tot_cap / (time() - start))
                    TB_LOGGER.add_scalar(f'perf/vid_per_s', vid_per_sec,
                                         global_step)
                    TB_LOGGER.add_scalar(f'perf/cap_per_s', cap_per_sec,
                                         global_step)

                if global_step % opts.valid_steps == 0:
                    LOGGER.info('===========================================')
                    LOGGER.info(f"Step {global_step}: start validation")
                    val_log, results = validate(val_loader, generator, toker,
                                                evaluator)
                    if hvd.rank() == 0:
                        # NOTE(review): path joins "results/" + "/results_…"
                        # producing a doubled slash; harmless on POSIX.
                        save_jsonl(
                            results, f"{opts.output_dir}/results/"
                            f"/results_{global_step}.jsonl")
                    TB_LOGGER.log_scaler_dict(val_log)
                    LOGGER.info('===========================================')
                    model_saver.save(model, global_step)
            if global_step >= opts.num_train_steps:
                break
        n_epoch += 1
        LOGGER.info(f"finished {n_epoch} epochs")
        if global_step >= opts.num_train_steps:
            break

    LOGGER.info('===========================================')
    # run one last validation unless the final step already validated
    if global_step % opts.valid_steps != 0:
        val_log, results = validate(val_loader, generator, toker, evaluator)
        if hvd.rank() == 0:
            save_jsonl(
                results, f"{opts.output_dir}/results/"
                f"/results_{global_step}.jsonl")
        TB_LOGGER.log_scaler_dict(val_log)
        model_saver.save(model, global_step)
Example #34
0
def main(opts):
    """Evaluate image-text retrieval with a UNITER checkpoint.

    Loads the eval feature/text LMDBs, restores the model (falling back to
    the zero-shot head when the checkpoint has no 'rank_output'), runs
    evaluation, and on rank 0 dumps config, raw results and retrieval
    scores under ``opts.output_dir``.

    :param opts: parsed command-line options (argparse-style namespace)
    """
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, rank,
                                              opts.fp16))

    if opts.train_config is not None:
        # Inherit bounding-box filtering settings from the training config.
        # Context manager fixes the original's leaked file handle from
        # json.load(open(...)).
        with open(opts.train_config) as f:
            train_opts = Struct(json.load(f))
        opts.conf_th = train_opts.conf_th
        opts.max_bb = train_opts.max_bb
        opts.min_bb = train_opts.min_bb
        opts.num_bb = train_opts.num_bb

    # load DBs and image dirs
    eval_img_db = DetectFeatLmdb(opts.img_db, opts.conf_th, opts.max_bb,
                                 opts.min_bb, opts.num_bb, opts.compressed_db)
    eval_txt_db = TxtTokLmdb(opts.txt_db, -1)
    eval_dataset = ItmEvalDataset(eval_txt_db, eval_img_db, opts.batch_size)

    # Prepare model
    checkpoint = torch.load(opts.checkpoint)
    model = UniterForImageTextRetrieval.from_pretrained(opts.model_config,
                                                        checkpoint,
                                                        img_dim=IMG_DIM)
    if 'rank_output' not in checkpoint:
        model.init_output()  # zero shot setting

    model.to(device)
    model = amp.initialize(model, enabled=opts.fp16, opt_level='O2')

    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=1,
                                 num_workers=opts.n_workers,
                                 pin_memory=opts.pin_mem,
                                 collate_fn=itm_eval_collate)
    eval_dataloader = PrefetchLoader(eval_dataloader)

    eval_log, results = evaluate(model, eval_dataloader)
    if rank == 0:
        # exist_ok avoids the racy exists()-then-makedirs pattern; the
        # original's inner `rank == 0` re-check was redundant under this
        # guard.
        os.makedirs(opts.output_dir, exist_ok=True)
        with open(f'{opts.output_dir}/config.json', 'w') as f:
            json.dump(vars(opts), f)
        with open(f'{opts.output_dir}/results.bin', 'wb') as f:
            pickle.dump(results, f)
        with open(f'{opts.output_dir}/scores.json', 'w') as f:
            json.dump(eval_log, f)
        LOGGER.info('evaluation finished')
        LOGGER.info(
            f"======================== Results =========================\n"
            f"image retrieval R1: {eval_log['img_r1']*100:.2f},\n"
            f"image retrieval R5: {eval_log['img_r5']*100:.2f},\n"
            f"image retrieval R10: {eval_log['img_r10']*100:.2f}\n"
            f"text retrieval R1: {eval_log['txt_r1']*100:.2f},\n"
            f"text retrieval R5: {eval_log['txt_r5']*100:.2f},\n"
            f"text retrieval R10: {eval_log['txt_r10']*100:.2f}")
        LOGGER.info("========================================================")
Example #35
0
def process_dataset(dataset_name, args):
    """Resolve embeddings for a dataset's graph node labels and build
    co-reference label lookups for every (topn, threshold) combination.

    :param str dataset_name: Name of the dataset to process
    :param args: Parsed CLI arguments (embedding paths, topn/threshold grids)
    """
    LOGGER.info('{:15} - Start'.format(dataset_name))
    LOGGER.info('{:15} - Retrieving trained embedding'.format(dataset_name))

    pre_trained_embedding = embeddings.get_embedding_model(
        args.pre_trained_embedding,
        binary=False,
        first_line_header=True,
        with_gensim=True)

    try:
        trained_embedding = dataset_helper.get_w2v_embedding_for_dataset(
            dataset_name)
    except FileNotFoundError as err:
        LOGGER.exception(err)
        return

    cmap_cache_files = dataset_helper.get_all_cached_graph_datasets(
        dataset_name=dataset_name, graph_type=constants.TYPE_CONCEPT_MAP)

    # Only co-occurrence caches covering 'all' documents are of interest.
    coo_cache_files = [
        candidate
        for candidate in dataset_helper.get_all_cached_graph_datasets(
            dataset_name=dataset_name, graph_type=constants.TYPE_COOCCURRENCE)
        if 'all' in candidate
    ]

    # Both graph flavors must be cached, otherwise there is nothing to do.
    if not cmap_cache_files or not coo_cache_files:
        return

    used_graphs = [cmap_cache_files[0], coo_cache_files[0]]

    LOGGER.info('{:15} - Retrieving dataset'.format(dataset_name))
    # Union of every node label that occurs in either graph dataset.
    all_labels = set()
    for cache_path in used_graphs:
        graphs, _ = dataset_helper.get_dataset_cached(cache_path)
        graphs = graph_helper.get_graphs_only(graphs)
        all_labels |= graph_helper.get_all_node_labels_uniq(
            graphs, as_sorted_list=False)

    LOGGER.info('{:15} - Resolving embeddings'.format(dataset_name))
    embeddings_pre_trained, not_found_pre_trained_coreferenced, not_found_trained, not_found_pre_trained, lookup, similar_els = embeddings.get_embeddings_for_labels_with_lookup(
        all_labels, trained_embedding, pre_trained_embedding)

    LOGGER.info('{:15} - Missing'.format(dataset_name))

    # Report how many labels each embedding source failed to cover.
    missing_report = [('trained', not_found_trained),
                      ('pre_trained', not_found_pre_trained),
                      ('after_coreference', not_found_pre_trained_coreferenced)
                      ]
    for label, s in missing_report:
        LOGGER.info('\t{:20} {:>6}'.format(label, len(s)))

    # Persist the resolved embeddings, then reload in word2vec format.
    embedding_file = '{}/{}.w2v.txt'.format(args.embeddings_result_folder,
                                            dataset_name)
    embeddings.save_embedding_dict(embeddings_pre_trained, embedding_file)
    embeddings_pre_trained = embeddings.load_word2vec_format(
        fname=embedding_file, binary=False)

    LOGGER.info('{:15} - Co-reference resolution'.format(dataset_name))
    # One similarity query at the largest topn serves every grid point below.
    max_topn = max(args.topn)

    similar_labels = coreference.get_most_similar_labels(
        all_labels, embeddings_pre_trained, max_topn)

    for topn in args.topn:
        for threshold in args.merge_threshold:
            LOGGER.info(
                '{:15} - Co-reference resolution: topn: {}, threshold: {}'.
                format(dataset_name, topn, threshold))
            clique_lookup = coreference.create_label_cliques_by_similarity(
                similar_labels, threshold=threshold, topn=topn)

            new_lookup = embeddings.merge_lookups(clique_lookup, lookup)

            lookup_file = '{}/{}.threshold-{}.topn-{}.label-lookup.npy'.format(
                args.embeddings_result_folder, dataset_name, threshold, topn)
            with open(lookup_file, 'wb') as f:
                pickle.dump(new_lookup, f)
    LOGGER.info('{:15} - Finished'.format(dataset_name))
Example #36
0
def initialize():
    """Bootstrap the sensor module by obtaining and caching a system token."""
    LOGGER.info('Initializing sensor module')
    # Cache the token in the shared session object so later requests can
    # authenticate without re-acquiring it.
    SESSION_INFO.TOKEN = obtain_system_token()
    LOGGER.info('Sensor module initialized successfully')
Example #37
0
def main(opts):
    """Train a UNITER model on NLVR2 with Horovod data-parallel training.

    Builds train/val/test dataloaders for the 'paired', 'paired-attn' or
    'triplet' model variant, runs mixed-precision (apex amp) training with
    gradient accumulation, periodically validates both eval splits, and
    checkpoints via ModelSaver. Only rank 0 writes logs/checkpoints.

    :param opts: parsed command-line options (argparse-style namespace)
    """
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)

    # train_examples = None
    LOGGER.info(f"Loading Train Dataset {opts.train_txt_db}, "
                f"{opts.train_img_dir}")
    # Select dataset/model/collate classes for the chosen model variant.
    if 'paired' in opts.model:
        DatasetCls = Nlvr2PairedDataset
        EvalDatasetCls = Nlvr2PairedEvalDataset
        collate_fn = nlvr2_paired_collate
        eval_collate_fn = nlvr2_paired_eval_collate
        if opts.model == 'paired':
            ModelCls = UniterForNlvr2Paired
        elif opts.model == 'paired-attn':
            ModelCls = UniterForNlvr2PairedAttn
        else:
            raise ValueError('unrecognized model type')
    elif opts.model == 'triplet':
        DatasetCls = Nlvr2TripletDataset
        EvalDatasetCls = Nlvr2TripletEvalDataset
        ModelCls = UniterForNlvr2Triplet
        collate_fn = nlvr2_triplet_collate
        eval_collate_fn = nlvr2_triplet_eval_collate
    else:
        raise ValueError('unrecognized model type')

    # data loaders
    train_dataloader = create_dataloader(opts.train_img_db, opts.train_txt_db,
                                         opts.train_batch_size, True,
                                         DatasetCls, collate_fn, opts)
    val_dataloader = create_dataloader(opts.val_img_db, opts.val_txt_db,
                                       opts.val_batch_size, False,
                                       EvalDatasetCls, eval_collate_fn, opts)
    test_dataloader = create_dataloader(opts.test_img_db, opts.test_txt_db,
                                        opts.val_batch_size, False,
                                        EvalDatasetCls, eval_collate_fn, opts)

    # Prepare model; an empty dict means training from the base pretrained
    # weights only.
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}

    model = ModelCls.from_pretrained(opts.model_config,
                                     state_dict=checkpoint,
                                     img_dim=IMG_DIM)
    model.init_type_embedding()
    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      enabled=opts.fp16,
                                      opt_level='O2')

    global_step = 0
    # Rank 0 owns all filesystem side effects; other ranks get NoOp stand-ins.
    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        os.makedirs(join(opts.output_dir, 'results'))  # store val predictions
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Num examples = %d", len(train_dataloader.dataset))
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    running_loss = RunningMeter('loss')
    model.train()
    n_examples = 0
    n_epoch = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    # Loop over epochs indefinitely; termination is by global_step reaching
    # opts.num_train_steps, checked after every optimizer update.
    while True:
        for step, batch in enumerate(train_dataloader):
            targets = batch['targets']
            n_examples += targets.size(0)

            loss = model(**batch, compute_loss=True)
            loss = loss.mean()
            # delay_unscale keeps amp from unscaling until the last
            # micro-batch of an accumulation window.
            delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
            with amp.scale_loss(loss, optimizer,
                                delay_unscale=delay_unscale) as scaled_loss:
                scaled_loss.backward()
                if not delay_unscale:
                    # gather gradients from every processes
                    # do this before unscaling to make sure every process uses
                    # the same gradient scale
                    grads = [
                        p.grad.data for p in model.parameters()
                        if p.requires_grad and p.grad is not None
                    ]
                    all_reduce_and_rescale_tensors(grads, float(1))

            running_loss(loss.item())

            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1

                # learning rate scheduling
                lr_this_step = get_lr_sched(global_step, opts)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

                # log loss (averaged across all workers)
                losses = all_gather_list(running_loss)
                running_loss = RunningMeter(
                    'loss',
                    sum(l.val for l in losses) / len(losses))
                TB_LOGGER.add_scalar('loss', running_loss.val, global_step)
                TB_LOGGER.step()

                # update model params
                if opts.grad_norm != -1:
                    grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
                optimizer.step()
                optimizer.zero_grad()
                pbar.update(1)

                if global_step % 100 == 0:
                    # monitor training throughput
                    tot_ex = sum(all_gather_list(n_examples))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f'Step {global_step}: '
                                f'{tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar('perf/ex_per_s', ex_per_sec,
                                         global_step)

                if global_step % opts.valid_steps == 0:
                    for split, loader in [('val', val_dataloader),
                                          ('test', test_dataloader)]:
                        LOGGER.info(f"Step {global_step}: start running "
                                    f"validation on {split} split...")
                        log, results = validate(model, loader, split)
                        with open(
                                f'{opts.output_dir}/results/'
                                f'{split}_results_{global_step}_'
                                f'rank{rank}.csv', 'w') as f:
                            for id_, ans in results:
                                f.write(f'{id_},{ans}\n')
                        TB_LOGGER.log_scaler_dict(log)
                    model_saver.save(model, global_step)
            if global_step >= opts.num_train_steps:
                break
        if global_step >= opts.num_train_steps:
            break
        n_epoch += 1
        LOGGER.info(f"Step {global_step}: finished {n_epoch} epochs")
    # Final validation on both splits and a last checkpoint after training.
    for split, loader in [('val', val_dataloader), ('test', test_dataloader)]:
        LOGGER.info(f"Step {global_step}: start running "
                    f"validation on {split} split...")
        log, results = validate(model, loader, split)
        with open(
                f'{opts.output_dir}/results/'
                f'{split}_results_{global_step}_'
                f'rank{rank}_final.csv', 'w') as f:
            for id_, ans in results:
                f.write(f'{id_},{ans}\n')
        TB_LOGGER.log_scaler_dict(log)
    model_saver.save(model, f'{global_step}_final')
    def next(self):
        """
            Return the raw content of the next parseable e-mail in the queue.

            Refills the queue when exhausted. Mails that cannot be parsed are
            recorded in ``self._failed_uids`` (so they are skipped on the next
            refill), dropped from the queue, and the method retries with the
            following mail.

            :return: Raw content of the next successfully parsed e-mail
        """
        if self._current >= len(self._queue):
            self._feed_queue()

        res = self._queue[self._current]

        LOGGER.debug('Parsing mail...')
        try:
            self._parser = MailReaderFactory.get_reader_for_mail(res)
            self._current = self._current + 1
        except Exception as ex:
            LOGGER.error('Error while parsing mail #%s', self._uids[self._current])
            LOGGER.error('Unable to determine source of this mail (raw content follows): %s', ex)
            LOGGER.error('Retrieved email:\n%s', res)

            LOGGER.debug('-- Recovery mode --')
            # Remove by index, not by value: list.remove() would drop the
            # first matching entry, which may not be the failing one when the
            # queue contains duplicate uids or identical raw messages.
            failed_uid = self._uids.pop(self._current)
            # Remember the uid so this mail is never retried after a refill.
            self._failed_uids.append(failed_uid)
            # Drop the mail from the queue; without its uid it also won't be
            # deleted server-side by _feed_queue.
            del self._queue[self._current]

            LOGGER.debug('Ok. Now, try to fetch another mail...')

            # Try to fetch next mail one more time...
            return self.next()

        return res
Example #39
0
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import f1_score

    with timer("データ処理"):
        # Stratified train/validation split of the breast-cancer toy dataset,
        # wrapped in DataFrames so models see named feature columns.
        cancer = load_breast_cancer()
        data = cancer.data
        target = cancer.target
        train_x, valid_x, train_y, valid_y = train_test_split(data,
                                                              target,
                                                              stratify=target,
                                                              random_state=2)

        train_x = pd.DataFrame(train_x, columns=cancer.feature_names)
        valid_x = pd.DataFrame(valid_x, columns=cancer.feature_names)

    with timer("モデルを推論"):
        # Sum per-model predictions for a simple (seed-)average ensemble.
        preds = 0
        # TODO: change this part to the stacking model
        for i, model in enumerate(models):
            # NOTE(review): oof_pred is never used, and the first score below
            # only sees the *last* model's `pred` — confirm this is intended.
            oof_pred, pred = model.predict(train_x, valid_x, train_y)
            preds += pred

        # NOTE(review): f1_score's signature is (y_true, y_pred); predictions
        # are passed in the y_true slot here — verify the argument order.
        LOGGER.info(
            f1_score(np.argmax(pred, axis=1), valid_y, average="binary"))
        LOGGER.info(
            f1_score(np.argmax(preds / len(models), axis=1),
                     valid_y,
                     average="binary"))  # seed average
    def __register_impl(self, base, class_obj):
        """
            Register *class_obj* as the custom implementation for *base*.

            :param type base: Base class whose __name__ keys the registry
            :param type class_obj: Implementation class to associate with it
        """
        # Keyed by the base class *name*: a later registration for the same
        # base silently replaces the earlier one.
        self._registered_implementations[base.__name__] = class_obj

        LOGGER.debug("Custom implementation [%s] registered.", class_obj)
Example #41
0
def main(opts):
    """Train UniterForVisualEntailment with Horovod data-parallel training.

    Builds train/val/test dataloaders over the VE datasets, runs
    mixed-precision (apex amp) training with gradient accumulation,
    periodically validates both eval splits, and checkpoints via ModelSaver.
    Only rank 0 writes logs/checkpoints/results.

    :param opts: parsed command-line options (argparse-style namespace)
    """
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)

    # train_examples = None
    LOGGER.info(f"Loading Train Dataset {opts.train_txt_db}, "
                f"{opts.train_img_db}")
    train_dataloader = create_dataloader(
        opts.train_img_db,
        opts.train_txt_db,
        opts.train_batch_size,
        True,
        VeDataset,
        ve_collate,
        opts,
    )
    val_dataloader = create_dataloader(
        opts.val_img_db,
        opts.val_txt_db,
        opts.val_batch_size,
        False,
        VeEvalDataset,
        ve_eval_collate,
        opts,
    )
    test_dataloader = create_dataloader(
        opts.test_img_db,
        opts.test_txt_db,
        opts.val_batch_size,
        False,
        VeEvalDataset,
        ve_eval_collate,
        opts,
    )

    # Prepare model; an empty dict means training from base weights only.
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    # NOTE(review): bert_model is computed but never used below — confirm
    # whether it was meant to select the text encoder, and note the open()
    # here leaks a file handle.
    bert_model = json.load(open(f"{opts.train_txt_db}/meta.json"))["bert"]
    if "bert" not in bert_model:
        bert_model = "bert-large-cased"  # quick hack for glove exp
    model = UniterForVisualEntailment.from_pretrained(opts.model_config,
                                                      state_dict=checkpoint,
                                                      img_dim=IMG_DIM)
    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      enabled=opts.fp16,
                                      opt_level="O2")

    global_step = 0
    # Rank 0 owns all filesystem side effects; other ranks get NoOp stand-ins.
    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, "log"))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, "ckpt"))
        # NOTE(review): ans2label (and label2ans used in validate() below)
        # are not defined in this function — presumably module-level globals;
        # verify they exist in this script.
        pickle.dump(ans2label,
                    open(join(opts.output_dir, "ckpt", "ans2label.pkl"), "wb"))
        os.makedirs(join(opts.output_dir, "results"))  # store VQA predictions
        add_log_to_file(join(opts.output_dir, "log", "log.txt"))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Num examples = %d", len(train_dataloader.dataset))
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    running_loss = RunningMeter("loss")
    model.train()
    n_examples = 0
    n_epoch = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    # Loop over epochs indefinitely; termination is by global_step reaching
    # opts.num_train_steps, checked after every optimizer update.
    while True:
        for step, batch in enumerate(train_dataloader):
            n_examples += batch["input_ids"].size(0)

            loss = model(batch, compute_loss=True)
            loss = loss.mean() * batch["targets"].size(1)  # instance-leval bce
            # delay_unscale keeps amp from unscaling until the last
            # micro-batch of an accumulation window.
            delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
            with amp.scale_loss(loss, optimizer,
                                delay_unscale=delay_unscale) as scaled_loss:
                scaled_loss.backward()
                if not delay_unscale:
                    # gather gradients from every processes
                    # do this before unscaling to make sure every process uses
                    # the same gradient scale
                    grads = [
                        p.grad.data for p in model.parameters()
                        if p.requires_grad and p.grad is not None
                    ]
                    all_reduce_and_rescale_tensors(grads, float(1))

            running_loss(loss.item())

            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1

                # learning rate scheduling
                lr_this_step = get_lr_sched(global_step, opts)
                for param_group in optimizer.param_groups:
                    param_group["lr"] = lr_this_step
                TB_LOGGER.add_scalar("lr", lr_this_step, global_step)

                # log loss
                # NOTE: not gathered across GPUs for efficiency
                TB_LOGGER.add_scalar("loss", running_loss.val, global_step)
                TB_LOGGER.step()

                # update model params
                if opts.grad_norm != -1:
                    grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar("grad_norm", grad_norm, global_step)
                optimizer.step()
                optimizer.zero_grad()
                pbar.update(1)

                if global_step % 100 == 0:
                    # monitor training throughput
                    LOGGER.info(f"============Step {global_step}=============")
                    tot_ex = sum(all_gather_list(n_examples))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f"{tot_ex} examples trained at "
                                f"{ex_per_sec} ex/s")
                    TB_LOGGER.add_scalar("perf/ex_per_s", ex_per_sec,
                                         global_step)
                    LOGGER.info(f"===========================================")

                if global_step % opts.valid_steps == 0:
                    for split, loader in [
                        ("val", val_dataloader),
                        ("test", test_dataloader),
                    ]:
                        LOGGER.info(f"Step {global_step}: start running "
                                    f"validation on {split} split...")
                        val_log, results = validate(model, loader, label2ans,
                                                    split)
                        with open(
                                f"{opts.output_dir}/results/"
                                f"{split}_results_{global_step}_"
                                f"rank{rank}.json",
                                "w",
                        ) as f:
                            json.dump(results, f)
                        TB_LOGGER.log_scaler_dict(val_log)
                    model_saver.save(model, global_step)
            if global_step >= opts.num_train_steps:
                break
        if global_step >= opts.num_train_steps:
            break
        n_epoch += 1
        LOGGER.info(f"Step {global_step}: finished {n_epoch} epochs")
    # Final validation + checkpoint, unless the last step already validated.
    if opts.num_train_steps % opts.valid_steps != 0:
        for split, loader in [("val", val_dataloader),
                              ("test", test_dataloader)]:
            LOGGER.info(f"Step {global_step}: start running "
                        f"validation on {split} split...")
            val_log, results = validate(model, loader, label2ans, split)
            with open(
                    f"{opts.output_dir}/results/"
                    f"{split}_results_{global_step}_"
                    f"rank{rank}_final.json",
                    "w",
            ) as f:
                json.dump(results, f)
            TB_LOGGER.log_scaler_dict(val_log)
        model_saver.save(model, global_step)
Example #42
0
def inf_mlm(model,
            eval_loader,
            eval_len,
            label2ans,
            save_logits=False,
            task='mlm',
            predict_p=0,
            ensemble=1,
            text_only=False):
    """Run masked-LM inference and collect predicted token ids per question.

    For every masked position (token id 103) in each question's input_ids,
    records the model's predicted token id (or -1 / a top-k list, depending
    on ``predict_p`` and ``ensemble``).

    :param model: model callable as ``model(batch, compute_loss=False, ...)``
    :param eval_loader: iterable of batches containing 'qids' and 'input_ids'
    :param int eval_len: total number of examples (progress-bar total)
    :param label2ans: unused in this function (kept for interface parity)
    :param bool save_logits: currently has no effect — ``logits`` stays empty
    :param str task: task name forwarded to the model
    :param float predict_p: if > 0, only accept a prediction whose softmax
        probability is >= this threshold; rejected positions record -1
    :param int ensemble: if > 1, record the top-k token ids per mask instead
        of the single argmax
    :param bool text_only: forwarded to the model
    :return: (val_log throughput dict, results list, logits dict — always
        empty as written)
    """
    LOGGER.info("start running evaluation {}...".format(task))
    model.eval()
    n_ex = 0
    st = time()
    results = []
    # NOTE(review): logits is never populated, even with save_logits=True.
    logits = {}
    pbar = tqdm(total=eval_len)
    for i, batch in enumerate(eval_loader):
        qids = batch['qids']

        scores = model(batch,
                       compute_loss=False,
                       task=task,
                       text_only=text_only)
        # No masked positions in this batch -> nothing to predict.
        if scores.nelement() == 0:
            masked_toks = iter([])
        else:
            if predict_p > 0:
                assert predict_p <= 1, "Invalid prediction probability threshold {}".format(
                    predict_p)
                # Keep only predictions whose softmax confidence clears the
                # threshold; others are recorded as -1.
                softmax_scores = torch.nn.Softmax(dim=1)(scores)
                max_scores = softmax_scores.max(dim=-1, keepdim=False)
                scores = max_scores[0].cpu().tolist()
                indices = max_scores[1].cpu().tolist()
                masked_toks = []
                for max_scores_i in range(0, len(scores)):
                    if scores[max_scores_i] >= predict_p:
                        masked_toks.append(indices[max_scores_i])
                    else:
                        masked_toks.append(-1)
            else:
                masked_toks = scores.max(dim=-1,
                                         keepdim=False)[1].cpu().tolist()
                if ensemble > 1:
                    masked_toks = torch.topk(scores, ensemble,
                                             dim=-1)[1].cpu().tolist()
            masked_toks = iter(masked_toks)
        # Re-align flat per-mask predictions with each question's masks.
        for qid, q_toks in zip(qids, batch['input_ids']):
            predicted_toks = []
            for tok in q_toks:
                tok = tok.item()
                # 103 is presumably the [MASK] id of a BERT vocabulary —
                # confirm against the tokenizer in use.
                if tok == 103:
                    predicted_toks.append(next(masked_toks))
            results.append({
                'predicted_toks': predicted_toks,
                'question_id': qid
            })
        n_ex += len(qids)
        pbar.update(len(qids))
    n_ex = sum(all_gather_list(n_ex))
    tot_time = time() - st
    val_log = {'valid/ex_per_s': n_ex / tot_time}
    LOGGER.info(f"evaluation finished in {int(tot_time)} seconds "
                f"at {int(n_ex/tot_time)} examples per second")
    return val_log, results, logits
Example #43
0
'''
    Author: Ribbon Huang
    Wrapper around MongoDB access calls
'''
from utils.logger import LOGGER
import pymongo
from conf.settings import MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_SHEET, LOGGER_MONGO_NAME
from pymongo.errors import WriteError, WTimeoutError, ConnectionFailure
import numpy as np
import pandas as pd

# Logger for routine application logging
logger = LOGGER.createLogger(LOGGER_MONGO_NAME)


class MongoUse:
    def __init__(self):
        """
            Open a MongoDB connection and bind the configured collection.

            :raises ConnectionFailure: if the client cannot be created
            :raises TypeError: if the connection settings are malformed
        """
        try:
            self.client = pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT)
        except ConnectionFailure:
            # Re-raise after logging: the original swallowed the error and
            # then dereferenced the never-assigned self.client below, masking
            # the real failure behind an AttributeError.
            logger.warning('MongoDB ConnectionFailure')
            raise
        except TypeError:
            logger.warning('MongoDB Variables is error')
            raise

        db = self.client[MONGO_DB]
        self.sheet = db[MONGO_SHEET]

    def insertDb(self, info):
        try:
            self.sheet.insert(info)
        except WriteError:
def main(opts):
    """
        Train HeroForVideoQA on a video question-answering task.

        Initializes Horovod distributed training (one process per GPU), loads
        the shared video/subtitle databases and the train/val (and optional
        test) QA query databases, builds the model (optionally initialized
        from a checkpoint), then runs the mixed-precision training loop with
        gradient accumulation, periodic throughput logging, validation and
        checkpointing.

        :param opts: parsed command-line options (db paths, batch sizes,
            optimization/scheduling hyper-parameters, output dir, etc.)
    """
    # --- distributed / device setup: one process per GPU, pinned by local rank
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    opts.n_gpu = n_gpu
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))
    # only rank 0 logs; all other ranks are silenced
    if hvd.rank() != 0:
        LOGGER.disabled = True
    set_random_seed(opts.seed)

    # train_examples = None
    # the video+subtitle db is loaded once and shared by all dataloaders
    LOGGER.info(f"Loading the whole video dataset {opts.sub_txt_db}, "
                f"{opts.vfeat_db}")
    video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                      opts.vfeat_interval, opts)

    # data loaders
    # train
    LOGGER.info(f"Loading the train QA dataset {opts.train_query_txt_db}")
    video_ids = get_video_ids(opts.train_query_txt_db)
    train_q_txt_db = QaQueryTokLmdb(opts.train_query_txt_db, opts.max_txt_len)
    train_dataloaders = build_downstream_dataloaders([opts.task],
                                                     video_db,
                                                     video_ids,
                                                     True,
                                                     opts,
                                                     q_txt_db=train_q_txt_db,
                                                     shuffle=True)
    # MetaLoader multiplexes the per-task loaders; PrefetchLoader moves
    # batches to GPU asynchronously
    meta_loader = MetaLoader(train_dataloaders,
                             accum_steps=opts.gradient_accumulation_steps,
                             distributed=n_gpu > 1)
    meta_loader = PrefetchLoader(meta_loader)

    # val
    LOGGER.info(f"Loading the val QA dataset {opts.val_query_txt_db}")
    video_ids = get_video_ids(opts.val_query_txt_db)
    # NOTE(review): max_txt_len=-1 at eval time — presumably disables query
    # truncation; confirm against QaQueryTokLmdb
    val_q_txt_db = QaQueryTokLmdb(opts.val_query_txt_db, -1)
    val_dataloaders = build_downstream_dataloaders([opts.task],
                                                   video_db,
                                                   video_ids,
                                                   False,
                                                   opts,
                                                   q_txt_db=val_q_txt_db)
    # test loader is optional
    if opts.test_query_txt_db:
        LOGGER.info(f"Loading the test QA dataset {opts.test_query_txt_db}")
        video_ids = get_video_ids(opts.test_query_txt_db)
        test_q_txt_db = QaQueryTokLmdb(opts.test_query_txt_db, -1)
        test_dataloaders = build_downstream_dataloaders([opts.task],
                                                        video_db,
                                                        video_ids,
                                                        False,
                                                        opts,
                                                        q_txt_db=test_q_txt_db)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    # derive the max frame sequence length from the checkpoint's image
    # position-embedding table when present, else fall back to the constant
    img_pos_embed_weight_key = "v_encoder.f_encoder.img_embeddings" +\
        ".position_embeddings.weight"
    if img_pos_embed_weight_key in checkpoint:
        max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])
    else:
        max_frm_seq_len = MAX_FRM_SEQ_LEN

    model = HeroForVideoQA.from_pretrained(opts.model_config,
                                           state_dict=checkpoint,
                                           vfeat_dim=VFEAT_DIM,
                                           max_frm_seq_len=max_frm_seq_len)

    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    # one amp loss scaler per training task (loss_id below indexes into this)
    task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())}
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      num_losses=len(task2scaler),
                                      enabled=opts.fp16,
                                      opt_level='O2')
    # restores model/optimizer state when resuming from a previous run
    restorer = TrainingRestorer(opts, model, optimizer)
    global_step = restorer.global_step
    TB_LOGGER.global_step = global_step
    # rank 0 owns all filesystem side effects (meta, TB logs, checkpoints)
    if hvd.rank() == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        if not exists(join(opts.output_dir, 'results')):
            # store tvqa predictions
            os.makedirs(join(opts.output_dir, 'results'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()
        restorer = NoOp()

    # fast-forward the progress bar when resuming
    if global_step > 0:
        pbar.update(global_step)
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    # running loss meters: one per task ...
    task2loss = {
        task: RunningMeter(f'loss/{task}')
        for task in train_dataloaders.keys()
    }

    # ... plus separate meters for the qa and span (st_ed) sub-losses
    for obj in (f'{opts.task}_qa', f'{opts.task}_st_ed'):
        task2loss[obj] = RunningMeter(f'loss/{obj}')

    model.train()
    n_examples = defaultdict(int)
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    if global_step == 0:
        optimizer.step()
    for step, (task, batch) in enumerate(meta_loader):
        n_examples[task] += opts.train_batch_size

        # model returns (qa loss, span start/end loss) when compute_loss=True
        loss = model(batch, task=task, compute_loss=True)

        loss_qa, loss_st_ed = loss
        # total loss = qa loss + weighted span loss
        loss = loss_qa + opts.lw_st_ed * loss_st_ed
        for n, ls in (('st_ed', loss_st_ed), ('qa', loss_qa)):
            ls = ls.item()
            task2loss[f'{task}_{n}'](ls)

        loss = loss.mean()
        task2loss[task](loss.item())

        # keep gradients scaled (and skip all-reduce) until the last
        # micro-batch of the accumulation window
        delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
        with amp.scale_loss(loss,
                            optimizer,
                            delay_unscale=delay_unscale,
                            loss_id=task2scaler[task]) as scaled_loss:
            scaled_loss.backward()
            if not delay_unscale:
                # gather gradients from every processes
                # do this before unscaling to make sure every process uses
                # the same gradient scale
                grads = [
                    p.grad.data for p in model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                all_reduce_and_rescale_tensors(grads, float(1))

        # optimizer update only at accumulation-window boundaries
        if (step + 1) % opts.gradient_accumulation_steps == 0:
            global_step += 1

            # learning rate scheduling
            lr_this_step = get_lr_sched(global_step, opts)
            # NOTE(review): assumes build_optimizer produced exactly 4 param
            # groups — the first two scaled by lr_mul; verify against
            # build_optimizer
            for i, param_group in enumerate(optimizer.param_groups):
                if i == 0 or i == 1:
                    param_group['lr'] = lr_this_step * opts.lr_mul
                elif i == 2 or i == 3:
                    param_group['lr'] = lr_this_step
                else:
                    raise ValueError()
            TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

            # log all loss meters that have received at least one value
            TB_LOGGER.log_scaler_dict({
                temp_loss.name: temp_loss.val
                for temp_loss in task2loss.values()
                if temp_loss.val is not None
            })
            TB_LOGGER.step()

            # update model params
            # grad_norm == -1 disables gradient clipping
            if opts.grad_norm != -1:
                grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                            opts.grad_norm)
                TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
            optimizer.step()
            optimizer.zero_grad()
            restorer.step()
            pbar.update(1)

            if global_step % 100 == 0:
                # monitor training throughput
                LOGGER.info('-------------------------------------------')
                LOGGER.info(f'Step {global_step}:')
                for t in train_dataloaders.keys():
                    tot_ex = sum(all_gather_list(n_examples[t]))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f'{t}: {tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec,
                                         global_step)

            # periodic validation + checkpoint
            if global_step % opts.valid_steps == 0:
                LOGGER.info('===========================================')
                LOGGER.info(f"Step {global_step}: start running validation")
                validate(model,
                         val_dataloaders,
                         "val",
                         opts,
                         global_step=global_step)
                if opts.test_query_txt_db:
                    validate(model,
                             test_dataloaders,
                             "test",
                             opts,
                             global_step=global_step)
                LOGGER.info('===========================================')
                model_saver.save(model, global_step)
        if global_step >= opts.num_train_steps:
            break

    LOGGER.info('===========================================')
    # final validation, unless the loop already validated at this exact step
    if global_step % opts.valid_steps != 0:
        LOGGER.info('===========================================')
        LOGGER.info(f"Step {global_step}: start running validation")
        validate(model, val_dataloaders, "val", opts, global_step=global_step)
        if opts.test_query_txt_db:
            validate(model,
                     test_dataloaders,
                     "test",
                     opts,
                     global_step=global_step)
        LOGGER.info('===========================================')
    model_saver.save(model, f'{global_step}_final')