def get_reader_for_mail(raw):
        """
            Automatically detect the appropriate reader that will be able to
            read the passed e-mail. This method is static.

            :param str raw: The raw e-mail content
            :rtype: Object
            :return: An instance of :py:class:`parsing.mails.mailreader.AbstractMailReader`
        """
        match = re.search(
            r'{}:\s(.*)'.format(secrets.SCORING_EMAIL['partner_header']), raw)
        if not match:
            raise Exception(
                'Malformed input mail :: missing header [{}]'.format(
                    secrets.SCORING_EMAIL['partner_header']))

        source = match.group(1).strip()

        LOGGER.debug('Mail from %s', source)
        if source in ("AOL", "SignalSpam"):
            return arf.ArfReader(raw, source)
        elif source == "SpamCop":
            return spamcop.SpamcopReader(raw)

        raise Exception(
            'Malformed input mail :: unknown value [{}] for header [{}]'.
            format(source, secrets.SCORING_EMAIL['partner_header']))
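
A minimal usage sketch for this factory method, assuming it is exposed as a static method of MailReaderFactory (the name used by the next() example further down) and that the partner header is called "X-Partner" (an assumption; the real name comes from secrets.SCORING_EMAIL):

# Hypothetical caller; the header name and mail body are placeholders.
raw_mail = (
    'X-Partner: SpamCop\n'
    'Subject: abuse report\n'
    '\n'
    'report body...'
)
try:
    reader = MailReaderFactory.get_reader_for_mail(raw_mail)
except Exception as exc:
    # Missing or unknown partner header: the factory raises instead of guessing.
    LOGGER.error('Cannot pick a reader: %s', exc)
else:
    LOGGER.info('Using reader %s', type(reader).__name__)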
Example #2
    def push_ip_document(self, input_dict):
        """
            Push a new document regarding an IP or update existing document to
            append new data.

            :param dict input_dict: Expect a dictionary having at least those
                fields: [IP, filename, weight, source, timestamp, raw]
        """
        file_doc = self._build_file_document(input_dict)
        input_dict['filename'] = file_doc['filename']

        if self.does_ip_exist(input_dict['ip']):
            LOGGER.debug('IP [%s] already exists. Update...', input_dict['ip'])
            self._ip_collection.update(
                {'ip': input_dict['ip']},
                {'$push': {
                    'events': self._build_event_document(input_dict)
                }})
        else:
            LOGGER.debug('Brand new IP [%s]. Insert...', input_dict['ip'])
            doc = self._build_full_document(input_dict)
            self._ip_collection.save(doc)
            self._ip_cache.append(input_dict['ip'])

        self._raw_collection.save(file_doc)
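
A sketch of how a parser might feed this method, assuming the mongo.Mongo() context manager used by the run() examples below; every field value is a placeholder.

import time

# Hypothetical input document; the field names follow the docstring above.
document = {
    'ip': '192.0.2.10',              # TEST-NET address used as a placeholder
    'weight': 5,
    'source': 'SpamCop',
    'timestamp': int(time.time()),
    'raw': 'original report content',
}
with mongo.Mongo() as database:
    database.push_ip_document(document)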
Example #3
    def _feed_queue(self, first_call=False):
        """
            Read the next inqueued e-mails.

            :param bool first_call: Whether this is the first time this method is called (default False)
        """
        if not first_call:
            self._delete_messages()

        self._queue = []
        self._uids = []
        self._current = 0

        while not self._queue:
            _, data = self._imap.search(None, 'ALL')
            uids = data[0].split()
            msg_pack = uids[:10] if len(uids) > 10 else uids
            for num in msg_pack:
                # Skip mails that previously failed
                if num in self._failed_uids:
                    continue

                _, raw_msg = self._imap.fetch(num, '(RFC822)')
                self._queue.append(raw_msg[0][1])
                self._uids.append(num)

            if not self._queue:
                LOGGER.debug('No email retrieved. Waiting before retrying.')
                time.sleep(10)
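
The method assumes self._imap is an already-authenticated imaplib connection with a selected mailbox. A minimal setup sketch with placeholder host and credentials:

import imaplib

# Placeholder host/credentials; a real reader would load these from configuration.
imap = imaplib.IMAP4_SSL('imap.example.com')
imap.login('reports@example.com', 'secret')
imap.select('INBOX')    # _feed_queue then runs search()/fetch() against this mailbox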
Example #4
    def recreate_dataset_perm(cls, dataset_id, authorized_groups=None):
        logger.debug(f"looking for dataset <{dataset_id}>")
        dataset_object, _ = cls.objects.get_or_create(dataset_id=dataset_id)

        groups = dataset_object.default_groups
        if authorized_groups is not None:
            groups += authorized_groups
        LOGGER.debug("recreating groups: {}".format(groups))
        for group_name in set(groups):
            group, _created = Group.objects.get_or_create(name=group_name)
            assign_perm("view", group, dataset_object)
    def purge_old_documents(self):
        """
            Archive IP sub-documents older than a month ago. These documents are moved
            into a dedicated archiving collection.
        """
        total_count = 0
        request = {
            'events.timestamp': {
                '$lt': A_MONTH_AGO
            }
        }

        LOGGER.debug("Archiving events older than %d...", A_MONTH_AGO)
        for doc in self._ip_collection.find(request):
            archives_bulk = []

            for event in doc['events']:
                # find() matched documents with at least one timestamp < A_MONTH_AGO;
                # this check keeps only the event sub-documents that are actually expired.
                if event['timestamp'] < A_MONTH_AGO:
                    archives_bulk.append({
                        'ip': doc['ip'],
                        'filename': event['filename'],
                        'source': event['source'],
                        'weight': event['weight'],
                        'timestamp': event['timestamp']
                    })

            result = self._archive_collection.insert(archives_bulk)
            total_count = total_count + len(result)

        self._ip_collection.update(request, {
            '$pull': {
                'events': {
                    'timestamp': {
                        '$lt': A_MONTH_AGO
                    }
                }
            }
        }, multi=True)
        LOGGER.info('%d documents archived.', total_count)

        # Remove single entries
        result = self._ip_collection.remove({
            'events.timestamp': {
                '$exists': False
            }
        }, multi=True)

        LOGGER.info('%d single entries have been removed.', result['n'])
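
A_MONTH_AGO (and YESTERDAY, used by run() below) are module-level constants not shown in these snippets; a plausible definition, assuming timestamps are stored as Unix epoch seconds, would be:

import time

# Assumed definitions; the real module may compute these constants differently.
A_MONTH_AGO = int(time.time()) - 30 * 24 * 3600
YESTERDAY = int(time.time()) - 24 * 3600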
Example #6
    def purge_old_documents(self):
        """
            Archive IP sub-documents older than a month ago. These documents are moved
            into a dedicated archiving collection.
        """
        total_count = 0
        request = {'events.timestamp': {'$lt': A_MONTH_AGO}}

        LOGGER.debug("Archiving events older than %d...", A_MONTH_AGO)
        for doc in self._ip_collection.find(request):
            archives_bulk = []

            for event in doc['events']:
                # find() matched documents with at least one timestamp < A_MONTH_AGO;
                # this check keeps only the event sub-documents that are actually expired.
                if event['timestamp'] < A_MONTH_AGO:
                    archives_bulk.append({
                        'ip': doc['ip'],
                        'filename': event['filename'],
                        'source': event['source'],
                        'weight': event['weight'],
                        'timestamp': event['timestamp']
                    })

            result = self._archive_collection.insert(archives_bulk)
            total_count += len(result)

        self._ip_collection.update(
            request,
            {'$pull': {
                'events': {
                    'timestamp': {
                        '$lt': A_MONTH_AGO
                    }
                }
            }},
            multi=True)
        LOGGER.info('%d documents archived.', total_count)

        # Remove single entries
        result = self._ip_collection.remove(
            {'events.timestamp': {
                '$exists': False
            }}, multi=True)

        LOGGER.info('%d single entries have been removed.', result['n'])
    def run(self):
        """
            Run the parser.
        """
        with mongo.Mongo() as database:
            current = self.next()
            while current:
                try:
                    addr = self.get_ip(current)
                    if not addr:
                        LOGGER.info('Entry skipped because no specified IP.')
                        current = self.next()
                        continue

                    if not utils.is_managed_ip(addr):
                        LOGGER.debug('Not a managed IP [%s].', addr)
                        current = self.next()
                        continue

                    doc_ts = int(
                        time.mktime(self.get_date(current).timetuple()))
                    if doc_ts < YESTERDAY:
                        LOGGER.debug('This entry is too old [%s].',
                                     self.get_date(current))
                        current = self.next()
                        continue

                    document = {
                        'ip': addr,
                        'timestamp': doc_ts,
                        'weight': self.compute_weight(current),
                        'source': self.get_source(current),
                        'raw': self.get_raw(current)
                    }
                    database.push_ip_document(document)
                except Exception as exc:
                    LOGGER.error('Unexpected error: %s [%s]', type(exc), exc)
                    LOGGER.error(traceback.format_exc())

                current = self.next()
            self.close()
Example #8
    def next(self):
        if self._current >= len(self._queue):
            self._feed_queue()

        res = self._queue[self._current]

        LOGGER.debug('Parsing mail...')
        try:
            self._parser = MailReaderFactory.get_reader_for_mail(res)
            self._current = self._current + 1
        except Exception as ex:
            LOGGER.error('Error while parsing mail #%s', self._uids[self._current])
            LOGGER.error('Unable to determine source of this mail (raw content follows): %s', ex)
            LOGGER.error('Retrieved email:\n%s', res)

            LOGGER.debug('-- Recovery mode --')
            # Add this uid to the failed list so we don't try to parse this mail again
            self._failed_uids.append(self._uids[self._current])
            # Remove uid from the list so this email won't be deleted.
            self._uids.remove(self._uids[self._current])
            # Remove mail from the queue
            self._queue.remove(res)

            LOGGER.debug('Ok. Now, try to fetch another mail...')

            # Try to fetch next mail one more time...
            return self.next()

        return res
    def run(self):
        """
            Run the parser.
        """
        with mongo.Mongo() as database:
            current = self.next()
            while current:
                try:
                    addr = self.get_ip(current)
                    if not addr:
                        LOGGER.info('Entry skipped because no specified IP.')
                        current = self.next()
                        continue

                    if not utils.is_managed_ip(addr):
                        LOGGER.debug('Not a managed IP [%s].', addr)
                        current = self.next()
                        continue

                    doc_ts = int(time.mktime(self.get_date(current).timetuple()))
                    if doc_ts < YESTERDAY:
                        LOGGER.debug('This entry is too old [%s].', self.get_date(current))
                        current = self.next()
                        continue

                    document = {
                        'ip': addr,
                        'timestamp': doc_ts,
                        'weight': self.compute_weight(current),
                        'source': self.get_source(current),
                        'raw': self.get_raw(current)
                    }
                    database.push_ip_document(document)
                except Exception as exc:
                    LOGGER.error('Unexpected error: %s [%s]', type(exc), exc)
                    LOGGER.error(traceback.format_exc())

                current = self.next()
            self.close()
Example #10
    def email_user(self, subject, message, from_email=None):
        if from_email is None:
            from_email = settings.DEFAULT_FROM_EMAIL

        try:
            override = settings.EMAIL_OVERRIDE
        except Exception:
            LOGGER.debug("no email override; sending email")
            override = None
        if override:
            to_email = override
        else:
            to_email = self.email

        mail = send_mail(subject, message, from_email, [to_email])
        LOGGER.info("email sent: to:      <" + str(self.email) + ">")
        LOGGER.info("email sent: from:    <" + str(from_email) + ">")
        LOGGER.info("email sent: subject: " + str(subject))
        LOGGER.info("email sent: message: " + str(message))

        return mail
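
The override logic above relies on an optional Django setting; a sketch of the relevant settings and of a call on a user instance (addresses and the user variable are illustrative):

# settings.py (illustrative values)
DEFAULT_FROM_EMAIL = 'noreply@example.com'
EMAIL_OVERRIDE = 'qa-inbox@example.com'    # when set, all outgoing mail is redirected here

# application code, given some user instance
user.email_user('Weekly report', 'The report is attached.')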
    def get_reader_for_mail(raw):
        """
            Automatically detect the appropriate reader that will be able to
            read the passed e-mail. This method is static.

            :param str raw: The raw e-mail content
            :rtype: Object
            :return: An instance of :py:class:`parsing.mails.mailreader.AbstractMailReader`
        """
        match = re.search(r'{}:\s(.*)'.format(settings.SCORING_EMAIL['partner_header']), raw)
        if not match:
            raise Exception('Malformed input mail :: missing header [{}]'.format(settings.SCORING_EMAIL['partner_header']))

        source = match.group(1).strip()

        LOGGER.debug('Mail from %s', source)
        if source in ("AOL", "SignalSpam"):
            return arf.ArfReader(raw, source)
        elif source == "SpamCop":
            return spamcop.SpamcopReader(raw)

        raise Exception(
            'Malformed input mail :: unknown value [{}] for header [{}]'.format(source, settings.SCORING_EMAIL['partner_header'])
        )
    def push_ip_document(self, input_dict):
        """
            Push a new document regarding an IP or update existing document to
            append new data.

            :param dict input_dict: Expect a dictionary having at least those
                fields: [IP, filename, weight, source, timestamp, raw]
        """
        file_doc = self._build_file_document(input_dict)
        input_dict['filename'] = file_doc['filename']

        if self.does_ip_exist(input_dict['ip']):
            LOGGER.debug('IP [%s] already exists. Update...', input_dict['ip'])
            self._ip_collection.update(
                {'ip': input_dict['ip']},
                {'$push': {'events': self._build_event_document(input_dict)}}
            )
        else:
            LOGGER.debug('Brand new IP [%s]. Insert...', input_dict['ip'])
            doc = self._build_full_document(input_dict)
            self._ip_collection.save(doc)
            self._ip_cache.append(input_dict['ip'])

        self._raw_collection.save(file_doc)
    def __register_impl(self, base, class_obj):
        self._registered_implementations[base.__name__] = class_obj

        LOGGER.debug("Custom implementation [%s] registered.", class_obj)
Example #15
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    opts.n_gpu = n_gpu
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))
    if hvd.rank() != 0:
        LOGGER.disabled = True

    set_random_seed(opts.seed)

    # data loaders
    train_dataloaders = {}
    val_dataloaders = {}
    for target, t_r in zip(opts.targets, opts.targets_ratio):
        train_loaders, val_loaders = build_target_loaders(
            target, t_r,
            opts)  # choose which task and get the corresponding task dataloader
        train_dataloaders.update(train_loaders)
        val_dataloaders.update(val_loaders)
    meta_loader = MetaLoader(train_dataloaders,
                             accum_steps=opts.gradient_accumulation_steps,
                             distributed=n_gpu > 1)
    meta_loader = PrefetchLoader(meta_loader)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    img_pos_embed_weight_key = "v_encoder.f_encoder.img_embeddings" +\
        ".position_embeddings.weight"
    if img_pos_embed_weight_key in checkpoint:
        max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])
    else:
        max_frm_seq_len = MAX_FRM_SEQ_LEN

    if opts.load_partial_pretrained:
        # from roberta
        model = HeroForPretraining(VideoModelConfig(opts.model_config),
                                   vfeat_dim=VFEAT_DIM,
                                   max_frm_seq_len=max_frm_seq_len,
                                   lw_neg_ctx=opts.lw_neg_ctx,
                                   lw_neg_q=opts.lw_neg_q,
                                   lw_st_ed=0,
                                   ranking_loss_type=opts.ranking_loss_type,
                                   use_hard_negative=False,
                                   hard_pool_size=opts.hard_pool_size,
                                   margin=opts.margin,
                                   use_all_neg=opts.use_all_neg,
                                   drop_svmr_prob=opts.drop_svmr_prob)
        model.load_partial_pretrained(checkpoint,
                                      VFEAT_DIM,
                                      max_frm_seq_len,
                                      skip_layers=opts.skip_layer_loading)
    else:
        # continue training
        model = HeroForPretraining.from_pretrained(
            opts.model_config,
            state_dict=checkpoint,
            vfeat_dim=VFEAT_DIM,
            max_frm_seq_len=max_frm_seq_len,
            lw_neg_ctx=opts.lw_neg_ctx,
            lw_neg_q=opts.lw_neg_q,
            lw_st_ed=0,
            ranking_loss_type=opts.ranking_loss_type,
            use_hard_negative=False,
            hard_pool_size=opts.hard_pool_size,
            margin=opts.margin,
            use_all_neg=opts.use_all_neg,
            drop_svmr_prob=opts.drop_svmr_prob)

    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())}
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      num_losses=len(task2scaler),
                                      enabled=opts.fp16,
                                      opt_level='O2')
    restorer = TrainingRestorer(opts, model, optimizer)
    all_gather_list(None)  # sync to prevent slower rank to read training meta
    global_step = restorer.global_step
    TB_LOGGER.global_step = global_step
    if hvd.rank() == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        pbar = NoOp()
        model_saver = NoOp()
        restorer = NoOp()

    if global_step > 0:
        pbar.update(global_step)
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    task2loss = {
        task: RunningMeter(f'loss/{task}')
        for task in train_dataloaders.keys()
    }
    for task in train_dataloaders.keys():
        if task.startswith('vsm'):
            for obj in ('st_ed', 'neg_ctx', 'neg_q'):
                task2loss[f"{task}_{obj}"] = RunningMeter(f'loss/{task}_{obj}')
    model.train()
    n_examples = defaultdict(int)
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    if global_step == 0:
        optimizer.step()
    assert all(global_step == s for s in all_gather_list(global_step))
    for step, (task, batch) in enumerate(meta_loader):
        LOGGER.debug(f"Task: {task}")

        # hard negative in VSM
        if len(opts.hard_negtiave_start_step) > 0:
            for i, hn_step in enumerate(opts.hard_negtiave_start_step):
                if global_step >= hn_step and hn_step != -1:
                    model.set_hard_negative(True, opts.hard_pool_size[i],
                                            opts.hard_neg_weights[i])

        # start-end loss
        if opts.train_span_start_step != -1 and\
                global_step >= opts.train_span_start_step:
            model.set_train_st_ed(opts.lw_st_ed)

        train_task = task.split('_')[0]
        n_examples[task] += opts.train_batch_size

        loss = model(batch, task=train_task, compute_loss=True)
        if train_task == 'vsm':
            loss_st_ed, loss_neg_ctx, loss_neg_q = loss
            loss = loss_st_ed + loss_neg_ctx + loss_neg_q
            for n, ls, w in (('st_ed', loss_st_ed, opts.lw_st_ed),
                             ('neg_ctx', loss_neg_ctx, opts.lw_neg_ctx),
                             ('neg_q', loss_neg_q, opts.lw_neg_q)):
                ls = ls.item()
                if w:
                    ls /= w
                task2loss[f'{task}_{n}'](ls)
        elif train_task == "mffr":
            loss = torch.sqrt(loss.sum(dim=1))

        loss = loss.mean()
        task2loss[task](loss.item())

        delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
        with amp.scale_loss(loss,
                            optimizer,
                            delay_unscale=delay_unscale,
                            loss_id=task2scaler[task]) as scaled_loss:
            scaled_loss.backward()
            if not delay_unscale:
                # gather gradients from every processes
                # do this before unscaling to make sure every process uses
                # the same gradient scale
                grads = [
                    p.grad.data for p in model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                LOGGER.debug("before reduce grad")
                all_reduce_and_rescale_tensors(grads, float(1))
                LOGGER.debug("after reduce grad")

        if (step + 1) % opts.gradient_accumulation_steps == 0:
            global_step += 1

            # learning rate scheduling
            lr_this_step = get_lr_sched(global_step, opts)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

            # log loss
            # NOTE: only consider rank 0 for speed
            TB_LOGGER.log_scaler_dict({
                ll.name: ll.val
                for ll in task2loss.values() if ll.val is not None
            })
            TB_LOGGER.step()

            LOGGER.debug("before norm grad")
            # update model params
            if opts.grad_norm != -1:
                grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                            opts.grad_norm)
                TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
            LOGGER.debug("after norm grad")
            LOGGER.debug("before optim step")
            optimizer.step()
            optimizer.zero_grad()
            pbar.update(1)
            LOGGER.debug("after optim step")

            if global_step % 100 == 0:
                LOGGER.debug("after gather stats")
                # monitor training throughput
                LOGGER.info('-------------------------------------------')
                LOGGER.info(f'Step {global_step}:')
                for t in train_dataloaders.keys():
                    tot_ex = sum(all_gather_list(n_examples[t]))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f'{t}: {tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec,
                                         global_step)
                LOGGER.debug("after gather stats")

            if global_step % opts.valid_steps == 0:
                LOGGER.info('===========================================')
                LOGGER.info(f"Step {global_step}: start running validation")
                validate(model, val_dataloaders, opts)
                LOGGER.info('===========================================')
                model_saver.save(model, global_step)

            # step restorer in the end to prevent missing validation checkpoint
            restorer.step()
        if global_step >= opts.num_train_steps:
            break

    LOGGER.info('===========================================')
    if global_step % opts.valid_steps != 0:
        LOGGER.info('===========================================')
        LOGGER.info(f"Step {global_step}: start running validation")
        validate(model, val_dataloaders, opts)
        LOGGER.info('===========================================')
        model_saver.save(model, global_step)
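
The loop above combines Horovod, apex amp and gradient accumulation; a stripped-down sketch of the accumulation pattern alone, in plain PyTorch with toy stand-ins for the model and data:

import torch
from torch import nn

model = nn.Linear(16, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loader = [(torch.randn(8, 16), torch.randn(8, 1)) for _ in range(8)]

accum_steps = 4
optimizer.zero_grad()
for step, (x, y) in enumerate(loader):
    loss = nn.functional.mse_loss(model(x), y) / accum_steps  # average over micro-batches
    loss.backward()
    if (step + 1) % accum_steps == 0:
        # clip, then apply the accumulated gradients, mirroring the main loop
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)
        optimizer.step()
        optimizer.zero_grad()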