def get_reader_for_mail(raw):
    """
    Automatically detect the appropriate reader that will be able to read
    the passed e-mail. This method is static.

    :param str raw: The raw e-mail content
    :rtype: Object
    :return: An instance of
        :py:class:`parsing.mails.mailreader.AbstractMailReader`
    """
    match = re.search(
        r'{}:\s(.*)'.format(secrets.SCORING_EMAIL['partner_header']), raw)
    if not match:
        raise Exception(
            'Malformed input mail :: missing header [{}]'.format(
                secrets.SCORING_EMAIL['partner_header']))

    source = match.group(1).strip()
    LOGGER.debug('Mail from %s', source)

    if source in ("AOL", "SignalSpam"):
        return arf.ArfReader(raw, source)
    elif source == "SpamCop":
        return spamcop.SpamcopReader(raw)

    raise Exception(
        'Malformed input mail :: unknown value [{}] for header [{}]'.format(
            source, secrets.SCORING_EMAIL['partner_header']))
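# A minimal usage sketch for the factory above, assuming it is exposed as a
# static method of MailReaderFactory (as the snippets below use it). The
# 'X-Partner-Source' header name is a hypothetical stand-in for whatever
# secrets.SCORING_EMAIL['partner_header'] is configured to.
sample_raw = (
    'X-Partner-Source: SpamCop\r\n'
    'From: reports@example.org\r\n'
    '\r\n'
    'report body'
)
reader = MailReaderFactory.get_reader_for_mail(sample_raw)  # -> SpamcopReader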
def push_ip_document(self, input_dict):
    """
    Push a new document regarding an IP, or update the existing document
    to append new data.

    :param dict input_dict: Expects a dictionary having at least these
        fields: [ip, filename, weight, source, timestamp, raw]
    """
    file_doc = self._build_file_document(input_dict)
    input_dict['filename'] = file_doc['filename']

    if self.does_ip_exist(input_dict['ip']):
        LOGGER.debug('IP [%s] already exists. Update...', input_dict['ip'])
        self._ip_collection.update(
            {'ip': input_dict['ip']},
            {'$push': {'events': self._build_event_document(input_dict)}})
    else:
        LOGGER.debug('Brand new IP [%s]. Insert...', input_dict['ip'])
        doc = self._build_full_document(input_dict)
        self._ip_collection.save(doc)
        self._ip_cache.append(input_dict['ip'])

    self._raw_collection.save(file_doc)
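# A hedged example of the document push_ip_document expects; every value
# below is made up for illustration.
sample_report = {
    'ip': '192.0.2.10',            # reported IP address
    'filename': 'mail-0001.eml',   # replaced by the stored file's name
    'weight': 5,                   # abuse score carried by this report
    'source': 'SpamCop',           # reporting partner
    'timestamp': 1428883200,       # UNIX timestamp of the report
    'raw': 'full e-mail content',  # raw mail archived alongside
}
# database.push_ip_document(sample_report)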
def _feed_queue(self, first_call=False):
    """
    Read the next enqueued e-mails.

    :param bool first_call: Whether this is the first time this method is
        called (default: False)
    """
    if not first_call:
        self._delete_messages()

    self._queue = []
    self._uids = []
    self._current = 0

    while not len(self._queue):
        _, data = self._imap.search(None, 'ALL')
        uids = data[0].split()
        msg_pack = uids[:10] if len(uids) > 10 else uids

        for num in msg_pack:
            # Skip mails that previously failed
            if num in self._failed_uids:
                continue

            _, raw_msg = self._imap.fetch(num, '(RFC822)')
            self._queue.append(raw_msg[0][1])
            self._uids.append(num)

        if not len(self._queue):
            LOGGER.debug('No email retrieved. Waiting before retrying.')
            time.sleep(10)
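# A self-contained sketch of the IMAP polling pattern used above, with
# hypothetical connection details. For an '(RFC822)' fetch, imaplib returns
# the raw message bytes at response[0][1], exactly as consumed above.
import imaplib

def fetch_mail_batch(host, user, password, batch_size=10):
    imap = imaplib.IMAP4_SSL(host)
    imap.login(user, password)
    imap.select('INBOX')
    _, data = imap.search(None, 'ALL')
    uids = data[0].split()
    messages = []
    for num in uids[:batch_size]:
        _, raw_msg = imap.fetch(num, '(RFC822)')
        messages.append(raw_msg[0][1])
    imap.logout()
    return messages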
def recreate_dataset_perm(cls, dataset_id, authorized_groups=None):
    LOGGER.debug("looking for dataset <%s>", dataset_id)
    dataset_object, _ = cls.objects.get_or_create(dataset_id=dataset_id)
    # Copy the defaults so extending the list does not mutate the model.
    groups = list(dataset_object.default_groups)
    if authorized_groups is not None:
        groups += authorized_groups
    LOGGER.debug("recreating groups: %s", groups)
    for group_name in set(groups):
        group, _created = Group.objects.get_or_create(name=group_name)
        assign_perm("view", group, dataset_object)
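# A minimal sketch of consuming the permissions granted above, assuming
# django-guardian's shortcuts; `Dataset` is a hypothetical stand-in for the
# model class behind `cls`.
from guardian.shortcuts import get_objects_for_user

def datasets_visible_to(user):
    # Objects on which `user` holds the object-level "view" permission,
    # directly or through one of the groups created above.
    return get_objects_for_user(user, "view", klass=Dataset)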
def purge_old_documents(self):
    """
    Archive IP sub-documents older than one month. These documents are
    moved into a dedicated archiving collection.
    """
    total_count = 0
    request = {'events.timestamp': {'$lt': A_MONTH_AGO}}

    LOGGER.debug("Archiving events older than %d...", A_MONTH_AGO)
    for doc in self._ip_collection.find(request):
        archives_bulk = []
        for event in doc['events']:
            # The query returns every document having at least one
            # timestamp < A_MONTH_AGO; this condition skips the
            # sub-documents that are still recent.
            if event['timestamp'] < A_MONTH_AGO:
                archives_bulk.append({
                    'ip': doc['ip'],
                    'filename': event['filename'],
                    'source': event['source'],
                    'weight': event['weight'],
                    'timestamp': event['timestamp'],
                })

        result = self._archive_collection.insert(archives_bulk)
        total_count += len(result)

    self._ip_collection.update(
        request,
        {'$pull': {'events': {'timestamp': {'$lt': A_MONTH_AGO}}}},
        multi=True)
    LOGGER.info('%d documents archived.', total_count)

    # Remove documents whose events array is now empty
    result = self._ip_collection.remove(
        {'events.timestamp': {'$exists': False}}, multi=True)
    LOGGER.info('%d single entries have been removed.', result['n'])
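# A_MONTH_AGO is assumed to be a UNIX timestamp computed once at startup,
# along the lines of this sketch (the 30-day window is an assumption).
import time

A_MONTH_AGO = int(time.time()) - 30 * 24 * 3600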
def run(self):
    """
    Run the parser.
    """
    with mongo.Mongo() as database:
        current = self.next()
        while current:
            try:
                addr = self.get_ip(current)
                if not addr:
                    LOGGER.info('Entry skipped because no IP is specified.')
                    current = self.next()
                    continue

                if not utils.is_managed_ip(addr):
                    LOGGER.debug('Not a managed IP [%s].', addr)
                    current = self.next()
                    continue

                doc_ts = int(time.mktime(self.get_date(current).timetuple()))
                if doc_ts < YESTERDAY:
                    LOGGER.debug('This entry is too old [%s].',
                                 self.get_date(current))
                    current = self.next()
                    continue

                document = {
                    'ip': addr,
                    'timestamp': doc_ts,
                    'weight': self.compute_weight(current),
                    'source': self.get_source(current),
                    'raw': self.get_raw(current),
                }
                database.push_ip_document(document)
            except Exception as exc:
                LOGGER.error('Unexpected error: %s [%s]', type(exc), exc)
                LOGGER.error(traceback.format_exc())

            current = self.next()

    self.close()
def next(self):
    if self._current >= len(self._queue):
        self._feed_queue()

    res = self._queue[self._current]
    LOGGER.debug('Parsing mail...')
    try:
        self._parser = MailReaderFactory.get_reader_for_mail(res)
        self._current = self._current + 1
    except Exception as ex:
        LOGGER.error('Error while parsing mail #%s', self._uids[self._current])
        LOGGER.error('Unable to determine the source of this mail: %s', ex)
        LOGGER.error('Retrieved email:\n%s', res)
        LOGGER.debug('-- Recovery mode --')
        # Remember this uid so we never retry parsing this mail
        self._failed_uids.append(self._uids[self._current])
        # Drop the uid from the list so this email won't be deleted.
        self._uids.remove(self._uids[self._current])
        # Remove the mail from the queue
        self._queue.remove(res)
        LOGGER.debug('Ok. Now, try to fetch another mail...')
        # Try to fetch the next mail one more time...
        return self.next()

    return res
def email_user(self, subject, message, from_email=None):
    if from_email is None:
        from_email = settings.DEFAULT_FROM_EMAIL

    # EMAIL_OVERRIDE, when configured, redirects every outgoing mail to a
    # single address (useful outside production).
    override = getattr(settings, 'EMAIL_OVERRIDE', None)
    if override:
        to_email = override
    else:
        LOGGER.debug("no email override; sending email")
        to_email = self.email

    mail = send_mail(subject, message, from_email, [to_email])
    LOGGER.info("email sent: to: <%s>", self.email)
    LOGGER.info("email sent: from: <%s>", from_email)
    LOGGER.info("email sent: subject: %s", subject)
    LOGGER.info("email sent: message: %s", message)
    return mail
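# A minimal usage sketch for the override behaviour above. Assumes Django is
# configured and `user` is an instance of this model; the override address
# is hypothetical:
#     settings.EMAIL_OVERRIDE = 'qa-inbox@example.com'
# With it set, the mail below lands in the QA inbox while the logs still
# record the user's real address.
def notify_account_created(user):
    user.email_user('Account created', 'Your account is ready.')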
def __register_impl(self, base, class_obj):
    self._registered_implementations[base.__name__] = class_obj
    LOGGER.debug("Custom implementation [%s] registered.", class_obj)
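# A hedged companion sketch for the registry above: this lookup helper is
# hypothetical, but mirrors how a registered implementation would be
# resolved from the same dict.
def _resolve_impl(self, base):
    # Fall back to None when no custom implementation was registered.
    return self._registered_implementations.get(base.__name__)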
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    opts.n_gpu = n_gpu
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu,
                                              hvd.rank(), opts.fp16))
    if hvd.rank() != 0:
        LOGGER.disabled = True
    set_random_seed(opts.seed)

    # data loaders
    train_dataloaders = {}
    val_dataloaders = {}
    for target, t_r in zip(opts.targets, opts.targets_ratio):
        # choose which task and get the corresponding task dataloader
        train_loaders, val_loaders = build_target_loaders(target, t_r, opts)
        train_dataloaders.update(train_loaders)
        val_dataloaders.update(val_loaders)
    meta_loader = MetaLoader(train_dataloaders,
                             accum_steps=opts.gradient_accumulation_steps,
                             distributed=n_gpu > 1)
    meta_loader = PrefetchLoader(meta_loader)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    img_pos_embed_weight_key = ("v_encoder.f_encoder.img_embeddings"
                                ".position_embeddings.weight")
    if img_pos_embed_weight_key in checkpoint:
        max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])
    else:
        max_frm_seq_len = MAX_FRM_SEQ_LEN

    if opts.load_partial_pretrained:
        # from roberta
        model = HeroForPretraining(
            VideoModelConfig(opts.model_config),
            vfeat_dim=VFEAT_DIM,
            max_frm_seq_len=max_frm_seq_len,
            lw_neg_ctx=opts.lw_neg_ctx,
            lw_neg_q=opts.lw_neg_q,
            lw_st_ed=0,
            ranking_loss_type=opts.ranking_loss_type,
            use_hard_negative=False,
            hard_pool_size=opts.hard_pool_size,
            margin=opts.margin,
            use_all_neg=opts.use_all_neg,
            drop_svmr_prob=opts.drop_svmr_prob)
        model.load_partial_pretrained(
            checkpoint, VFEAT_DIM, max_frm_seq_len,
            skip_layers=opts.skip_layer_loading)
    else:
        # continue training
        model = HeroForPretraining.from_pretrained(
            opts.model_config,
            state_dict=checkpoint,
            vfeat_dim=VFEAT_DIM,
            max_frm_seq_len=max_frm_seq_len,
            lw_neg_ctx=opts.lw_neg_ctx,
            lw_neg_q=opts.lw_neg_q,
            lw_st_ed=0,
            ranking_loss_type=opts.ranking_loss_type,
            use_hard_negative=False,
            hard_pool_size=opts.hard_pool_size,
            margin=opts.margin,
            use_all_neg=opts.use_all_neg,
            drop_svmr_prob=opts.drop_svmr_prob)
    model.to(device)

    # make sure every process has the same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())}
    model, optimizer = amp.initialize(model, optimizer,
                                      num_losses=len(task2scaler),
                                      enabled=opts.fp16, opt_level='O2')
    restorer = TrainingRestorer(opts, model, optimizer)
    all_gather_list(None)  # sync to prevent slower ranks from reading training meta
    global_step = restorer.global_step
    TB_LOGGER.global_step = global_step
    if hvd.rank() == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        pbar = NoOp()
        model_saver = NoOp()
        restorer = NoOp()

    if global_step > 0:
        pbar.update(global_step)
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    task2loss = {task: RunningMeter(f'loss/{task}')
                 for task in train_dataloaders.keys()}
    for task in train_dataloaders.keys():
        if task.startswith('vsm'):
            for obj in ('st_ed', 'neg_ctx', 'neg_q'):
                task2loss[f"{task}_{obj}"] = RunningMeter(f'loss/{task}_{obj}')

    model.train()
    n_examples = defaultdict(int)
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    if global_step == 0:
        optimizer.step()
    assert all(global_step == s for s in all_gather_list(global_step))

    for step, (task, batch) in enumerate(meta_loader):
        LOGGER.debug(f"Task: {task}")
        # hard negative in VSM
        if len(opts.hard_negtiave_start_step) > 0:
            for i, hn_step in enumerate(opts.hard_negtiave_start_step):
                if global_step >= hn_step and hn_step != -1:
                    model.set_hard_negative(True, opts.hard_pool_size[i],
                                            opts.hard_neg_weights[i])
        # start-end loss
        if opts.train_span_start_step != -1 and\
                global_step >= opts.train_span_start_step:
            model.set_train_st_ed(opts.lw_st_ed)

        train_task = task.split('_')[0]
        n_examples[task] += opts.train_batch_size
        loss = model(batch, task=train_task, compute_loss=True)

        if train_task == 'vsm':
            loss_st_ed, loss_neg_ctx, loss_neg_q = loss
            loss = loss_st_ed + loss_neg_ctx + loss_neg_q
            for n, ls, w in (('st_ed', loss_st_ed, opts.lw_st_ed),
                             ('neg_ctx', loss_neg_ctx, opts.lw_neg_ctx),
                             ('neg_q', loss_neg_q, opts.lw_neg_q)):
                ls = ls.item()
                if w:
                    ls /= w
                task2loss[f'{task}_{n}'](ls)
        elif train_task == "mffr":
            loss = torch.sqrt(loss.sum(dim=1))

        loss = loss.mean()
        task2loss[task](loss.item())

        delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
        with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale,
                            loss_id=task2scaler[task]) as scaled_loss:
            scaled_loss.backward()
            if not delay_unscale:
                # gather gradients from every process
                # do this before unscaling to make sure every process uses
                # the same gradient scale
                grads = [p.grad.data for p in model.parameters()
                         if p.requires_grad and p.grad is not None]
                LOGGER.debug("before reduce grad")
                all_reduce_and_rescale_tensors(grads, float(1))
                LOGGER.debug("after reduce grad")

        if (step + 1) % opts.gradient_accumulation_steps == 0:
            global_step += 1

            # learning rate scheduling
            lr_this_step = get_lr_sched(global_step, opts)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

            # log loss
            # NOTE: only consider rank 0 for speed
            TB_LOGGER.log_scaler_dict({ll.name: ll.val
                                       for ll in task2loss.values()
                                       if ll.val is not None})
            TB_LOGGER.step()

            LOGGER.debug("before norm grad")
            # update model params
            if opts.grad_norm != -1:
                grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                            opts.grad_norm)
                TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
            LOGGER.debug("after norm grad")
            LOGGER.debug("before optim step")
            optimizer.step()
            optimizer.zero_grad()
            pbar.update(1)
            LOGGER.debug("after optim step")

            if global_step % 100 == 0:
                # monitor training throughput
                LOGGER.info('-------------------------------------------')
                LOGGER.info(f'Step {global_step}:')
                for t in train_dataloaders.keys():
                    tot_ex = sum(all_gather_list(n_examples[t]))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f'{t}: {tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s', ex_per_sec,
                                         global_step)
                LOGGER.debug("after gather stats")

            if global_step % opts.valid_steps == 0:
                LOGGER.info('===========================================')
                LOGGER.info(f"Step {global_step}: start running validation")
                validate(model, val_dataloaders, opts)
                LOGGER.info('===========================================')
                model_saver.save(model, global_step)

            # step restorer in the end to prevent missing validation checkpoint
            restorer.step()
        if global_step >= opts.num_train_steps:
            break
    LOGGER.info('===========================================')
    if global_step % opts.valid_steps != 0:
        LOGGER.info('===========================================')
        LOGGER.info(f"Step {global_step}: start running validation")
        validate(model, val_dataloaders, opts)
        LOGGER.info('===========================================')
        model_saver.save(model, global_step)
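# A self-contained sketch of the gradient-accumulation pattern driving the
# "global step" logic above, stripped of amp/Horovod specifics; the model,
# batches and accumulation factor are toy stand-ins. (The loop above relies
# on its loss scaler rather than dividing the loss explicitly.)
import torch

def train_accumulated(model, batches, accum_steps=4, lr=1e-3):
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    optimizer.zero_grad()
    for step, (x, y) in enumerate(batches):
        loss = torch.nn.functional.mse_loss(model(x), y)
        # Average over the accumulation window so the summed gradients
        # match those of one large batch.
        (loss / accum_steps).backward()
        if (step + 1) % accum_steps == 0:
            optimizer.step()  # one global step per accum_steps batches
            optimizer.zero_grad()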