Example #1
    def test(self, testset):
        ranking = {}
        y_real, y_pred = [], []
        for i, qid in enumerate(testset):
            ranking[qid] = []
            percentage = round(float(i + 1) / len(testset), 2)
            print('Progress: ', percentage, sep='\t', end='\r')

            # encode the original question as a binarized tree and build its vector
            q1 = testset[qid]
            q1 = utils.binarize(utils.parse_tree(q1['tree']))
            q1_vec, _ = self.expr_for_tree(q1['root'], q1)

            duplicates = testset[qid]['duplicates']
            for duplicate in duplicates:
                rel_question = duplicate['rel_question']
                rel_question_id = rel_question['id']
                q2 = rel_question['tree']
                q2 = utils.binarize(utils.parse_tree(q2))
                q2_vec, _ = self.expr_for_tree(q2['root'], q2)

                # score the pair: softmax over the concatenated tree encodings
                x = dy.concatenate([q1_vec, q2_vec])
                probs = dy.softmax(self.W * x + self.bW).vec_value()
                score = probs.index(max(probs))  # argmax = predicted relevance label
                y_pred.append(score)

                # gold relevance label: 1 for anything but 'Irrelevant'
                if rel_question['relevance'] != 'Irrelevant':
                    y_real.append(1)
                else:
                    y_real.append(0)
                ranking[qid].append((score, score, rel_question_id))
            dy.renew_cg()

        gold = utils.prepare_gold(GOLD_PATH)
        map_baseline, map_model = utils.evaluate(gold, ranking)
        f1score = f1_score(y_real, y_pred)
        return map_baseline, map_model, f1score
Example #2
def handle_client(conn, addr):
    print(f'[NEW CONNECTION] {addr} connected.')

    connected = True
    while connected:
        text = []
        msg_length = conn.recv(HEADER).decode(FORMAT)
        if msg_length:
            msg_length = int(msg_length)
            print(f"msgl = {msg_length}")
            msg = conn.recv(msg_length).decode(FORMAT)

            if msg == DISCONNECT_MSG:
                connected = False
                break
            print(f"[{addr}] : {msg}")

            text.append(str(msg))
            # word segmentation -> POS tagging -> parsing -> keyword extraction
            ws = ws_driver(text)
            pos = pos_driver(ws)
            parse_input = prepro(ws, pos)
            ParsTree = parse_tree(ps.apply_list(parse_input)[0])

            t = pt.list_of_leaves(root)
            k = pt.getkeywords(t)
            keywords = ""
            for word in k:
                keywords += " " + word

            # ParsTree = CoreN.get_parse_tree(t2s(msg))
            # NodeTree = make_tree(ParsTree)
            # Leaves = list_of_leaves(NodeTree)
            # keywords = inorder(Leaves)
            # keywords = s2t(keywords)

            if keywords == "":
                keywords = "None"
                print(keywords)

            # reply inside the if-block so `keywords` is always defined when it is sent
            conn.send(keywords.encode(FORMAT))
    conn.close()
Example #3
    def scrap_match_ids(self, league_id):
        page_num = 1
        match_ids = []
        while True:
            league_data_url = f'/ajax-sport-country-tournament-archive/1/{league_id}/X0/1/0/{page_num}/'
            json_value = self.fetch_url(league_data_url)
            json_value = json_value['d']['html']
            tree = parse_tree(json_value)
            tbody_node = tree.find('html').find('body').find('table').find(
                'tbody')
            if tbody_node is None:
                break
            trs = tbody_node.find_all('tr')
            new_match_ids = [tr['xeid'] for tr in trs if tr.has_attr('xeid')]
            if len(new_match_ids) == 0:
                break
            match_ids.extend(new_match_ids)
            page_num += 1
        match_ids.reverse()
        return match_ids
Example #4
async def alert_init(client):
    plugins = [(handler.user_callback
                if hasattr(handler, 'user_callback') else handler.callback)
               for group in client.dispatcher.groups.values()
               for handler in group]

    plugins_count = len(plugins)
    plugins_names = []
    for plugin_callback in plugins:
        members = {
            key: value
            for key, value in inspect.getmembers(plugin_callback)
        }
        full_name = f"{members['__globals__']['__name__']}.{members['__name__']}"
        plugins_names.append(full_name)
    plugins_text = utils.tree(utils.parse_tree(plugins_names))
    started_text = config.langs.start_log(plugins_count=plugins_count,
                                          plugins_names=plugins_names,
                                          plugins_text=plugins_text,
                                          client=client)

    await client.send_message(logs_chat, started_text)
Example #5
  def update_state_hierarchical(self, groundtruth_data, detections, hierarchical_scores,
                                hierarchical_classes, tree_filename):
    """Update detection results and groundtruth data.

    Similar to update_state(), difference in this function is that it walks up the hierarchy and
    resets the predicted class, in order to match with the groundtruth.
    For example, given gt as 'Person', if the nearest bbox prediction is 'Girl',
    we walk up the tree to find if 'Person' exists in parents. If it exists, set the nearest
    predicted bbox class as 'Person', as for the score, sum up corresponding leaf predicted score
    of that bbox. In this case, Person has leaves as ['Woman', 'Man', 'Boy', 'Girl']. We sum up
    scores of the four classes, take min(new_score, 1.0), set the bbox score as that value.
    Args:
      groundtruth_data: Groundtruth annotations in a tensor with each row
        representing [y1, x1, y2, x2, class].
      detections: Detection results in a tensor with each row representing
        [image_id, x, y, width, height, score, class].
      hierarchical_scores: [batch_size, num_boxes, num_classes], note:num_boxes is remaining bboxes after nms
      hierarchical_classes: [batch_size, num_boxes, num_classes].
      tree_filename: string file name.
    """
    tree_leaf2root, sumrule = utils.parse_tree(tree_filename)

    batch_parent_scores = []
    batch_parent_classes = []

    new_bbox_scores = hierarchical_scores[:, :, 0]
    new_bbox_classes = hierarchical_classes[:, :, 0]

    for i, det in enumerate(detections):
      # Filter out detections with the top1 predicted class label = -1.
      indices = np.where(hierarchical_classes[i, :, 0] > -1)[0]
      hierarchical_classes[i][indices].astype(int)
      det = det[indices]
      if det.shape[0] == 0:
        continue
      # Append groundtruth annotations to create COCO dataset object.
      # Add images.
      image_id = det[0, 0]
      if image_id == -1:
        image_id = self.image_id
      det[:, 0] = image_id

      max_levels = 3
      per_bbox_parent_scores = []  # for each leaf score, up to max_levels parent scores
      per_bbox_parent_classes = []

      # find parent scores and classes for each bbox
      for _scores, _classes in zip(hierarchical_scores[i], hierarchical_classes[i]):
        per_class_parent_scores = []
        per_class_parent_classes = []
        for _s, _c in zip(_scores, _classes):
          parent_scores = [-1] * max_levels
          parent_classes = [-1] * max_levels
          parents = self.get_leaf_to_parent_path(tree_leaf2root, _c)[:max_levels]
          np_classes = np.asarray(_classes)
          np_scores = np.asarray(_scores)
          for _ii, _p in enumerate(parents):
            np_leaves = np.asarray(sumrule[_p])
            overlap_leaves, indices1, indices2 = np.intersect1d(
                np_leaves, np_classes, return_indices=True)
            _p_score = min(np_scores[indices2].sum(), 1.0)
            parent_scores[_ii] = _p_score
            parent_classes[_ii] = _p
          per_class_parent_scores.append(parent_scores)
          per_class_parent_classes.append(parent_classes)

        per_bbox_parent_scores.append(per_class_parent_scores)
        per_bbox_parent_classes.append(per_class_parent_classes)

      batch_parent_scores.append(per_bbox_parent_scores)
      batch_parent_classes.append(per_bbox_parent_classes)

      if not self.filename and not self.testdev_dir:
        # process groundtruth data only if filename is empty and there is no test_dev.
        self.dataset['images'].append({
            'id': int(image_id),
        })

        # Add annotations.
        indices = np.where(groundtruth_data[i, :, -1] > -1)[0]
        for data in groundtruth_data[i, indices]:
          box = data[0:4]
          category_id = data[4]
          area = (box[3] - box[1]) * (box[2] - box[0])
          if category_id < 0:
            break
          # find the predicted bbox that has the largest IoU with the gt and reset its prediction
          _iou, _iou_max, _idx = self.get_max_iou(detections[i][:, 1:5], box)
          matched_bbox_parent_classes = per_bbox_parent_classes[_idx]  #shape: (k, max_levels)
          matched_bbox_parent_classes = np.asarray(matched_bbox_parent_classes)

          matched_bbox_parent_scores = per_bbox_parent_scores[_idx]
          matched_bbox_parent_scores = np.asarray(matched_bbox_parent_scores)

          parent_match_bbox_idx, parent_match_class_idx = np.where(
              matched_bbox_parent_classes == category_id)
          # get the top1 score for a matching parent class
          if len(parent_match_bbox_idx) > 0 and len(parent_match_class_idx) > 0:
            parent_match_bbox_idx = min(parent_match_bbox_idx)
            parent_match_class_idx = min(parent_match_class_idx)
            new_bbox_classes[i][_idx] = matched_bbox_parent_classes[parent_match_bbox_idx][
                parent_match_class_idx]
            new_bbox_scores[i][_idx] = matched_bbox_parent_scores[parent_match_bbox_idx][
                parent_match_class_idx]

          self.dataset['annotations'].append({
              'id': int(self.annotation_id),
              'image_id': int(image_id),
              'iscrowd': False,
              'category_id': int(category_id),
              'bbox': [box[1], box[0], box[3] - box[1], box[2] - box[0]],
              'area': area,
          })
          self.annotation_id += 1
          self.category_ids.append(category_id)

      det[:, 5] = new_bbox_scores[i]
      det[:, 6] = new_bbox_classes[i]

      self.detections.extend(det)
      self.image_id += 1

    if not self.filename:
      self.category_ids = list(set(self.category_ids))
      self.dataset['categories'] = [{'id': int(category_id)} for category_id in self.category_ids]
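A minimal sketch of the leaf-to-parent score aggregation described in the docstring of update_state_hierarchical above, assuming (as in that code) that sumrule maps a parent class id to the list of its leaf class ids; the function name and the numbers in the comment are illustrative only:

import numpy as np

def parent_score(parent_id, leaf_classes, leaf_scores, sumrule):
    """Sum the scores of the leaves that belong to `parent_id`, capped at 1.0."""
    leaves = np.asarray(sumrule[parent_id])
    _, _, idx = np.intersect1d(leaves, np.asarray(leaf_classes),
                               return_indices=True)
    return min(np.asarray(leaf_scores)[idx].sum(), 1.0)

# e.g. if 'Person' has leaves ['Woman', 'Man', 'Boy', 'Girl'] with predicted
# scores [0.4, 0.1, 0.2, 0.5], the 'Person' score is min(1.2, 1.0) = 1.0.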
Example #6
    def train(self):
        dy.renew_cg()
        trainer = dy.AdamTrainer(self.model)

        early = 0.0
        best = -1
        for epoch in range(self.EPOCH):
            print('\n')
            dy.renew_cg()
            losses = []
            closs = 0
            batch_timing = []
            for i, trainrow in enumerate(self.traindata):
                start = time.time()
                q1 = utils.binarize(utils.parse_tree(trainrow['q1_tree']))
                q2 = utils.binarize(utils.parse_tree(trainrow['q2_tree']))
                label = trainrow['label']

                loss = self.get_loss(q1, q2, label)
                losses.append(loss)

                if len(losses) == self.BATCH:
                    loss = dy.esum(losses)
                    # loss += self.regularization_loss()
                    _loss = loss.value()
                    closs += _loss
                    loss.backward()
                    trainer.update()
                    dy.renew_cg()

                    # percentage of trainset processed
                    percentage = str(
                        round((float(i + 1) / len(self.traindata)) * 100,
                              2)) + '%'
                    # average processing time per example in the current batch
                    time_epoch = round(
                        sum(batch_timing) / float(len(batch_timing)), 2)

                    print(
                        "Epoch: {0} \t\t Loss: {1} \t\t Epoch time: {2} \t\t Trainset: {3}"
                        .format(epoch + 1, round(_loss, 2), time_epoch,
                                percentage),
                        end='       \r')
                    losses = []
                    batch_timing = []
                end = time.time()
                t = (end - start)
                batch_timing.append(t)

            log = "Epoch: {0} \t\t Loss: {1} \t\t Best: {2}".format(
                epoch + 1, round(closs / self.BATCH, 2), round(best, 2))
            print('\n' + log)

            log = 'Dev evaluation...'
            print(log)
            map_baseline, map_model, f1score = self.test(self.devset)

            print('MAP Model: ',
                  round(map_model, 2),
                  'MAP baseline: ',
                  round(map_baseline, 2),
                  'F1 score: ',
                  str(round(f1score, 2)),
                  sep='\t',
                  end='\n')

            trainer.learning_rate *= 0.95
            if map_model > best:
                best = copy.copy(map_model)
                early = 0
                # path = self.fname() + '.dy'
                # self.model.save(os.path.join(EVALUATION_PATH, path))
            else:
                early += 1

            if early == self.EARLY_STOP:
                break
Example #7
import socket
import re
import threading
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger
from utils_parser.parser_util import get_parser_tree_cls, NodeType
import ckip_classic.client
from utils import prepro, parse_tree

msg = '如果是在夜間或其他時後遭到家庭暴力'  # "if domestic violence happens at night or at some other time"
text = []
# Initialize drivers with custom checkpoints
ws_driver = CkipWordSegmenter(level=3, device=0)
pos_driver = CkipPosTagger(level=3, device=0)
ps = ckip_classic.client.CkipParserClient(username='******',
                                          password='******')
pt = get_parser_tree_cls(NodeType.Origin_Ckip)

text.append(str(msg))
ws = ws_driver(text)
pos = pos_driver(ws)
parse_input = prepro(ws, pos)
print(parse_input)
ParsTree = parse_tree(ps.apply_list(parse_input)[0])
print("pas", ParsTree)
root = pt.make_tree(ParsTree)
t = pt.list_of_leaves(root)
k = pt.get_keywords(t)
keywords = ""
for word in k:
    keywords += " " + word
print(keywords)
Example #8
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      get_input_fn function in data/dataloader.py
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary defines hyperparameters of model. The default
      settings are in default_hparams function in this file.
    model: the model outputs class logits and box regression outputs.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.

  Raises:
    RuntimeError: if both ckpt and backbone_ckpt are set.
  """
    utils.image('input_image', features)
    training_hooks = []

    def _model_outputs(inputs):
        # Convert params (dict) to Config for easier access.
        return model(inputs, config=hparams_config.Config(params))

    precision = utils.get_precision(params['strategy'],
                                    params['mixed_precision'])
    cls_outputs, box_outputs = utils.build_model_with_precision(
        precision, _model_outputs, features, params['is_training_bn'])

    levels = cls_outputs.keys()
    for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'image': features,
        }
        for level in levels:
            predictions['cls_outputs_%d' % level] = cls_outputs[level]
            predictions['box_outputs_%d' % level] = box_outputs[level]
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    sumrule = None
    if params.get('tree'):
        _, sumrule = utils.parse_tree(params['tree'])
        params['sumrule'] = sumrule

    # cls_loss and box_loss are for logging. only total_loss is optimized.
    det_loss, cls_loss, box_loss, box_iou_loss = detection_loss(
        cls_outputs, box_outputs, labels, params)
    reg_l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + reg_l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate)
        utils.scalar('trainloss/cls_loss', cls_loss)
        utils.scalar('trainloss/box_loss', box_loss)
        utils.scalar('trainloss/det_loss', det_loss)
        utils.scalar('trainloss/reg_l2_loss', reg_l2loss)
        utils.scalar('trainloss/loss', total_loss)
        if params['iou_loss_type']:
            utils.scalar('trainloss/box_iou_loss', box_iou_loss)
        train_epochs = tf.cast(global_step,
                               tf.float32) / params['steps_per_epoch']
        utils.scalar('train_epochs', train_epochs)

    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()
    if params['strategy'] == 'horovod':
        import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
        learning_rate = learning_rate * hvd.size()
    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['optimizer'].lower() == 'sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'].lower() == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            raise ValueError('optimizers should be adam or sgd')

        if params['strategy'] == 'tpu':
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)
        elif params['strategy'] == 'horovod':
            optimizer = hvd.DistributedOptimizer(optimizer)
            training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list)

        if params.get('clip_gradients_norm', 0) > 0:
            logging.info('clip gradients norm by %f',
                         params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                clipped_grads, gnorm = tf.clip_by_global_norm(
                    grads, params['clip_gradients_norm'])
                utils.scalar('gnorm', gnorm)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        if moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            if params['nms_configs'].get('pyfunc', False):
                detections_bs = []
                for index in range(kwargs['boxes'].shape[0]):
                    nms_configs = params['nms_configs']
                    detections = tf.numpy_function(
                        functools.partial(nms_np.per_class_nms,
                                          nms_configs=nms_configs),
                        [
                            kwargs['boxes'][index],
                            kwargs['scores'][index],
                            kwargs['classes'][index],
                            tf.slice(kwargs['image_ids'], [index], [1]),
                            tf.slice(kwargs['image_scales'], [index], [1]),
                            params['num_classes'],
                            nms_configs['max_output_size'],
                        ], tf.float32)
                    detections_bs.append(detections)
            else:
                # These two branches should be equivalent, but currently they are not.
                # TODO(tanmingxing): enable the non_pyfun path after bug fix.
                if params.get('tree'):
                    nms_boxes, nms_scores, nms_classes, _ = postprocess.hierarchical_nms(
                        params,
                        kwargs['boxes'],
                        kwargs['scores'],
                        kwargs['image_scales'],
                        k=10)
                    img_ids = tf.cast(tf.expand_dims(kwargs['image_ids'], -1),
                                      nms_scores.dtype)
                    detections_bs = [
                        img_ids *
                        tf.ones_like(nms_boxes[:, :, 1], dtype=tf.float32),
                        nms_boxes[:, :, 1],
                        nms_boxes[:, :, 0],
                        nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
                        nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
                        nms_scores[:, :, 0],
                        nms_classes[:, :, 0],
                    ]
                else:
                    nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
                        params, kwargs['boxes'], kwargs['scores'],
                        kwargs['classes'], kwargs['image_scales'])
                    img_ids = tf.cast(tf.expand_dims(kwargs['image_ids'], -1),
                                      nms_scores.dtype)
                    detections_bs = [
                        img_ids * tf.ones_like(nms_scores),
                        nms_boxes[:, :, 1],
                        nms_boxes[:, :, 0],
                        nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
                        nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
                        nms_scores,
                        nms_classes,
                    ]
                detections_bs = tf.stack(detections_bs,
                                         axis=-1,
                                         name='detections')

            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                eval_metric = coco_metric.EvaluationMetric(
                    testdev_dir=params['testdev_dir'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, tf.zeros([1]))
            else:
                logging.info('Eval val with groundtruths %s.',
                             params['val_json_file'])
                eval_metric = coco_metric.EvaluationMetric(
                    filename=params['val_json_file'])

                if params.get('tree', None):
                    #TODO: Add in hierarchy file here?
                    coco_metrics = eval_metric.estimator_metric_fn(
                        detections_bs, kwargs['groundtruth_data'], nms_scores,
                        nms_classes, params['tree'])
                else:
                    coco_metrics = eval_metric.estimator_metric_fn(
                        detections_bs, kwargs['groundtruth_data'])

            # Add metrics to output.
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])

        cls_outputs = postprocess.to_list(cls_outputs)
        box_outputs = postprocess.to_list(box_outputs)
        params['nms_configs']['max_nms_inputs'] = anchors.MAX_DETECTION_POINTS

        if params.get('tree'):
            boxes, scores, classes = postprocess.pre_nms(params,
                                                         cls_outputs,
                                                         box_outputs,
                                                         topk=False)
        else:
            boxes, scores, classes = postprocess.pre_nms(params,
                                                         cls_outputs,
                                                         box_outputs,
                                                         topk=True)

        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'image_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
            'boxes': boxes,
            'scores': scores,
            'classes': classes,
        }
        eval_metrics = (metric_fn, metric_fn_inputs)

    checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

    if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
        # Initialize the model from an EfficientDet or backbone checkpoint.
        if params.get('ckpt') and params.get('backbone_ckpt'):
            raise RuntimeError(
                '--backbone_ckpt and --checkpoint are mutually exclusive')

        if params.get('backbone_ckpt'):
            var_scope = params['backbone_name'] + '/'
            if params['ckpt_var_scope'] is None:
                # Use backbone name as default checkpoint scope.
                ckpt_scope = params['backbone_name'] + '/'
            else:
                ckpt_scope = params['ckpt_var_scope'] + '/'
        else:
            # Load every var in the given checkpoint
            var_scope = ckpt_scope = '/'

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            logging.info('restore variables from %s', checkpoint)

            var_map = utils.get_ckpt_var_map(
                ckpt_path=checkpoint,
                ckpt_scope=ckpt_scope,
                var_scope=var_scope,
                skip_mismatch=params['skip_mismatch'])

            tf.train.init_from_checkpoint(checkpoint, var_map)

            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            logging.info('Load EMA vars with ema_decay=%f',
                         moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        scaffold_fn = None

    if params['strategy'] != 'tpu':
        # Profile every 1K steps.
        profile_hook = tf.train.ProfilerHook(save_steps=1000,
                                             output_dir=params['model_dir'])
        training_hooks.append(profile_hook)

        # Report memory allocation if OOM
        class OomReportingHook(tf.estimator.SessionRunHook):
            def before_run(self, run_context):
                return tf.estimator.SessionRunArgs(
                    fetches=[],
                    options=tf.RunOptions(
                        report_tensor_allocations_upon_oom=True))

        training_hooks.append(OomReportingHook())

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=total_loss,
                                             train_op=train_op,
                                             eval_metrics=eval_metrics,
                                             host_call=utils.get_tpu_host_call(
                                                 global_step, params),
                                             scaffold_fn=scaffold_fn,
                                             training_hooks=training_hooks)
Example #9
    def scrap(self, country, league, matchID):
        match_url = f'https://www.oddsportal.com/soccer/{country}/{league}/{matchID}/'
        content = fetch_url(match_url)

        tree = parse_tree(content)
        xhash_text = str(tree.find('html').find('body').find('script'))
        xhash_label = '"xhash":"'
        # check the raw find() results so a missing label (-1) is not masked by the offset
        label_pos = xhash_text.find(xhash_label)
        i2 = xhash_text.find('","xhashf"')
        if label_pos < 0 or i2 < label_pos:
            return None
        i1 = label_pos + len(xhash_label)

        xhash = unquote(xhash_text[i1:i2])
        main_node = tree.find('html').find('body').find('div').find(
            'div', {
                'id': 'mother-main'
            }).find('div', {
                'id': 'mother'
            }).find('div', {
                'id': 'wrap'
            }).find('div').find('div').find('div', {
                'id': 'main'
            }).find('div', {'id': 'col-content'})

        time = int(main_node.find('p')['class'][2][1:11])
        time_dt = datetime.utcfromtimestamp(time)
        if self.__do_time_filter(time_dt):
            return None

        teams_text = main_node.find('h1').text
        teams = [team.strip() for team in teams_text.split('-')]
        team_home, team_away = teams[0], teams[1]

        score_node = main_node.find('div', {
            'id': 'event-status'
        }).find('strong')
        was_extra = False
        if score_node is None:
            score_home, score_away = None, None
        else:
            score = score_node.text
            score, was_pen = self.correct_score(score, 'penalties')
            score, was_et = self.correct_score(score, 'ET')
            score, was_ot = self.correct_score(score, 'OT')
            score_home, score_away = self.parse_score(score)
            was_extra = was_pen or was_et or was_ot

        periods_node = main_node.find('div', {
            'id': 'event-status'
        }).find('p', {'class': 'result'})
        period1_home, period1_away, period2_home, period2_away = None, None, None, None
        if periods_node is not None:
            periods_text = periods_node.text
            i1, i2 = periods_text.find('(') + 1, periods_text.find(')')
            periods_text = periods_text[i1:i2]
            periods = [period.strip() for period in periods_text.split(',')]
            if len(periods) >= 2:
                (period1_home,
                 period1_away), (period2_home,
                                 period2_away) = self.parse_score(
                                     periods[0]), self.parse_score(periods[1])

        match_info = {
            'match_id': matchID,
            'time': time,
            'team_home': team_home,
            'team_away': team_away,
            'score_home': score_home,
            'score_away': score_away,
            'was_extra': was_extra,
            'score_home_period1': period1_home,
            'score_away_period1': period1_away,
            'score_home_period2': period2_home,
            'score_away_period2': period2_away
        }
        self.__scrap_odds(match_info, matchID, xhash)
        return match_info
Example #10
    def validate(self):
        logging.info('Validating tree svm.', extra=d)
        treekernel = features.TreeKernel()
        ranking = {}
        y_real, y_pred = [], []
        for i, q1id in enumerate(self.devset):
            ranking[q1id] = []
            percentage = round(float(i + 1) / len(self.devset), 2)

            query = self.devset[q1id]
            q1_token2lemma = dict(zip(query['tokens'], query['lemmas']))
            q1_tree = utils.binarize(
                utils.parse_tree(query['tree'], q1_token2lemma))

            q1_w2v = features.encode(query['tokens'], self.word2vec)
            q1_elmo = self.fulldevelmo.get(str(self.fulldevidx[q1id]))
            q1_emb = [
                np.concatenate([q1_w2v[i], q1_elmo[i]])
                for i in range(len(q1_w2v))
            ]

            duplicates = query['duplicates']
            for duplicate in duplicates:
                rel_question = duplicate['rel_question']
                q2id = rel_question['id']
                # tree kernel
                q2_token2lemma = dict(
                    zip(rel_question['tokens'], rel_question['lemmas']))
                q2_tree = utils.binarize(
                    utils.parse_tree(rel_question['tree'], q2_token2lemma))

                # word2vec vectors
                q2_w2v = features.encode(rel_question['tokens'], self.word2vec)
                q2_elmo = self.fulldevelmo.get(str(self.fulldevidx[q2id]))
                q2_emb = [
                    np.concatenate([q2_w2v[i], q2_elmo[i]])
                    for i in range(len(q2_w2v))
                ]

                q1_tree, q2_tree = treekernel.similar_terminals(
                    q1_tree, q2_tree)

                X = []
                for j, trainrow in enumerate(self.traindata):
                    c1id, c2id = trainrow['q1_id'], trainrow['q2_id']
                    c1_token2lemma = dict(
                        zip(trainrow['q1_full'], trainrow['q1_lemmas']))
                    c2_token2lemma = dict(
                        zip(trainrow['q2_full'], trainrow['q2_lemmas']))
                    c1_tree = utils.binarize(
                        utils.parse_tree(trainrow['q1_tree'], c1_token2lemma))
                    c2_tree = utils.binarize(
                        utils.parse_tree(trainrow['q2_tree'], c2_token2lemma))

                    # word2vec vectors
                    c1_w2v = features.encode(trainrow['q1_full'],
                                             self.word2vec)
                    c1_elmo = self.fulltrainelmo.get(
                        str(self.fulltrainidx[c1id]))
                    c1_emb = [
                        np.concatenate([c1_w2v[i], c1_elmo[i]])
                        for i in range(len(c1_w2v))
                    ]

                    c2_w2v = features.encode(trainrow['q2_full'],
                                             self.word2vec)
                    c2_elmo = self.fulltrainelmo.get(
                        str(self.fulltrainidx[c2id]))
                    c2_emb = [
                        np.concatenate([c2_w2v[i], c2_elmo[i]])
                        for i in range(len(c2_w2v))
                    ]

                    c1_tree, c2_tree = treekernel.similar_terminals(
                        c1_tree, c2_tree)

                    kq1 = self.memoize(q1id, q1_tree, q1_emb, q1id, q1_tree,
                                       q1_emb, treekernel)
                    kc1 = self.memoize(c1id, c1_tree, c1_emb, c1id, c1_tree,
                                       c1_emb, treekernel)
                    kq1c1 = float(
                        self.memoize(q1id, q1_tree, q1_emb, c1id,
                                     c1_tree, c1_emb, treekernel)) / np.sqrt(
                                         kq1 * kc1)  # normalized

                    kq2 = self.memoize(q2id, q2_tree, q2_emb, q2id, q2_tree,
                                       q2_emb, treekernel)
                    kc2 = self.memoize(c2id, c2_tree, c2_emb, c2id, c2_tree,
                                       c2_emb, treekernel)
                    kq2c2 = float(
                        self.memoize(q2id, q2_tree, q2_emb, c2id,
                                     c2_tree, c2_emb, treekernel)) / np.sqrt(
                                         kq2 * kc2)  # normalized

                    # kq1c2 = float(self.memoize(q1id, q1_tree, q1_emb, c2id, c2_tree, c2_emb, treekernel)) / np.sqrt(kq1 * kc2) # normalized
                    # kq2c1 = float(self.memoize(q2id, q2_tree, q2_emb, c1id, c1_tree, c1_emb, treekernel)) / np.sqrt(kq2 * kc1) # normalized

                    k = kq1c1 + kq2c2
                    X.append(k)
                print('Progress: ', percentage, i + 1, sep='\t', end='\r')

                score = self.model.decision_function([X])[0]
                pred_label = self.model.predict([X])[0]
                y_pred.append(pred_label)

                real_label = 0
                if rel_question['relevance'] != 'Irrelevant':
                    real_label = 1
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        with open('data/treeranking.txt', 'w') as f:
            for qid in ranking:
                for row in ranking[qid]:
                    label = 'false'
                    if row[0] == 1:
                        label = 'true'
                    f.write('\t'.join([
                        str(qid),
                        str(row[2]),
                        str(0),
                        str(row[1]), label, '\n'
                    ]))

        logging.info('Finishing to validate tree svm.', extra=d)
        return ranking, y_real, y_pred
Example #11
    def train(self):
        logging.info('Training tree svm.', extra=d)
        treekernel = features.TreeKernel()

        if not os.path.exists(KERNEL_PATH):
            X, y = [], []
            for i, q in enumerate(self.traindata):
                percentage = round(float(i + 1) / len(self.traindata), 2)
                x = []
                q1id, q2id = q['q1_id'], q['q2_id']
                # trees
                q1_token2lemma = dict(zip(q['q1_full'], q['q1_lemmas']))
                q2_token2lemma = dict(zip(q['q2_full'], q['q2_lemmas']))
                q1 = utils.binarize(
                    utils.parse_tree(q['q1_tree'], q1_token2lemma))
                q2 = utils.binarize(
                    utils.parse_tree(q['q2_tree'], q2_token2lemma))

                # word2vec and elmo vectors
                q1_w2v = features.encode(q['q1_full'], self.word2vec)
                q1_elmo = self.fulltrainelmo.get(str(self.fulltrainidx[q1id]))
                q1_emb = [
                    np.concatenate([q1_w2v[i], q1_elmo[i]])
                    for i in range(len(q1_w2v))
                ]

                q2_w2v = features.encode(q['q2_full'], self.word2vec)
                q2_elmo = self.fulltrainelmo.get(str(self.fulltrainidx[q2id]))
                q2_emb = [
                    np.concatenate([q2_w2v[i], q2_elmo[i]])
                    for i in range(len(q2_w2v))
                ]

                q1, q2 = treekernel.similar_terminals(q1, q2)
                for j, c in enumerate(self.traindata):
                    c1id, c2id = c['q1_id'], c['q2_id']
                    # trees
                    c1_token2lemma = dict(zip(c['q1_full'], c['q1_lemmas']))
                    c2_token2lemma = dict(zip(c['q2_full'], c['q2_lemmas']))
                    c1 = utils.binarize(
                        utils.parse_tree(c['q1_tree'], c1_token2lemma))
                    c2 = utils.binarize(
                        utils.parse_tree(c['q2_tree'], c2_token2lemma))
                    # word2vec vectors
                    c1_w2v = features.encode(c['q1_full'], self.word2vec)
                    c1_elmo = self.fulltrainelmo.get(
                        str(self.fulltrainidx[c1id]))
                    c1_emb = [
                        np.concatenate([c1_w2v[i], c1_elmo[i]])
                        for i in range(len(c1_w2v))
                    ]

                    c2_w2v = features.encode(c['q2_full'], self.word2vec)
                    c2_elmo = self.fulltrainelmo.get(
                        str(self.fulltrainidx[c2id]))
                    c2_emb = [
                        np.concatenate([c2_w2v[i], c2_elmo[i]])
                        for i in range(len(c2_w2v))
                    ]

                    c1, c2 = treekernel.similar_terminals(c1, c2)
                    kq1 = self.memoize(q1id, q1, q1_emb, q1id, q1, q1_emb,
                                       treekernel)
                    kc1 = self.memoize(c1id, c1, c1_emb, c1id, c1, c1_emb,
                                       treekernel)
                    kq1c1 = float(
                        self.memoize(q1id, q1, q1_emb, c1id,
                                     c1, c1_emb, treekernel)) / np.sqrt(
                                         kq1 * kc1)  # normalized

                    kq2 = self.memoize(q2id, q2, q2_emb, q2id, q2, q2_emb,
                                       treekernel)
                    kc2 = self.memoize(c2id, c2, c2_emb, c2id, c2, c2_emb,
                                       treekernel)
                    kq2c2 = float(
                        self.memoize(q2id, q2, q2_emb, c2id,
                                     c2, c2_emb, treekernel)) / np.sqrt(
                                         kq2 * kc2)  # normalized

                    # kq1c2 = float(self.memoize(q1id, q1, q1_emb, c2id, c2, c2_emb, treekernel)) / np.sqrt(kq1 * kc2) # normalized
                    # kq2c1 = float(self.memoize(q2id, q2, q2_emb, c1id, c1, c1_emb, treekernel)) / np.sqrt(kq2 * kc1) # normalized

                    k = kq1c1 + kq2c2
                    x.append(k)
                    print('Preparing kernel: ',
                          percentage,
                          i + 1,
                          j + 1,
                          sep='\t',
                          end='\r')
                X.append(x)
                y.append(q['label'])
            p.dump(list(zip(X, y)), open(KERNEL_PATH, 'wb'))
            X = np.array(X)
        else:
            f = p.load(open(KERNEL_PATH, 'rb'))
            X = np.array([x[0] for x in f])
            y = list(map(lambda x: x[1], f))

        self.model = self.train_svm(trainvectors=X,
                                    labels=y,
                                    c='search',
                                    kernel='precomputed',
                                    gamma='search',
                                    jobs=4)
        logging.info('Finishing to train tree svm.', extra=d)