def test(self, testset):
    """Classify every query/duplicate pair of ``testset`` and evaluate the result.

    For each query the tree is parsed, binarized and encoded with
    ``self.expr_for_tree``; the same is done for each related question, and
    the two vectors are concatenated and pushed through the softmax layer
    ``self.W`` / ``self.bW``.  The predicted label is the index of the
    largest probability.

    Args:
        testset: mapping from query id to a record with a ``'tree'`` entry and
            a ``'duplicates'`` list whose items carry a ``'rel_question'``
            record (``'id'``, ``'tree'``, ``'relevance'``).

    Returns:
        Tuple ``(map_baseline, map_model, f1score)`` as produced by
        ``utils.evaluate`` and ``f1_score``.
    """
    ranking = {}
    y_real, y_pred = [], []
    total = len(testset)

    for position, qid in enumerate(testset, start=1):
        entry = testset[qid]
        pairs = []
        ranking[qid] = pairs
        percentage = round(float(position) / total, 2)
        print('Progress: ', percentage, sep='\t', end='\r')

        query_tree = utils.binarize(utils.parse_tree(entry['tree']))
        q1_vec, _ = self.expr_for_tree(query_tree['root'], query_tree)

        for duplicate in entry['duplicates']:
            rel_question = duplicate['rel_question']
            rel_question_id = rel_question['id']
            candidate_tree = utils.binarize(
                utils.parse_tree(rel_question['tree']))
            q2_vec, _ = self.expr_for_tree(candidate_tree['root'],
                                           candidate_tree)

            pair_vec = dy.concatenate([q1_vec, q2_vec])
            probs = dy.softmax(self.W * pair_vec + self.bW).vec_value()
            score = probs.index(max(probs))
            y_pred.append(score)

            # Anything not marked 'Irrelevant' counts as a positive example.
            y_real.append(int(rel_question['relevance'] != 'Irrelevant'))
            # The predicted label is stored twice in the ranking tuple.
            pairs.append((score, score, rel_question_id))

        # Start a fresh computation graph once the query is fully processed.
        dy.renew_cg()

    gold = utils.prepare_gold(GOLD_PATH)
    map_baseline, map_model = utils.evaluate(gold, ranking)
    f1score = f1_score(y_real, y_pred)
    return map_baseline, map_model, f1score
def handle_client(conn, addr):
    """Serve one client connection until it disconnects.

    Protocol, as implemented below: the client first sends a header of up to
    ``HEADER`` bytes containing the decimal length of the message, then the
    message itself.  Every message is word-segmented, POS-tagged, parsed and
    reduced to keywords, which are sent back as a single space-prefixed
    string (``"None"`` when no keyword was found).  The loop ends when the
    client sends ``DISCONNECT_MSG`` or closes the socket.

    Args:
        conn: connected socket object for this client.
        addr: client address, used only for log output.
    """
    print(f'[NEW CONNECTION] {addr} connected.')
    try:
        while True:
            header = conn.recv(HEADER).decode(FORMAT)
            if not header:
                # recv() yields no data once the peer has closed the
                # connection; leaving the loop here avoids spinning forever
                # on a dead socket.
                break

            msg_length = int(header)
            print(f"msgl = {msg_length}")
            msg = conn.recv(msg_length).decode(FORMAT)
            if msg == DISCONNECT_MSG:
                break
            print(f"[{addr}] : {msg}")

            ws = ws_driver([str(msg)])
            pos = pos_driver(ws)
            parse_input = prepro(ws, pos)
            ParsTree = parse_tree(ps.apply_list(parse_input)[0])
            root = pt.make_tree(ParsTree)
            leaves = pt.list_of_leaves(root)
            # NOTE(review): the stand-alone script in this file calls
            # `pt.get_keywords`; confirm which spelling the parser class
            # actually provides.
            keywords = "".join(" " + word for word in pt.getkeywords(leaves))
            if keywords == "":
                keywords = "None"
            print(keywords)
            conn.send(keywords.encode(FORMAT))
    finally:
        # Release the socket even when parsing or decoding raises.
        conn.close()
def scrap_match_ids(self, league_id):
    """Collect the match ids of a league by walking its paginated archive.

    Pages are fetched through ``self.fetch_url`` starting at page 1.  Paging
    stops at the first page that has no ``html > body > table > tbody``
    element or whose rows carry no ``xeid`` attribute.

    Args:
        league_id: identifier interpolated into the archive URL.

    Returns:
        List of ``xeid`` values, reversed relative to the order in which they
        were collected.
    """
    match_ids = []
    page_num = 1
    while True:
        league_data_url = f'/ajax-sport-country-tournament-archive/1/{league_id}/X0/1/0/{page_num}/'
        json_value = self.fetch_url(league_data_url)
        tree = parse_tree(json_value['d']['html'])

        # Descend one tag at a time so that a missing ancestor ends the
        # pagination instead of raising AttributeError on None.
        tbody_node = tree
        for tag in ('html', 'body', 'table', 'tbody'):
            tbody_node = tbody_node.find(tag)
            if tbody_node is None:
                break
        if tbody_node is None:
            break

        new_match_ids = [
            tr['xeid'] for tr in tbody_node.find_all('tr')
            if tr.has_attr('xeid')
        ]
        if not new_match_ids:
            break

        match_ids.extend(new_match_ids)
        page_num += 1

    match_ids.reverse()
    return match_ids
async def alert_init(client):
    """Send a start-up message listing every registered handler callback.

    Walks all handler groups of ``client.dispatcher``, resolves each handler's
    callback (``user_callback`` when the handler has one, else ``callback``),
    builds its ``<module>.<function>`` name and posts the text produced by
    ``config.langs.start_log`` to ``logs_chat``.

    Args:
        client: client object exposing ``dispatcher.groups`` and
            ``send_message``.
    """
    callbacks = []
    for group in client.dispatcher.groups.values():
        for handler in group:
            if hasattr(handler, 'user_callback'):
                callbacks.append(handler.user_callback)
            else:
                callbacks.append(handler.callback)
    plugins_count = len(callbacks)

    plugins_names = []
    for callback in callbacks:
        members = dict(inspect.getmembers(callback))
        module_name = members['__globals__']['__name__']
        plugins_names.append(f"{module_name}.{members['__name__']}")

    plugins_text = utils.tree(utils.parse_tree(plugins_names))
    started_text = config.langs.start_log(plugins_count=plugins_count,
                                          plugins_names=plugins_names,
                                          plugins_text=plugins_text,
                                          client=client)
    await client.send_message(logs_chat, started_text)
def update_state_hierarchical(self, groundtruth_data, detections, hierarchical_scores, hierarchical_classes, tree_filename):
    """Update detection results and groundtruth data.

    Similar to update_state(); the difference is that this function walks up
    the class hierarchy and resets the predicted class so that it can match
    the groundtruth. For example, given gt 'Person', if the nearest bbox
    prediction is 'Girl', we walk up the tree to check whether 'Person' is
    among its parents. If it is, the nearest predicted bbox class is set to
    'Person', and its score becomes the sum of the predicted scores of the
    corresponding leaves of that bbox. In this case, Person has leaves
    ['Woman', 'Man', 'Boy', 'Girl']: the scores of the four classes are
    summed and min(sum, 1.0) is used as the bbox score.

    At most ``max_levels`` (3) parents are considered per predicted class.

    NOTE(review): the block structure below was reconstructed from a
    whitespace-collapsed source; confirm the nesting against the original
    file.

    Args:
      groundtruth_data: Groundtruth annotations in a tensor with each row
        representing [y1, x1, y2, x2, class].
      detections: Detection results in a tensor with each row representing
        [image_id, x, y, width, height, score, class].
      hierarchical_scores: [batch_size, num_boxes, num_classes], note:
        num_boxes is the number of bboxes remaining after nms.
      hierarchical_classes: [batch_size, num_boxes, num_classes].
      tree_filename: string file name, passed to utils.parse_tree.
    """
    tree_leaf2root, sumrule = utils.parse_tree(tree_filename)
    # Collected per image but not read again inside this method.
    batch_parent_scores = []
    batch_parent_classes = []
    # Top-1 score/class per bbox; entries are overwritten below when a
    # groundtruth class matches one of the bbox's parents.
    new_bbox_scores = hierarchical_scores[:, :, 0]
    new_bbox_classes = hierarchical_classes[:, :, 0]
    for i, det in enumerate(detections):
        # Filter out detections with the top1 predicted class label = -1.
        indices = np.where(hierarchical_classes[i, :, 0] > -1)[0]
        # NOTE(review): the result of this astype() call is discarded.
        hierarchical_classes[i][indices].astype(int)
        det = det[indices]
        if det.shape[0] == 0:
            continue
        # Append groundtruth annotations to create COCO dataset object.
        # Add images.
        image_id = det[0, 0]
        if image_id == -1:
            image_id = self.image_id
        det[:, 0] = image_id
        max_levels = 3
        per_bbox_parent_scores = []  # each leaf score corresponds to maximum of max_level parents
        per_bbox_parent_classes = []
        # find parent scores and classes for each bbox
        for _scores, _classes in zip(hierarchical_scores[i], hierarchical_classes[i]):
            per_class_parent_scores = []
            per_class_parent_classes = []
            for _s, _c in zip(_scores, _classes):
                # -1 marks "no parent at this level".
                parent_scores = [-1] * max_levels
                parent_classes = [-1] * max_levels
                parents = self.get_leaf_to_parent_path(tree_leaf2root, _c)[:max_levels]
                np_classes = np.asarray(_classes)
                np_scores = np.asarray(_scores)
                for _ii, _p in enumerate(parents):
                    # Sum the scores of this bbox's predicted classes that
                    # are leaves of parent _p, capped at 1.0.
                    np_leaves = np.asarray(sumrule[_p])
                    overlap_leaves, indices1, indices2 = np.intersect1d(
                        np_leaves, np_classes, return_indices=True)
                    _p_score = min(np_scores[indices2].sum(), 1.0)
                    parent_scores[_ii] = _p_score
                    parent_classes[_ii] = _p
                per_class_parent_scores.append(parent_scores)
                per_class_parent_classes.append(parent_classes)
            per_bbox_parent_scores.append(per_class_parent_scores)
            per_bbox_parent_classes.append(per_class_parent_classes)
        batch_parent_scores.append(per_bbox_parent_scores)
        batch_parent_classes.append(per_bbox_parent_classes)
        if not self.filename and not self.testdev_dir:
            # process groundtruth data only if filename is empty and no test_dev.
            self.dataset['images'].append({
                'id': int(image_id),
            })
            # Add annotations.
            indices = np.where(groundtruth_data[i, :, -1] > -1)[0]
            for data in groundtruth_data[i, indices]:
                box = data[0:4]
                category_id = data[4]
                area = (box[3] - box[1]) * (box[2] - box[0])
                if category_id < 0:
                    break
                # find the predicted bbox that has the largest IOU with gt, reset its prediction
                # NOTE(review): _idx indexes the unfiltered detections[i]; it
                # presumably lines up with the per-bbox lists built above,
                # confirm.
                _iou, _iou_max, _idx = self.get_max_iou(detections[i][:, 1:5], box)
                matched_bbox_parent_classes = per_bbox_parent_classes[_idx]  # shape: (k, max_levels)
                matched_bbox_parent_classes = np.asarray(matched_bbox_parent_classes)
                matched_bbox_parent_scores = per_bbox_parent_scores[_idx]
                matched_bbox_parent_scores = np.asarray(matched_bbox_parent_scores)
                parent_match_bbox_idx, parent_match_class_idx = np.where(
                    matched_bbox_parent_classes == category_id)
                # get the top1 score for a matching parent class
                if len(parent_match_bbox_idx) > 0 and len(parent_match_class_idx) > 0:
                    parent_match_bbox_idx = min(parent_match_bbox_idx)
                    parent_match_class_idx = min(parent_match_class_idx)
                    new_bbox_classes[i][_idx] = matched_bbox_parent_classes[parent_match_bbox_idx][
                        parent_match_class_idx]
                    new_bbox_scores[i][_idx] = matched_bbox_parent_scores[parent_match_bbox_idx][
                        parent_match_class_idx]
                self.dataset['annotations'].append({
                    'id': int(self.annotation_id),
                    'image_id': int(image_id),
                    'iscrowd': False,
                    'category_id': int(category_id),
                    'bbox': [box[1], box[0], box[3] - box[1], box[2] - box[0]],
                    'area': area,
                })
                self.annotation_id += 1
                self.category_ids.append(category_id)
        # Write the (possibly re-labelled) top-1 score/class back into the
        # detection rows before storing them.
        det[:, 5] = new_bbox_scores[i]
        det[:, 6] = new_bbox_classes[i]
        self.detections.extend(det)
        self.image_id += 1
    if not self.filename:
        # De-duplicate the category ids seen so far.
        self.category_ids = list(set(self.category_ids))
        self.dataset['categories'] = [{'id': int(category_id)} for category_id in self.category_ids]
def train(self):
    """Train the model with Adam, evaluating on the dev set after each epoch.

    Rows of ``self.traindata`` are turned into losses via ``self.get_loss``
    and accumulated; every ``self.BATCH`` rows the summed loss is
    back-propagated and the computation graph is renewed.  After each epoch
    the model is scored with ``self.test(self.devset)``, the learning rate is
    multiplied by 0.95, and training stops once the dev MAP has failed to
    improve for ``self.EARLY_STOP`` consecutive epochs.

    Losses left over at the end of an epoch (fewer than ``self.BATCH`` rows)
    are discarded without an update, as in the original implementation.
    """
    dy.renew_cg()
    trainer = dy.AdamTrainer(self.model)

    early = 0
    best = -1
    for epoch in range(self.EPOCH):
        print('\n')
        dy.renew_cg()
        losses = []
        closs = 0
        batch_timing = []
        for i, trainrow in enumerate(self.traindata):
            start = time.time()
            q1 = utils.binarize(utils.parse_tree(trainrow['q1_tree']))
            q2 = utils.binarize(utils.parse_tree(trainrow['q2_tree']))
            label = trainrow['label']

            loss = self.get_loss(q1, q2, label)
            losses.append(loss)

            if len(losses) == self.BATCH:
                loss = dy.esum(losses)
                # loss += self.regularization_loss()
                _loss = loss.value()
                closs += _loss
                loss.backward()
                trainer.update()
                dy.renew_cg()

                # percentage of trainset processed
                percentage = str(
                    round((float(i + 1) / len(self.traindata)) * 100, 2)) + '%'
                # Average time of the rows timed so far in this batch.  The
                # list is still empty when BATCH == 1 (the current row is only
                # recorded below), so fall back to the current row's elapsed
                # time instead of dividing by zero.
                if batch_timing:
                    time_epoch = round(
                        sum(batch_timing) / float(len(batch_timing)), 2)
                else:
                    time_epoch = round(time.time() - start, 2)

                print(
                    "Epoch: {0} \t\t Loss: {1} \t\t Epoch time: {2} \t\t Trainset: {3}"
                    .format(epoch + 1, round(_loss, 2), time_epoch,
                            percentage),
                    end=' \r')
                losses = []
                batch_timing = []

            end = time.time()
            batch_timing.append(end - start)

        log = "Epoch: {0} \t\t Loss: {1} \t\t Best: {2}".format(
            epoch + 1, round(closs / self.BATCH, 2), round(best, 2))
        print('\n' + log)

        log = 'Dev evaluation...'
        print(log)
        map_baseline, map_model, f1score = self.test(self.devset)

        print('MAP Model: ',
              round(map_model, 2),
              'MAP baseline: ',
              round(map_baseline, 2),
              'F1 score: ',
              str(round(f1score, 2)),
              sep='\t',
              end='\n')

        trainer.learning_rate *= 0.95
        if map_model > best:
            best = copy.copy(map_model)
            early = 0
            # path = self.fname() + '.dy'
            # self.model.save(os.path.join(EVALUATION_PATH, path))
        else:
            early += 1

        if early == self.EARLY_STOP:
            break
import socket
import re
import threading

from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger
from utils_parser.parser_util import get_parser_tree_cls, NodeType
import ckip_classic.client
from utils import prepro, parse_tree

# Stand-alone smoke test of the pipeline: segment -> POS-tag -> parse ->
# build tree -> extract keywords, run on one hard-coded sentence.
msg = '如果是在夜間或其他時後遭到家庭暴力'
text = [str(msg)]

# Initialize drivers with custom checkpoints
ws_driver = CkipWordSegmenter(level=3, device=0)
pos_driver = CkipPosTagger(level=3, device=0)
ps = ckip_classic.client.CkipParserClient(username='******',
                                          password='******')
pt = get_parser_tree_cls(NodeType.Origin_Ckip)

ws = ws_driver(text)
pos = pos_driver(ws)
parse_input = prepro(ws, pos)
print(parse_input)

# Only the first parse result is used.
ParsTree = parse_tree(ps.apply_list(parse_input)[0])
print("pas", ParsTree)

root = pt.make_tree(ParsTree)
t = pt.list_of_leaves(root)
k = pt.get_keywords(t)

# Each keyword is prefixed with a single space.
keywords = "".join(" " + word for word in k)
print(keywords)
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

    Builds the detection model and, depending on `mode`, the prediction
    dictionary, the training op (optimizer, optional gradient clipping,
    optional EMA) or the evaluation metric function. When `params['tree']` is
    set, the hierarchy file is parsed, its sum rule is stored in
    `params['sumrule']`, and the hierarchical NMS / metric paths are used in
    EVAL mode.

    NOTE(review): the block structure below was reconstructed from a
    whitespace-collapsed source; confirm the nesting against the original
    file.

    Args:
      features: the input image tensor with shape [batch_size, height, width,
        3]. The height and width are fixed and equal.
      labels: the input labels in a dictionary. The labels include class
        targets and box targets which are dense label maps. The labels are
        generated from get_input_fn function in data/dataloader.py
      mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
      params: the dictionary defines hyperparameters of model. The default
        settings are in default_hparams function in this file. This function
        writes `params['sumrule']` and
        `params['nms_configs']['max_nms_inputs']` into it.
      model: the model outputs class logits and box regression outputs.
      variable_filter_fn: the filter function that takes trainable_variables
        and returns the variable list after applying the filter rule.

    Returns:
      In PREDICT mode a `tf.estimator.EstimatorSpec` holding the predictions;
      otherwise the `TPUEstimatorSpec` to run training or evaluation.

    Raises:
      RuntimeError: if both ckpt and backbone_ckpt are set.
      ValueError: if params['optimizer'] is neither 'sgd' nor 'adam' in TRAIN
        mode.
    """
    utils.image('input_image', features)
    training_hooks = []

    def _model_outputs(inputs):
        # Convert params (dict) to Config for easier access.
        return model(inputs, config=hparams_config.Config(params))

    precision = utils.get_precision(params['strategy'], params['mixed_precision'])
    cls_outputs, box_outputs = utils.build_model_with_precision(
        precision, _model_outputs, features, params['is_training_bn'])

    # Cast every level's outputs to float32 regardless of the precision the
    # model was built with.
    levels = cls_outputs.keys()
    for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'image': features,
        }
        for level in levels:
            predictions['cls_outputs_%d' % level] = cls_outputs[level]
            predictions['box_outputs_%d' % level] = box_outputs[level]
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # The hierarchy's sum rule (None when no tree is configured) is handed to
    # the loss through params.
    sumrule = None
    if params.get('tree'):
        _, sumrule = utils.parse_tree(params['tree'])
    params['sumrule'] = sumrule

    # cls_loss and box_loss are for logging. only total_loss is optimized.
    det_loss, cls_loss, box_loss, box_iou_loss = detection_loss(
        cls_outputs, box_outputs, labels, params)
    reg_l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + reg_l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate)
        utils.scalar('trainloss/cls_loss', cls_loss)
        utils.scalar('trainloss/box_loss', box_loss)
        utils.scalar('trainloss/det_loss', det_loss)
        utils.scalar('trainloss/reg_l2_loss', reg_l2loss)
        utils.scalar('trainloss/loss', total_loss)
        if params['iou_loss_type']:
            utils.scalar('trainloss/box_iou_loss', box_iou_loss)
        train_epochs = tf.cast(global_step, tf.float32) / params['steps_per_epoch']
        utils.scalar('train_epochs', train_epochs)

    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    if params['strategy'] == 'horovod':
        import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
        # Scale the learning rate by the number of horovod workers.
        learning_rate = learning_rate * hvd.size()

    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['optimizer'].lower() == 'sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'].lower() == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            raise ValueError('optimizers should be adam or sgd')

        if params['strategy'] == 'tpu':
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)
        elif params['strategy'] == 'horovod':
            optimizer = hvd.DistributedOptimizer(optimizer)
            training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list)

        if params.get('clip_gradients_norm', 0) > 0:
            logging.info('clip gradients norm by %f', params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                clipped_grads, gnorm = tf.clip_by_global_norm(
                    grads, params['clip_gradients_norm'])
                utils.scalar('gnorm', gnorm)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars, global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss, global_step, var_list=var_list)

        if moving_average_decay:
            # The EMA update runs after the optimizer step and becomes the
            # train op.
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)
    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            if params['nms_configs'].get('pyfunc', False):
                # Per-image NMS through the numpy implementation.
                # NOTE(review): nms_scores / nms_classes are not defined on
                # this path, so the `params['tree']` metric call below would
                # fail if pyfunc were enabled together with a tree; confirm.
                detections_bs = []
                for index in range(kwargs['boxes'].shape[0]):
                    nms_configs = params['nms_configs']
                    detections = tf.numpy_function(
                        functools.partial(nms_np.per_class_nms,
                                          nms_configs=nms_configs), [
                                              kwargs['boxes'][index],
                                              kwargs['scores'][index],
                                              kwargs['classes'][index],
                                              tf.slice(kwargs['image_ids'], [index], [1]),
                                              tf.slice(kwargs['image_scales'], [index], [1]),
                                              params['num_classes'],
                                              nms_configs['max_output_size'],
                                          ], tf.float32)
                    detections_bs.append(detections)
            else:
                # These two branches should be equivalent, but currently they are not.
                # TODO(tanmingxing): enable the non_pyfun path after bug fix.
                if params.get('tree'):
                    nms_boxes, nms_scores, nms_classes, _ = postprocess.hierarchical_nms(
                        params, kwargs['boxes'], kwargs['scores'],
                        kwargs['image_scales'], k=10)
                    img_ids = tf.cast(tf.expand_dims(kwargs['image_ids'], -1),
                                      nms_scores.dtype)
                    # Only the first entry along the last axis of the
                    # hierarchical scores/classes goes into the detections.
                    detections_bs = [
                        img_ids * tf.ones_like(nms_boxes[:, :, 1], dtype=tf.float32),
                        nms_boxes[:, :, 1],
                        nms_boxes[:, :, 0],
                        nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
                        nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
                        nms_scores[:, :, 0],
                        nms_classes[:, :, 0],
                    ]
                else:
                    nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
                        params, kwargs['boxes'], kwargs['scores'],
                        kwargs['classes'], kwargs['image_scales'])
                    img_ids = tf.cast(tf.expand_dims(kwargs['image_ids'], -1),
                                      nms_scores.dtype)
                    detections_bs = [
                        img_ids * tf.ones_like(nms_scores),
                        nms_boxes[:, :, 1],
                        nms_boxes[:, :, 0],
                        nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
                        nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
                        nms_scores,
                        nms_classes,
                    ]
                detections_bs = tf.stack(detections_bs, axis=-1, name='detnections')

            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                eval_metric = coco_metric.EvaluationMetric(
                    testdev_dir=params['testdev_dir'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, tf.zeros([1]))
            else:
                logging.info('Eval val with groudtruths %s.', params['val_json_file'])
                eval_metric = coco_metric.EvaluationMetric(
                    filename=params['val_json_file'])
                if params.get('tree', None):
                    # TODO: Add in hierarchy file here?
                    coco_metrics = eval_metric.estimator_metric_fn(
                        detections_bs, kwargs['groundtruth_data'], nms_scores,
                        nms_classes, params['tree'])
                else:
                    coco_metrics = eval_metric.estimator_metric_fn(
                        detections_bs, kwargs['groundtruth_data'])

            # Add metrics to output.
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        # Repeat the scalar losses to shape [batch_size, 1] before passing
        # them to metric_fn.
        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])

        cls_outputs = postprocess.to_list(cls_outputs)
        box_outputs = postprocess.to_list(box_outputs)
        params['nms_configs']['max_nms_inputs'] = anchors.MAX_DETECTION_POINTS
        # The hierarchical path disables the top-k selection in pre_nms.
        if params.get('tree'):
            boxes, scores, classes = postprocess.pre_nms(params, cls_outputs,
                                                         box_outputs, topk=False)
        else:
            boxes, scores, classes = postprocess.pre_nms(params, cls_outputs,
                                                         box_outputs, topk=True)
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'image_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
            'boxes': boxes,
            'scores': scores,
            'classes': classes,
        }
        eval_metrics = (metric_fn, metric_fn_inputs)

    checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

    if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
        # Initialize the model from an EfficientDet or backbone checkpoint.
        if params.get('ckpt') and params.get('backbone_ckpt'):
            raise RuntimeError(
                '--backbone_ckpt and --checkpoint are mutually exclusive')

        if params.get('backbone_ckpt'):
            var_scope = params['backbone_name'] + '/'
            if params['ckpt_var_scope'] is None:
                # Use backbone name as default checkpoint scope.
                ckpt_scope = params['backbone_name'] + '/'
            else:
                ckpt_scope = params['ckpt_var_scope'] + '/'
        else:
            # Load every var in the given checkpoint
            var_scope = ckpt_scope = '/'

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            logging.info('restore variables from %s', checkpoint)

            var_map = utils.get_ckpt_var_map(
                ckpt_path=checkpoint,
                ckpt_scope=ckpt_scope,
                var_scope=var_scope,
                skip_mismatch=params['skip_mismatch'])

            tf.train.init_from_checkpoint(checkpoint, var_map)
            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            logging.info('Load EMA vars with ema_decay=%f', moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        scaffold_fn = None

    if params['strategy'] != 'tpu':
        # Profile every 1K steps.
        profile_hook = tf.train.ProfilerHook(save_steps=1000,
                                             output_dir=params['model_dir'])
        training_hooks.append(profile_hook)

        # Report memory allocation if OOM
        class OomReportingHook(tf.estimator.SessionRunHook):

            def before_run(self, run_context):
                return tf.estimator.SessionRunArgs(
                    fetches=[],
                    options=tf.RunOptions(
                        report_tensor_allocations_upon_oom=True))

        training_hooks.append(OomReportingHook())

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=total_loss,
                                             train_op=train_op,
                                             eval_metrics=eval_metrics,
                                             host_call=utils.get_tpu_host_call(
                                                 global_step, params),
                                             scaffold_fn=scaffold_fn,
                                             training_hooks=training_hooks)
def scrap(self, country, league, matchID):
    """Scrape one match page and return its basic information.

    The page is fetched and parsed, the ``xhash`` token is cut out of the
    first ``<script>`` element of the body, and time, teams, final score and
    the first two period scores are read from the ``col-content`` node.
    Odds are then collected through ``self.__scrap_odds``, which receives
    the ``match_info`` dict.

    Args:
        country: country slug used in the match URL.
        league: league slug used in the match URL.
        matchID: match identifier used in the match URL.

    Returns:
        Dict with keys ``match_id``, ``time``, ``team_home``, ``team_away``,
        ``score_home``, ``score_away``, ``was_extra`` and the four
        ``score_*_period*`` entries, or ``None`` when the xhash token cannot
        be located or the match is rejected by the time filter.
    """
    match_url = f'https://www.oddsportal.com/soccer/{country}/{league}/{matchID}/'
    content = fetch_url(match_url)
    tree = parse_tree(content)

    xhash_text = str(tree.find('html').find('body').find('script'))
    xhash_label = '"xhash":"'
    # Test the raw find() result: str.find returns -1 when the label is
    # absent, and adding the label length first would hide that.
    label_pos = xhash_text.find(xhash_label)
    i1 = label_pos + len(xhash_label)
    i2 = xhash_text.find('","xhashf"')
    if label_pos < 0 or i2 < i1:
        return None
    xhash = unquote(xhash_text[i1:i2])

    main_node = tree.find('html').find('body').find('div').find(
        'div', {
            'id': 'mother-main'
        }).find('div', {
            'id': 'mother'
        }).find('div', {
            'id': 'wrap'
        }).find('div').find('div').find('div', {
            'id': 'main'
        }).find('div', {'id': 'col-content'})

    # The third class of the first <p> carries the timestamp in characters
    # 1..10.
    timestamp = int(main_node.find('p')['class'][2][1:11])
    time_dt = datetime.utcfromtimestamp(timestamp)
    if self.__do_time_filter(time_dt):
        return None

    teams_text = main_node.find('h1').text
    teams = [team.strip() for team in teams_text.split('-')]
    team_home, team_away = teams[0], teams[1]

    event_status = main_node.find('div', {'id': 'event-status'})

    score_node = event_status.find('strong')
    was_extra = False
    if score_node is None:
        score_home, score_away = None, None
    else:
        score = score_node.text
        score, was_pen = self.correct_score(score, 'penalties')
        score, was_et = self.correct_score(score, 'ET')
        score, was_ot = self.correct_score(score, 'OT')
        score_home, score_away = self.parse_score(score)
        was_extra = was_pen or was_et or was_ot

    periods_node = event_status.find('p', {'class': 'result'})
    period1_home, period1_away, period2_home, period2_away = None, None, None, None
    if periods_node is not None:
        # Period scores are the comma-separated items between parentheses.
        periods_text = periods_node.text
        i1, i2 = periods_text.find('(') + 1, periods_text.find(')')
        periods_text = periods_text[i1:i2]
        periods = [period.strip() for period in periods_text.split(',')]
        if len(periods) >= 2:
            period1_home, period1_away = self.parse_score(periods[0])
            period2_home, period2_away = self.parse_score(periods[1])

    match_info = {
        'match_id': matchID,
        'time': timestamp,
        'team_home': team_home,
        'team_away': team_away,
        'score_home': score_home,
        'score_away': score_away,
        'was_extra': was_extra,
        'score_home_period1': period1_home,
        'score_away_period1': period1_away,
        'score_home_period2': period2_home,
        'score_away_period2': period2_away
    }

    self.__scrap_odds(match_info, matchID, xhash)
    return match_info
def validate(self):
    """Score every dev query/duplicate pair with the tree-kernel SVM.

    For each pair a kernel row is built against all rows of
    ``self.traindata``: the normalized kernel between the two query-side
    questions plus the normalized kernel between the two related-side
    questions.  The row is passed to ``self.model`` for a decision score and
    a predicted label.  The ranking is also written to
    ``data/treeranking.txt``.

    Returns:
        Tuple ``(ranking, y_real, y_pred)`` where ``ranking`` maps each query
        id to a list of ``(real_label, score, related_question_id)``.
    """

    def lemma_tree(tree, tokens, lemmas):
        # Binarized parse tree with tokens mapped to their lemmas.
        return utils.binarize(utils.parse_tree(tree, dict(zip(tokens, lemmas))))

    def embed(tokens, elmo_store, elmo_index, question_id):
        # Token-wise concatenation of word2vec and ELMo vectors.
        w2v = features.encode(tokens, self.word2vec)
        elmo = elmo_store.get(str(elmo_index[question_id]))
        return [np.concatenate([vec, elmo[pos]]) for pos, vec in enumerate(w2v)]

    logging.info('Validating tree svm.', extra=d)
    treekernel = features.TreeKernel()

    ranking = {}
    y_real, y_pred = [], []
    for i, q1id in enumerate(self.devset):
        ranking[q1id] = []
        percentage = round(float(i + 1) / len(self.devset), 2)

        query = self.devset[q1id]
        q1_tree = lemma_tree(query['tree'], query['tokens'], query['lemmas'])
        q1_emb = embed(query['tokens'], self.fulldevelmo, self.fulldevidx, q1id)

        for duplicate in query['duplicates']:
            rel_question = duplicate['rel_question']
            q2id = rel_question['id']
            # tree kernel
            q2_tree = lemma_tree(rel_question['tree'], rel_question['tokens'],
                                 rel_question['lemmas'])
            # word2vec vectors
            q2_emb = embed(rel_question['tokens'], self.fulldevelmo,
                           self.fulldevidx, q2id)

            q1_tree, q2_tree = treekernel.similar_terminals(q1_tree, q2_tree)

            X = []
            for trainrow in self.traindata:
                c1id, c2id = trainrow['q1_id'], trainrow['q2_id']
                c1_tree = lemma_tree(trainrow['q1_tree'], trainrow['q1_full'],
                                     trainrow['q1_lemmas'])
                c2_tree = lemma_tree(trainrow['q2_tree'], trainrow['q2_full'],
                                     trainrow['q2_lemmas'])
                # word2vec vectors
                c1_emb = embed(trainrow['q1_full'], self.fulltrainelmo,
                               self.fulltrainidx, c1id)
                c2_emb = embed(trainrow['q2_full'], self.fulltrainelmo,
                               self.fulltrainidx, c2id)

                c1_tree, c2_tree = treekernel.similar_terminals(c1_tree, c2_tree)

                # Normalized kernel between the two first questions.
                kq1 = self.memoize(q1id, q1_tree, q1_emb, q1id, q1_tree, q1_emb,
                                   treekernel)
                kc1 = self.memoize(c1id, c1_tree, c1_emb, c1id, c1_tree, c1_emb,
                                   treekernel)
                kq1c1 = float(
                    self.memoize(q1id, q1_tree, q1_emb, c1id, c1_tree, c1_emb,
                                 treekernel)) / np.sqrt(kq1 * kc1)

                # Normalized kernel between the two second questions.
                kq2 = self.memoize(q2id, q2_tree, q2_emb, q2id, q2_tree, q2_emb,
                                   treekernel)
                kc2 = self.memoize(c2id, c2_tree, c2_emb, c2id, c2_tree, c2_emb,
                                   treekernel)
                kq2c2 = float(
                    self.memoize(q2id, q2_tree, q2_emb, c2id, c2_tree, c2_emb,
                                 treekernel)) / np.sqrt(kq2 * kc2)

                # kq1c2 = float(self.memoize(q1id, q1_tree, q1_emb, c2id, c2_tree, c2_emb, treekernel)) / np.sqrt(kq1 * kc2) # normalized
                # kq2c1 = float(self.memoize(q2id, q2_tree, q2_emb, c1id, c1_tree, c1_emb, treekernel)) / np.sqrt(kq2 * kc1) # normalized
                X.append(kq1c1 + kq2c2)
            print('Progress: ', percentage, i + 1, sep='\t', end='\r')

            score = self.model.decision_function([X])[0]
            pred_label = self.model.predict([X])[0]
            y_pred.append(pred_label)

            # Anything not marked 'Irrelevant' counts as a positive example.
            real_label = 1 if rel_question['relevance'] != 'Irrelevant' else 0
            y_real.append(real_label)
            ranking[q1id].append((real_label, score, q2id))

    with open('data/treeranking.txt', 'w') as f:
        for qid in ranking:
            for gold_label, decision_score, related_id in ranking[qid]:
                label = 'true' if gold_label == 1 else 'false'
                f.write('\t'.join([
                    str(qid),
                    str(related_id),
                    str(0),
                    str(decision_score), label, '\n'
                ]))

    logging.info('Finishing to validate tree svm.', extra=d)
    return ranking, y_real, y_pred
def train(self):
    """Train the tree-kernel SVM on a precomputed kernel matrix.

    When ``KERNEL_PATH`` does not exist, the kernel matrix is computed by
    comparing every training pair with every training pair: each entry is
    the normalized kernel between the two first questions plus the
    normalized kernel between the two second questions.  The rows are
    pickled to ``KERNEL_PATH`` together with their labels.  Otherwise the
    cached rows are loaded from that file.  The result is handed to
    ``self.train_svm`` with ``kernel='precomputed'`` and stored in
    ``self.model``.
    """

    def lemma_tree(tree, tokens, lemmas):
        # Binarized parse tree with tokens mapped to their lemmas.
        return utils.binarize(utils.parse_tree(tree, dict(zip(tokens, lemmas))))

    def embed(tokens, question_id):
        # Token-wise concatenation of word2vec and ELMo vectors.
        w2v = features.encode(tokens, self.word2vec)
        elmo = self.fulltrainelmo.get(str(self.fulltrainidx[question_id]))
        return [np.concatenate([vec, elmo[pos]]) for pos, vec in enumerate(w2v)]

    logging.info('Training tree svm.', extra=d)
    treekernel = features.TreeKernel()

    if not os.path.exists(KERNEL_PATH):
        X, y = [], []
        for i, q in enumerate(self.traindata):
            percentage = round(float(i + 1) / len(self.traindata), 2)
            x = []
            q1id, q2id = q['q1_id'], q['q2_id']

            # trees
            q1 = lemma_tree(q['q1_tree'], q['q1_full'], q['q1_lemmas'])
            q2 = lemma_tree(q['q2_tree'], q['q2_full'], q['q2_lemmas'])

            # word2vec and elmo vectors
            q1_emb = embed(q['q1_full'], q1id)
            q2_emb = embed(q['q2_full'], q2id)

            q1, q2 = treekernel.similar_terminals(q1, q2)
            for j, c in enumerate(self.traindata):
                c1id, c2id = c['q1_id'], c['q2_id']

                # trees
                c1 = lemma_tree(c['q1_tree'], c['q1_full'], c['q1_lemmas'])
                c2 = lemma_tree(c['q2_tree'], c['q2_full'], c['q2_lemmas'])

                # word2vec vectors
                c1_emb = embed(c['q1_full'], c1id)
                c2_emb = embed(c['q2_full'], c2id)

                c1, c2 = treekernel.similar_terminals(c1, c2)

                # Normalized kernel between the two first questions.
                kq1 = self.memoize(q1id, q1, q1_emb, q1id, q1, q1_emb,
                                   treekernel)
                kc1 = self.memoize(c1id, c1, c1_emb, c1id, c1, c1_emb,
                                   treekernel)
                kq1c1 = float(
                    self.memoize(q1id, q1, q1_emb, c1id, c1, c1_emb,
                                 treekernel)) / np.sqrt(kq1 * kc1)

                # Normalized kernel between the two second questions.
                kq2 = self.memoize(q2id, q2, q2_emb, q2id, q2, q2_emb,
                                   treekernel)
                kc2 = self.memoize(c2id, c2, c2_emb, c2id, c2, c2_emb,
                                   treekernel)
                kq2c2 = float(
                    self.memoize(q2id, q2, q2_emb, c2id, c2, c2_emb,
                                 treekernel)) / np.sqrt(kq2 * kc2)

                # kq1c2 = float(self.memoize(q1id, q1, q1_emb, c2id, c2, c2_emb, treekernel)) / np.sqrt(kq1 * kc2) # normalized
                # kq2c1 = float(self.memoize(q2id, q2, q2_emb, c1id, c1, c1_emb, treekernel)) / np.sqrt(kq2 * kc1) # normalized
                x.append(kq1c1 + kq2c2)
                print('Preparing kernel: ',
                      percentage,
                      i + 1,
                      j + 1,
                      sep='\t',
                      end='\r')

            X.append(x)
            y.append(q['label'])

        # Close the cache file deterministically instead of leaking the handle.
        with open(KERNEL_PATH, 'wb') as kernel_file:
            p.dump(list(zip(X, y)), kernel_file)
        X = np.array(X)
    else:
        # NOTE: unpickling executes code embedded in the file; KERNEL_PATH
        # must only ever point at a cache written by this method.
        with open(KERNEL_PATH, 'rb') as kernel_file:
            cached_rows = p.load(kernel_file)
        X = np.array([row[0] for row in cached_rows])
        y = [row[1] for row in cached_rows]

    self.model = self.train_svm(trainvectors=X,
                                labels=y,
                                c='search',
                                kernel='precomputed',
                                gamma='search',
                                jobs=4)
    logging.info('Finishing to train tree svm.', extra=d)