def test_unzip():
    def _to_lists(seq, n=10):
        """iter of iters -> finite list of finite lists"""
        def initial(s):
            return list(take(n, s))
        return initial(map(initial, seq))

    def _assert_initial_matches(a, b, n=10):
        assert list(take(n, a)) == list(take(n, b))

    # Unzips a simple list correctly
    assert _to_lists(unzip([('a', 1), ('b', 2), ('c', 3)])) \
        == [['a', 'b', 'c'], [1, 2, 3]]

    # Can handle a finite number of infinite iterators (the naive
    # `zip(*args)` implementation fails on this example).
    a, b, c = unzip(zip(count(1), repeat(0), repeat(1)))
    _assert_initial_matches(a, count(1))
    _assert_initial_matches(b, repeat(0))
    _assert_initial_matches(c, repeat(1))

    # Sensibly handles empty input
    assert list(unzip(zip([]))) == []
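# A minimal sketch (an assumption, not the implementation exercised by
# test_unzip above) of an `unzip` with the behaviour the test expects:
# tee-based buffering lets it transpose a finite number of infinite
# iterators, where the naive `zip(*args)` would never terminate.
# `_unzip_sketch` is a hypothetical name used only for illustration.
from itertools import chain, tee
from operator import itemgetter

def _unzip_sketch(pairs):
    it = iter(pairs)
    try:
        head = next(it)  # peek at the first tuple to learn the arity
    except StopIteration:
        return ()        # empty input -> no output iterators
    it = chain([head], it)        # push the peeked tuple back
    copies = tee(it, len(head))   # one lazily-buffered copy per column
    return tuple(map(itemgetter(i), copy) for i, copy in enumerate(copies))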
def coll_stride(tokenizer, batch, max_len=1024, stride=256):
    def is_good_data(d):
        """ make sure data is not empty"""
        source_sents, extracts = d
        return source_sents and extracts

    @curry
    def prepro(tokenizer, d, max_len=1024, stride=256):
        """ tokenize the article and split it into strided BERT-length chunks"""
        source_sents, extracts = d
        tokenized_sents = [
            tokenizer.tokenize(source_sent.lower())
            for source_sent in source_sents
        ]
        tokenized_sents = [['[CLS]'] + tokenized_sent
                           for tokenized_sent in tokenized_sents]
        tokenized_sents = [
            tokenizer.convert_tokens_to_ids(tokenized_sent)
            for tokenized_sent in tokenized_sents
        ]
        word_num = [
            len(tokenized_sent) for tokenized_sent in tokenized_sents
        ]
        truncated_word_num = []
        total_count = 0
        for num in word_num:
            if total_count + num < max_len:
                truncated_word_num.append(num)
            else:
                truncated_word_num.append(max_len - total_count)
                break
            total_count += num
        tokenized_sents = list(concat(tokenized_sents))[:max_len]
        tokenized_sents_lists = [tokenized_sents[:BERT_MAX_LEN]]
        length = len(tokenized_sents) - BERT_MAX_LEN
        i = 1
        while length > 0:
            tokenized_sents_lists.append(
                tokenized_sents[(i * BERT_MAX_LEN - stride):
                                ((i + 1) * BERT_MAX_LEN - stride)])
            i += 1
            length -= (BERT_MAX_LEN - stride)
        abs_sents = tokenize(None, extracts)
        art_sents = tokenize(None, source_sents)
        return (art_sents, tokenized_sents_lists, truncated_word_num), abs_sents

    art_batch, abs_batch = unzip(batch)
    art_batch, abs_batch = list(
        zip(*list(filter(is_good_data, zip(art_batch, abs_batch)))))
    art_sents, abs_sents = list(
        zip(*list(
            map(prepro(tokenizer, max_len=max_len, stride=stride),
                zip(art_batch, abs_batch)))))
    return art_sents, abs_sents
def coll(batch):
    art_batch, abs_batch, query_batch = unzip(batch)
    query_batch_list = list(query_batch)
    art_sents = list(filter(bool, map(tokenize(None), art_batch)))
    abs_sents = list(filter(bool, map(tokenize(None), abs_batch)))
    queries = list(
        filter(bool,
               map(tokenize(None),
                   [[query] for query in query_batch_list])))
    return art_sents, abs_sents, queries
def coll(batch):
    def is_good_data(d):
        """ make sure data is not empty"""
        source_sents, extracts = d
        return source_sents and extracts

    art_batch, abs_batch = unzip(batch)
    art_batch, abs_batch = list(
        zip(*list(filter(is_good_data, zip(art_batch, abs_batch)))))
    art_sents = list(filter(bool, map(tokenize(None), art_batch)))
    abs_sents = list(filter(bool, map(tokenize(None), abs_batch)))
    return art_sents, abs_sents
def position_random_with_filter(tokens, filer_fn, window_size=1):
    windowed_tokens = more_itertools.windowed(tokens, window_size)
    # for t in tokens:
    #     print("tokens:{}, {}".format(t.value, filer_fn([t])))
    random_pos_list = unzip(
        filter(lambda x: filer_fn(x[1]), enumerate(windowed_tokens)))
    if random_pos_list:
        random_pos_list = list(random_pos_list[0])
    else:
        raise NotFoundChangePositionException
    pos = random.sample(random_pos_list, k=1)[0]
    token = tokens[pos]
    return token.lexpos, pos
def pad_collate(features):
    """ pad the input features to same length"""
    input_ids, input_masks, segment_ids, lm_label_ids = map(
        list, unzip(features))
    max_len = max(map(len, input_ids))
    for ids, masks, segs, labels in zip(input_ids, input_masks,
                                        segment_ids, lm_label_ids):
        while len(ids) < max_len:
            ids.append(0)
            masks.append(0)
            segs.append(0)
            labels.append(-1)
    input_ids = torch.tensor(input_ids)
    input_mask = torch.tensor(input_masks)
    segment_ids = torch.tensor(segment_ids)
    lm_label_ids = torch.tensor(lm_label_ids)
    return input_ids, input_mask, segment_ids, lm_label_ids
def coll(tokenizer, batch):
    def is_good_data(d):
        """ make sure data is not empty"""
        source_sents, extracts = d
        return source_sents and extracts

    @curry
    def prepro(tokenizer, d, max_len=512):
        """ tokenize article sentences into a single truncated BERT id sequence"""
        source_sents, extracts = d
        tokenized_sents = [
            tokenizer.tokenize(source_sent.lower())
            for source_sent in source_sents
        ]
        tokenized_sents = [
            tokenized_sent + ['[SEP]'] for tokenized_sent in tokenized_sents
        ]
        tokenized_sents[0] = ['[CLS]'] + tokenized_sents[0]
        word_num = [
            len(tokenized_sent) for tokenized_sent in tokenized_sents
        ]
        truncated_word_num = []
        total_count = 0
        for num in word_num:
            if total_count + num < max_len:
                truncated_word_num.append(num)
            else:
                truncated_word_num.append(max_len - total_count)
                break
            total_count += num
        tokenized_sents = list(concat(tokenized_sents))[:max_len]
        tokenized_sents = tokenizer.convert_tokens_to_ids(tokenized_sents)
        abs_sents = tokenize(None, extracts)
        art_sents = tokenize(None, source_sents)
        return (art_sents, tokenized_sents, truncated_word_num), abs_sents

    art_batch, abs_batch = unzip(batch)
    art_batch, abs_batch = list(
        zip(*list(filter(is_good_data, zip(art_batch, abs_batch)))))
    art_sents, abs_sents = list(
        zip(*list(map(prepro(tokenizer), zip(art_batch, abs_batch)))))
    return art_sents, abs_sents
def fetch(pdu_type: PduType,
          df: Any,
          obj_type: Type[ObjectType],
          parameter: Optional[Text] = None,
          config: Optional[SnmpConfig] = None,
          **kwargs: Text) -> Tuple[Any, Sequence[SnmpError]]:
    """Fetch SNMP results and map to a DataFrame."""
    def _fetch() -> Iterator[Tuple[Any, Sequence[SnmpError]]]:
        for hosts, data, index in distribute(df, None, **kwargs):
            results, errors = distributed_fetch(
                pdu_type, hosts, obj_type, parameter, config=config)
            yield obj_type.to_pandas(results, data, index), errors

    result_dfs, errors_lists = unzip(list(_fetch()))
    return (pd.concat(result_dfs),
            [error for errors in errors_lists for error in errors])
def coll_sent(tokenizer, batch):
    def is_good_data(d):
        """ make sure data is not empty"""
        source_sents, extracts = d
        return source_sents and extracts

    @curry
    def prepro(tokenizer, d, max_len=150, max_sent_len=60):
        """ tokenize each sentence into a truncated BERT id sequence"""
        source_sents, extracts = d
        tokenized_sents = [
            tokenizer.tokenize(source_sent.lower())
            for source_sent in source_sents
        ]
        tokenized_sents = tokenized_sents[:max_sent_len]
        tokenized_sents = [['[CLS]'] + tokenized_sent[:max_len - 1]
                           for tokenized_sent in tokenized_sents]
        tokenized_sents = [
            tokenizer.convert_tokens_to_ids(tokenized_sent)
            for tokenized_sent in tokenized_sents
        ]
        word_num = [
            len(tokenized_sent) for tokenized_sent in tokenized_sents
        ]
        abs_sents = tokenize(None, extracts)
        art_sents = tokenize(None, source_sents)
        return (art_sents, tokenized_sents, word_num), abs_sents

    art_batch, abs_batch = unzip(batch)
    art_batch, abs_batch = list(
        zip(*list(filter(is_good_data, zip(art_batch, abs_batch)))))
    art_sents, abs_sents = list(
        zip(*list(map(prepro(tokenizer), zip(art_batch, abs_batch)))))
    return art_sents, abs_sents
def pad_collate(features):
    """ pad the input features to same length"""
    # need to sort by src lens (support RNN encoder)
    features = sorted(features, key=_feature_sort_key, reverse=True)
    (ids, src_ids, src_lens, tgt_ids,
     topk_logits, topk_inds) = map(list, unzip(features))
    src_ids = pad_sequence(src_ids, batch_first=False,
                           padding_value=PAD).unsqueeze(2)
    src_len = torch.tensor(src_lens)
    tgt_ids = pad_sequence(tgt_ids, batch_first=False,
                           padding_value=PAD).unsqueeze(2)
    ids = torch.tensor(ids)

    # pad bert hiddens
    len_, batch, _ = tgt_ids.size()
    k = topk_logits[0].size(-1)
    topk_logit = torch.zeros(
        len_ - 1,  # minus BOS
        batch,
        k,
        dtype=topk_logits[0].dtype)
    topk_index = torch.zeros(
        len_ - 1,  # minus BOS
        batch,
        k,
        dtype=topk_inds[0].dtype)
    for i, (logit, index) in enumerate(zip(topk_logits, topk_inds)):
        topk_logit.data[:logit.size(0), i, :] = logit.data
        topk_index.data[:index.size(0), i, :] = index.data
    batch = InputFeatures(src=(src_ids, src_len),
                          tgt=tgt_ids,
                          indices=ids,
                          batch_size=len(ids),
                          topk_logit=topk_logit,
                          topk_indices=topk_index)
    return batch
def paired_shuffle(iterable1, iterable2):
    """Shuffle two iterables jointly, preserving the pairing between them."""
    i1i2 = list(zip(iterable1, iterable2))
    random.shuffle(i1i2)
    i1, i2 = unzip(i1i2)
    return list(i1), list(i2)
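# Illustrative usage of paired_shuffle above; `_paired_shuffle_demo` is a
# hypothetical helper added only for demonstration. Both lists receive the
# same permutation, so corresponding elements stay paired.
def _paired_shuffle_demo():
    xs, ys = paired_shuffle([1, 2, 3], ['a', 'b', 'c'])
    assert sorted(xs) == [1, 2, 3]
    assert all(y == 'abc'[x - 1] for x, y in zip(xs, ys))
    return xs, ys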
def coll(batch):
    art_batch, abs_batch, extracted = unzip(batch)
    art_sents = list(filter(bool, map(tokenize(None), art_batch)))
    abs_sents = list(filter(bool, map(tokenize(None), abs_batch)))
    extracted = list(filter(bool, extracted))
    return art_sents, abs_sents, extracted
def coll(batch):
    art_batch, abs_batch, i_batch = unzip(batch)
    art_sents = list(filter(bool, map(tokenize(None), art_batch)))
    abs_sents = list(filter(bool, map(tokenize(None), abs_batch)))
    return art_sents, abs_sents, list(i_batch)
def coll(batch):
    split_token = '<split>'
    pad = 0
    art_batch, abs_batch, all_clusters = unzip(batch)
    art_sents = []
    abs_sents = []

    def is_good_data(d):
        """ make sure data is not empty"""
        source_sents, extracts = d
        return source_sents and extracts

    art_batch, abs_batch = list(
        zip(*list(filter(is_good_data, zip(art_batch, abs_batch)))))
    art_sents = list(filter(bool, map(tokenize(None), art_batch)))
    abs_sents = list(filter(bool, map(tokenize(None), abs_batch)))
    inputs = []
    # merge cluster
    for art_sent, clusters in zip(art_sents, all_clusters):
        cluster_words = []
        cluster_wpos = []
        cluster_spos = []
        for cluster in clusters:
            scluster_word = []
            scluster_wpos = []
            scluster_spos = []
            for mention in cluster:
                if len(mention['text'].strip().split(' ')) == len(
                        list(range(mention['position'][3] + 1,
                                   mention['position'][4] + 1))):
                    scluster_word += mention['text'].lower().strip().split(' ')
                    scluster_wpos += list(
                        range(mention['position'][3] + 1,
                              mention['position'][4] + 1))
                    scluster_spos += [
                        mention['position'][0] + 1
                        for _ in range(len(mention['text'].strip().split(' ')))
                    ]
                    scluster_word.append(split_token)
                    scluster_wpos.append(pad)
                    scluster_spos.append(pad)
                else:
                    sent_num = mention['position'][0]
                    word_start = mention['position'][3]
                    word_end = mention['position'][4]
                    # if word_end > 99:
                    #     word_end = 99
                    if sent_num > len(art_sent) - 1:
                        print('bad cluster')
                        continue
                    scluster_word += art_sent[sent_num][word_start:word_end]
                    scluster_wpos += list(range(word_start, word_end))
                    scluster_spos += [
                        mention['position'][0] + 1
                        for _ in range(word_start + 1, word_end + 1)
                    ]
                    scluster_word.append(split_token)
                    scluster_wpos.append(pad)
                    scluster_spos.append(pad)
            if scluster_word != []:
                scluster_word.pop()
                scluster_wpos.pop()
                scluster_spos.pop()
                cluster_words.append(scluster_word)
                cluster_wpos.append(scluster_wpos)
                cluster_spos.append(scluster_spos)
                if len(scluster_word) != len(scluster_wpos):
                    print(scluster_word)
                    print(scluster_wpos)
                    print('cluster:', cluster)
                if len(scluster_word) != len(scluster_spos):
                    print(scluster_word)
                    print(scluster_spos)
                    print('cluster:', cluster)
                assert len(scluster_word) == len(scluster_spos) and len(
                    scluster_spos) == len(scluster_wpos)
        new_clusters = (cluster_words, cluster_wpos, cluster_spos)
        inputs.append((art_sent, new_clusters))
    assert len(inputs) == len(abs_sents)
    return inputs, abs_sents
def coll(batch):
    art_batch, topics, abs_batch = unzip(batch)
    # art_batch, topics, abs_batch, topic_label = unzip(batch)
    art_sents = list(filter(bool, map(tokenize(None), art_batch)))
    abs_sents = list(filter(bool, map(tokenize(None), abs_batch)))
    return art_sents, topics, abs_sents
def coll(batch):
    articles, abstract, extracted = unzip(batch)
    articles = list(filter(bool, articles))
    abstract = list(filter(bool, abstract))
    extracted = list(filter(bool, extracted))
    return articles, abstract, extracted
def run(*options, cfg=None, debug=False):
    """Run testing of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options from default.py will be overridden by options loaded from cfg file
        Options passed in via options argument will override option loaded from cfg file

    Args:
        *options (str, int, optional): Options used to override what is loaded from the config.
            To see what options are available consult default.py
        cfg (str, optional): Location of config file to load. Defaults to None.
    """
    update_config(config, options=options, config_file=cfg)

    # Start logging
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)

    # Setup Augmentations
    test_aug = Compose(
        [
            Normalize(mean=(config.TRAIN.MEAN,), std=(config.TRAIN.STD,), max_pixel_value=config.TRAIN.MAX,),
            PadIfNeeded(
                min_height=config.TRAIN.PATCH_SIZE,
                min_width=config.TRAIN.PATCH_SIZE,
                border_mode=cv2.BORDER_CONSTANT,
                always_apply=True,
                mask_value=mask_value,
                value=0,
            ),
            Resize(
                config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT,
                config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH,
                always_apply=True,
            ),
            PadIfNeeded(
                min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT,
                min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH,
                border_mode=cv2.BORDER_CONSTANT,
                always_apply=True,
                mask_value=mask_value,
                value=0,
            ),
        ]
    )

    PenobscotDataset = get_patch_dataset(config)

    test_set = PenobscotDataset(
        config.DATASET.ROOT,
        config.TRAIN.PATCH_SIZE,
        config.TRAIN.STRIDE,
        split="test",
        transforms=test_aug,
        n_channels=config.MODEL.IN_CHANNELS,
        complete_patches_only=config.TEST.COMPLETE_PATCHES_ONLY,
    )

    logger.info(str(test_set))
    n_classes = test_set.n_classes

    test_loader = data.DataLoader(
        test_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS,
    )

    model = getattr(models, config.MODEL.NAME).get_seg_model(config)
    logger.info(f"Loading model {config.TEST.MODEL_PATH}")
    model.load_state_dict(torch.load(config.TEST.MODEL_PATH), strict=False)

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    model = model.to(device)  # Send to GPU

    try:
        output_dir = generate_path(config.OUTPUT_DIR, git_branch(), git_hash(), config.MODEL.NAME, current_datetime(),)
    except TypeError:
        output_dir = generate_path(config.OUTPUT_DIR, config.MODEL.NAME, current_datetime(),)

    summary_writer = create_summary_writer(log_dir=path.join(output_dir, config.LOG_DIR))

    # weights are inversely proportional to the frequency of the classes in
    # the training set
    class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False)

    criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=mask_value, reduction="mean")

    def _select_pred_and_mask(model_out_dict):
        return (model_out_dict["y_pred"].squeeze(), model_out_dict["mask"].squeeze())

    def _select_all(model_out_dict):
        return (
            model_out_dict["y_pred"].squeeze(),
            model_out_dict["mask"].squeeze(),
            model_out_dict["ids"],
            model_out_dict["patch_locations"],
        )

    inline_mean_iou = InlineMeanIoU(
        config.DATASET.INLINE_HEIGHT,
        config.DATASET.INLINE_WIDTH,
        config.TRAIN.PATCH_SIZE,
        n_classes,
        padding=_padding_from(config),
        scale=_scale_from(config),
        output_transform=_select_all,
    )

    evaluator = create_supervised_evaluator(
        model,
        _prepare_batch,
        metrics={
            "nll": Loss(criterion, output_transform=_select_pred_and_mask, device=device),
            "inIoU": inline_mean_iou,
            "pixa": pixelwise_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device),
            "cacc": class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device),
            "mca": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device),
            "ciou": class_iou(n_classes, output_transform=_select_pred_and_mask, device=device),
            "mIoU": mean_iou(n_classes, output_transform=_select_pred_and_mask, device=device),
        },
        device=device,
    )

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        logging_handlers.log_metrics(
            "Test results",
            metrics_dict={
                "nll": "Avg loss :",
                "mIoU": "Avg IoU :",
                "pixa": "Pixelwise Accuracy :",
                "mca": "Mean Class Accuracy :",
                "inIoU": "Mean Inline IoU :",
            },
        ),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        tensorboard_handlers.log_metrics(
            summary_writer,
            evaluator,
            "epoch",
            metrics_dict={"mIoU": "Test/IoU", "nll": "Test/Loss", "mca": "Test/MCA", "inIoU": "Test/MeanInlineIoU"},
        ),
    )

    def _select_max(pred_tensor):
        return pred_tensor.max(1)[1]

    def _tensor_to_numpy(pred_tensor):
        return pred_tensor.squeeze().cpu().numpy()

    transform_func = compose(
        np_to_tb, decode_segmap(n_classes=n_classes, label_colours=_SEG_COLOURS), _tensor_to_numpy,
    )

    transform_pred = compose(transform_func, _select_max)

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Test/Image", "image"),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        create_image_writer(summary_writer, "Test/Mask", "mask", transform_func=transform_func),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        create_image_writer(summary_writer, "Test/Pred", "y_pred", transform_func=transform_pred),
    )

    logger.info("Starting training")
    if debug:
        logger.info("Running in Debug/Test mode")
        test_loader = take(3, test_loader)

    evaluator.run(test_loader, max_epochs=1)

    # Log top N and bottom N inlines in terms of IoU to tensorboard
    inline_ious = inline_mean_iou.iou_per_inline()
    sorted_ious = sorted(inline_ious.items(), key=lambda x: x[1], reverse=True)
    topk = ((inline_mean_iou.predictions[key], inline_mean_iou.masks[key]) for key, iou in take(_TOP_K, sorted_ious))
    bottomk = (
        (inline_mean_iou.predictions[key], inline_mean_iou.masks[key]) for key, iou in tail(_BOTTOM_K, sorted_ious)
    )
    stack_and_decode = compose(transform_func, torch.stack)
    predictions, masks = unzip(chain(topk, bottomk))
    predictions_tensor = stack_and_decode(list(predictions))
    masks_tensor = stack_and_decode(list(masks))
    _log_tensor_to_tensorboard(predictions_tensor, "Test/InlinePredictions", summary_writer, evaluator)
    _log_tensor_to_tensorboard(masks_tensor, "Test/InlineMasks", summary_writer, evaluator)

    summary_writer.close()
data_p0_6 = map(lambda n: (n, precision_at(take(n, dataset), 0.6)),
                range(100, len(dataset) + 1, 100))
data_p0_7 = map(lambda n: (n, precision_at(take(n, dataset), 0.7)),
                range(100, len(dataset) + 1, 100))
data_p0_8 = map(lambda n: (n, precision_at(take(n, dataset), 0.8)),
                range(100, len(dataset) + 1, 100))
data_p0_85 = map(lambda n: (n, precision_at(take(n, dataset), 0.85)),
                 range(100, len(dataset) + 1, 100))

# Data for plotting
n_p0_6, p_p0_6 = unzip(data_p0_6)
n_p0_7, p_p0_7 = unzip(data_p0_7)
n_p0_8, p_p0_8 = unzip(data_p0_8)
n_p0_85, p_p0_85 = unzip(data_p0_85)

# Note that using plt.subplots below is equivalent to using
# fig = plt.figure and then ax = fig.add_subplot(111)
fig, ax = plt.subplots()
ax.plot(list(n_p0_6), list(p_p0_6), label='p >= 0.6')
ax.plot(list(n_p0_7), list(p_p0_7), label='p >= 0.7')
ax.plot(list(n_p0_8), list(p_p0_8), label='p >= 0.8')
ax.plot(list(n_p0_85), list(p_p0_85), label='p >= 0.85')
ax.set(xlabel='population size', ylabel='precision',
       title='precision / population size')