Example No. 1
def evaluate(config: Config) -> List[str]:
    data = load_pickle(file_path=config.data_path)
    results = _load_results(result_paths=config.result_paths)

    print('Assemble explanations, model weights, ...!')
    scores = _generate_empty_evaluation_results_dict()

    scores[A.global_based][A.method_names] = _get_method_names(
        results=results, sample_based=False)
    scores[A.sample_based][A.method_names] = _get_method_names(
        results=results, sample_based=True)
    scores[A.data_weights] = _get_data_weights(result=load_pickle(
        file_path=config.result_paths[0]))
    scores[A.model_accuracies] = _assemble_model_accuracies(results=results,
                                                            data=data)
    scores[A.model_weights] = _assemble_model_weights(
        results=results, weights=scores[A.data_weights])
    g, s = _assemble_explanations2(results=results, scores=scores)
    scores[A.global_based][A.explanations] = g
    scores[A.sample_based][A.explanations] = s

    print('Calculate scores!')
    pattern_type = int(extract_pattern_type(data_path=config.data_path))
    scores[A.global_based]['roc_auc'] = _assemble_results_roc_analysis(
        explanations=scores[A.global_based][A.explanations],
        weights=scores[A.data_weights],
        pattern_type=pattern_type)
    scores[A.global_based][
        'precision_based_scores'] = _assemble_results_precision_analysis(
            explanations=scores[A.global_based][A.explanations],
            weights=scores[A.data_weights],
            pattern_type=pattern_type)
    scores[A.sample_based]['roc_auc'] = _assemble_results_roc_analysis(
        explanations=scores[A.sample_based][A.explanations],
        weights=scores[A.data_weights],
        pattern_type=pattern_type)
    scores[A.sample_based][
        'precision_based_scores'] = _assemble_results_precision_analysis(
            explanations=scores[A.sample_based][A.explanations],
            weights=scores[A.data_weights],
            pattern_type=pattern_type)

    print('Save results!')
    output_paths = list()
    date = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    pattern_type = f'pattern_type_{extract_pattern_type(data_path=config.data_path)}'
    suffix = '_'.join(['evaluation', date, pattern_type])
    output_paths += [
        to_pickle(output_dir=config.output_dir_scores,
                  data=scores,
                  suffix=suffix)
    ]
    return output_paths
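Example No. 1 relies on project-local helpers load_pickle and to_pickle that the snippet does not show. Below is a minimal sketch of what such helpers could look like, assuming the signatures implied by the call sites (load_pickle(file_path=...) and to_pickle(output_dir=..., data=..., suffix=...) returning the written path); the real implementations may differ.

import os
import pickle
from typing import Any


def load_pickle(file_path: str) -> Any:
    # Deserialize and return the object stored in a pickle file.
    with open(file_path, 'rb') as f:
        return pickle.load(f)


def to_pickle(output_dir: str, data: Any, suffix: str) -> str:
    # Serialize `data` to <output_dir>/<suffix>.pkl and return the written path.
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'{suffix}.pkl')
    with open(output_path, 'wb') as f:
        pickle.dump(data, f)
    return output_path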
Example No. 2
def main():
    fpath = '../data/data_vary_signal_exact_2021-04-29-14-44-03_pattern_type_6.pkl'
    data = load_pickle(file_path=fpath)
    idx_experiment = 22
    idx_sample = 107
    for weight, data_list in data.items():
        print(weight)
        d = data_list[idx_experiment]
        sample = data_list[idx_experiment]['val']['x'][idx_sample, :]
        model = LogisticRegression(penalty='none',
                                   fit_intercept=False,
                                   max_iter=10,
                                   random_state=123)
        model.fit(X=d['train']['x'], y=d['train']['y'].flatten())
        pred_train = model.predict(d['train']['x'])
        pred_val = model.predict(d['val']['x'])
        print(
            f"Accuracy train: {accuracy_score(y_true=d['train']['y'].flatten(), y_pred=pred_train)}"
        )
        print(
            f"Accuracy val: {accuracy_score(y_true=d['val']['y'].flatten(), y_pred=pred_val)}"
        )
        sns.heatmap(sample.reshape((8, 8)), center=0.0)
        plt.show()
        label = data_list[idx_experiment]['val']['y'][idx_sample]
        print(
            f'Weight: {weight} Prediction: {model.predict(sample.reshape((1, 64)))} Label: {label}'
        )
Example No. 3
def main(input_path: str) -> None:
    config = Config.get(input_conf=load_json_file(file_path=input_path))
    data = load_pickle(file_path=config.data_path)
    np.random.seed(seed=config.seed)
    results = generate_empty_results_dict()
    results['method_names'] = config.method_names
    print(f'Input: {asdict(config)}')
    print('Run experiments!')
    for weights, data_list in data.items():
        results_per_weight = list()
        print(f'Run experiments for weights: {weights}')
        for data_item in tqdm(data_list):  # renamed to avoid shadowing the outer `data` dict
            results_per_weight += [main_experiment(data=data_item, config=config)]

        results['results'][weights] = results_per_weight
        date = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        suffix = '_'.join(
            ['results_agnostic_sample_based', date, f'_{weights}'])
        to_pickle(output_dir=config.output_dir,
                  data=results_per_weight,
                  suffix=suffix)

    date = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    pattern_type = f'pattern_type_{extract_pattern_type(data_path=config.data_path)}'
    suffix = '_'.join(['results_agnostic_sample_based', date, pattern_type])
    to_pickle(output_dir=config.output_dir, data=results, suffix=suffix)
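The function above also depends on a generate_empty_results_dict helper that is not shown. A hypothetical sketch, assuming it only needs the keys that main() writes into ('method_names' plus a nested 'results' mapping keyed by weight):

from typing import Any, Dict


def generate_empty_results_dict() -> Dict[str, Any]:
    # Skeleton filled in by main(): results['method_names'] = [...] and
    # results['results'][weights] = [per-experiment result dicts].
    return {'method_names': [], 'results': {}}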
Example No. 4
    def get_aug_examples(self, distance_path, aug_num, mode):
        new_examples = []
        old_examples = []
        old_train = self.get_train_examples()
        if mode == 'train':
            old_examples = old_train
        elif mode == 'dev':
            old_examples = self.get_dev_examples()
        elif mode == 'test':
            old_examples = self.get_test_examples()

        examples_train = load_pickle(distance_path)

        for i, ele in enumerate(old_examples):
            cur_train = {}
            cur_train['ori_sentence'] = ele
            cur_train['aux_sentences'] = []
            sort_list = examples_train[i]
            sort_id = 0
            sort_id_list = []
            while len(cur_train['aux_sentences']) < aug_num:
                sort_sentence_id = sort_list[sort_id]
                if old_train[sort_sentence_id]['id'] != old_examples[i][
                        'id']:  # different sentence
                    cur_train['aux_sentences'].append(
                        old_train[sort_sentence_id].copy())
                    sort_id_list.append(sort_list[sort_id])
                sort_id += 1
                if sort_id >= len(sort_list):
                    raise ValueError('Need more sentences id!')

            new_examples.append(cur_train)

        return new_examples
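A hypothetical usage sketch for get_aug_examples, matching how Example No. 9 below calls it; it assumes a processor and args like those in that example, and that the distance file holds, per example, a ranked list of nearest training-sentence indices:

aug_examples = processor.get_aug_examples(
    distance_path=args.data_dir / 'train_train.bin', aug_num=2, mode='train')
first = aug_examples[0]
# Each element pairs the original example with aug_num similar training sentences.
print(first['ori_sentence']['id'], len(first['aux_sentences']))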
Example No. 5
def module_process(ianadir, host, days, ipv6=False, bestonly=False):
        """
        Match BGP prefixes in IANA's directory and generate text
        outputs and stats that determine average active prefix counts
        and average de-aggregation for each RIR.

        :param IanaDirectory ianadir: IanaDirectory instance to match against
        :param str host: Host to take BGP feeds from
        :param days: List of days to analyze
        :param bool ipv6: IPv6 flag
        :param bool bestonly: Take only best BGP paths into account
        """

        timeline=[]
        timelineavg=[]

        for t in days:
                rirpfxlens={}
                ifn = bgp.bgpdump_pickle(t, host, ipv6)
                if not ifn:
                        continue
                bgpdump=common.load_pickle(ifn)
                common.d("ianaspace.module_run: matching prefixes in a tree (%d)"%len(bgpdump))

                for pv in bgpdump:
                        if bestonly and not (pv[0] and '>' in pv[0]):
                                continue

                        net = ipaddr.IPNetwork(pv[1])
                        r=ianadir.resolve_network(net)
                        if not r:
                                common.w("No IANA assignment for", str(pv[1]))
                                continue
                        name=r[2]
                        if r[1] == 'LEGACY' and not name in RIRS:
                                name='LEGACY'
                        if not name in rirpfxlens:
                                rirpfxlens[name]=[]
                        rirpfxlens[name].append(net.prefixlen)
                timeline.append([str(t)]+[len(rirpfxlens[n]) for n in RIRS])
                timelineavg.append([str(t)]+[(reduce(lambda x, y: x + y, rirpfxlens[n])/
                                             float(len(rirpfxlens[n]))) for n in RIRS])

                outtxt = '%s/rirstats%d-%s.txt'%(common.resultdir(t), (6 if ipv6 else 4), host)
                common.d("Generating output RIR stats text "+outtxt)
                with open(outtxt,'w') as f:
                        for i,k in enumerate(RIRS):
                                f.write('%s: %d (avg pfxlen: %.2f)\n'%(str(k), timeline[-1][1+i],
                                                                       round(timelineavg[-1][1+i], 2)))

        if timeline:
                outgraph = '%s/rirpfxcount%d-%s'%(common.resultdir(), (6 if ipv6 else 4), host)
                common.d("Generating output RIR pfxcount graph with prefix "+outgraph)
                graph.gen_multilineplot(timeline, outgraph, legend=RIRS, ylabel='Pfx count')

        if timelineavg:
                outgraph = '%s/rirpfxlen%d-%s'%(common.resultdir(), (6 if ipv6 else 4), host)
                common.d("Generating output RIR pfxlen graph with prefix "+outgraph)
                graph.gen_multilineplot(timelineavg, outgraph, legend=RIRS, ylabel='Avg pfx len')
Example No. 6
def plot(config: Config, score_paths: List[str]) -> None:
    rnd_state = np.random.default_rng(config.seed)
    idx = rnd_state.integers(low=0, high=100)
    rnd_sample_idx = 107
    scores = load_pickle(file_path=score_paths[0])
    data = load_pickle(file_path=config.data_path)
    data_dict = get_randomized_heat_map_data(scores=scores, data=data, rnd_idx=idx)

    print(f'Create plots!')
    overview_correlation_plot(scores=scores, config=config)
    overall_accuracy_plot(scores=scores, config=config)

    print(f'Create rain cloud plots!')
    rain_clouds(scores=scores[A.sample_based], config=config, mode='sample_based',
                score_data_keys=[('roc_auc', 'auc'),
                                 # ('precision_based_scores', 'pr_auc'),
                                 ('precision_based_scores', 'max_precision'),
                                 ('precision_based_scores', 'avg_precision')])
    rain_clouds(scores=scores[A.global_based], config=config, mode='global',
                score_data_keys=[('roc_auc', 'auc'),
                                 # ('precision_based_scores', 'pr_auc'),
                                 ('precision_based_scores', 'max_precision'),
                                 ('precision_based_scores', 'avg_precision')])
    print(f'Create box plots!')
    box_plot(scores=scores[A.sample_based], config=config, mode='sample_based',
             snrs_of_interest=['0.00', '0.04', '0.08'],
             score_data_keys=[('roc_auc', 'auc'),
                              # ('precision_based_scores', 'pr_auc'),
                              ('precision_based_scores', 'max_precision'),
                              # ('precision_based_scores', 'avg_precision')
                              ])
    box_plot(scores=scores[A.global_based], config=config, mode='global',
             snrs_of_interest=['0.00', '0.04', '0.08'],
             score_data_keys=[('roc_auc', 'auc'),
                              # ('precision_based_scores', 'pr_auc'),
                              ('precision_based_scores', 'max_precision'),
                              # ('precision_based_scores', 'avg_precision')
                              ])

    print(f'Create heat maps!')
    pattern_type = int(extract_pattern_type(data_path=config.data_path))
    global_heat_maps(scores=scores, config=config, rnd_experiment_idx=idx,
                     pattern_type=pattern_type, snrs_of_interest=['0.00', '0.04', '0.08'])
    sample_based_heat_maps(scores=scores, config=config, data=data_dict,
                           rnd_sample_idx=rnd_sample_idx, pattern_type=pattern_type,
                           snrs_of_interest=['0.00', '0.04', '0.08'])
Example No. 7
 def load_from_file(self, file_path):
     '''
     Load the vocab mappings from a pickle file.
     :param file_path: path to the pickled {'idx2word', 'word2idx'} mappings
     :return:
     '''
     mappings = load_pickle(input_file=file_path)
     self.idx2word = mappings['idx2word']
     self.word2idx = mappings['word2idx']
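A matching save method is not part of the example; here is a minimal sketch using the standard pickle module that writes the same {'idx2word', 'word2idx'} mapping that load_from_file expects (the project's own save helper may look different).

 def save_to_file(self, file_path):
     # Persist both mappings so that load_from_file can restore the vocab.
     import pickle
     mappings = {'idx2word': self.idx2word, 'word2idx': self.word2idx}
     with open(file_path, 'wb') as f:
         pickle.dump(mappings, f)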
Example No. 8
def main():
    # fpath = '../data/data_vary_signal_exact_2021-01-18-16-07-37.pkl'
    fpath = '../data/data_vary_signal_exact_2021-02-23-12-45-08.pkl'
    # fpath = '../data/data_vary_signal_exact_2021-02-01-11-36-15.pkl'
    data = load_pickle(file_path=fpath)
    idx_experiment = 22
    new_data = dict()
    for weight, data_list in data.items():
        new_data['w' + 'd'.join(weight.split('.'))] = data_list[idx_experiment]
    savemat(file_name='data_vary_signal_exact_2021-02-23-12-45-08.mat',
            mdict=new_data)
Example No. 9
def load_and_cache_examples(args, processor, data_type='train'):
    # Load data features from cache or dataset file
    cached_examples_file = args.data_dir / 'cached_crf-{}_{}_{}'.format(
        data_type,
        args.arch,  # architecture
        str(args.task_name))
    if cached_examples_file.exists():
        logger.info("Loading features from cached file %s", cached_examples_file)
        examples = load_pickle(cached_examples_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        if data_type == 'train':
            examples = processor.get_aug_examples(
                args.data_dir / 'train_train.bin', args.aug_num, data_type)
        elif data_type == 'dev':
            examples = processor.get_aug_examples(
                args.data_dir / 'train_dev.bin', args.aug_num, data_type)
        logger.info("Saving features into cached file %s", cached_examples_file)
        save_pickle(examples, str(cached_examples_file))
    return examples
Example No. 10
def gen_bgpdump_pickle(infile,outfile,ipv6=False):
    """ Read Cisco show ip bgp output captured in a infile
    and generate outfile (pickle that contains list of tuples
    that parse_cisco_bgp_file returns).

    infile: input filename (preferably the full path to the BGP text file)
    outfile: out filename
    ipv6: IPv6 indicator (needed for prefix normalization)
    """

    if os.path.isfile(outfile):
        return common.load_pickle(outfile)
    
    o=list(parse_cisco_bgp_file(infile, ipv6))

    common.save_pickle(o, outfile)

    return o
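gen_bgpdump_pickle caches its result: if outfile already exists it is loaded instead of re-parsing the Cisco dump. A hypothetical call (the paths are placeholders):

# First call parses the text dump and writes the pickle; later calls with the
# same outfile simply load the cached pickle.
routes = gen_bgpdump_pickle('bgp/show-ip-bgp.txt', 'bgp/show-ip-bgp.pickle', ipv6=False)
print(len(routes))  # number of parsed route tuples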
Example No. 11
def create_path_matrix(host, days, ipv6=False):
    """ Generate matrix: [t:buckets,...] where buckets (r) contains
    r[16]=[x,y,z,...]; x, y, z are ints. It means that there were
    prefixes with netmask /16: one with AS-path length x, another y, ...
    """
    bucket_matrix={}

    for t in days:
        bgpfile=bgpdump_pickle(t, host, ipv6)
        if not bgpfile:
            common.d("bgp.create_path_matrix skipping time "+str(t)+"...")
            continue

        common.d("bgp.create_path_matrix processing time "+str(t)+"...")

        bgpdump=common.load_pickle(bgpfile)
        bucket_matrix[t]=gen_buckets(bgpdump, ipv6, bestonly=True)

    return bucket_matrix
Example No. 12
def gen_bgpdump_pickle(infile,outfile,ipv6=False):
    """ Read Cisco show ip bgp output captured in a infile
    and generate outfile (pickle that contains list of tuples
    that parse_cisco_bgp_file returns).

    :param str infile: Input filename (preferably the full path to the BGP text file)
    :param str outfile: Output filename
    :param bool ipv6: IPv6 indicator (needed for prefix normalization)
    :returns: The parsed cisco bgp output either from pickle or from the primary source
    """

    if os.path.isfile(outfile):
        return common.load_pickle(outfile)
    
    o=list(parse_cisco_bgp_file(infile, ipv6))

    common.save_pickle(o, outfile)

    return o
Example No. 13
def gen_bgpdump_pickle(infile, outfile, ipv6=False):
    """ Read Cisco show ip bgp output captured in a infile
    and generate outfile (pickle that contains list of tuples
    that parse_cisco_bgp_file returns).

    :param str infile: Input filename (preferably the full path to the BGP text file)
    :param str outfile: Output filename
    :param bool ipv6: IPv6 indicator (needed for prefix normalization)
    :returns: The parsed cisco bgp output either from pickle or from the primary source
    """

    if os.path.isfile(outfile):
        return common.load_pickle(outfile)

    o = list(parse_cisco_bgp_file(infile, ipv6))

    common.save_pickle(o, outfile)

    return o
Example No. 14
def match():
    import deepmatching_wrapper as dm
    import cv2

    candidate_matching_database = common.load_pickle(Path("temp/candidate_matching_database.pickle"))

    common.prepare_clean_dir(Path("output/"))
    common.prepare_clean_dir(Path("output/images/"))

    output = {}
    for query_file, candidates in candidate_matching_database.items():
        query_name = Path(query_file).stem
        matching_result = []
        for target_class_name, target_images in candidates.items():
            for i, (target_path, similarity) in enumerate(target_images):
                print("Matching", query_file, "with target image", target_path)

                matches, name1, name2, qw, qh, tw, th, img1, img2 = dm.match(query_file, target_path)
                src_pts = np.float32([[m[0], m[1]] for m in matches])
                dst_pts = np.float32([[m[2], m[3]] for m in matches])

                inlier_count = 0  # renamed from `i` to avoid clobbering the enumerate index above
                inlier = []

                M, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, setting.RANSAC_THRESHOLD)
                for index, m in enumerate(mask):
                    if np.isclose(m, 1):
                        inlier_count += 1
                        inlier.append(matches[index])

                output_name = "%s_%s_%02d.jpg" % (query_name, target_class_name, inlier_count)
                dm.draw(img1, img2, inlier, Path("output/images/") / output_name)

                matching_result.append({
                    "class_name": target_class_name,
                    "inlier": len(inlier)
                })
        output[query_file.name] = sorted(matching_result, key=lambda x: x["inlier"], reverse=True)

    common.write_json(Path("output/result.json"), output)
Example No. 15
def overview_rain_cloud_plot(paths: List[str], config: Config,
                             score_data_key: str, metric_name: str):
    df = pd.DataFrame()
    for score_path in paths:
        scores = load_pickle(file_path=score_path)
        aux_df = create_rain_cloud_data(data=scores[score_data_key], metric_name=metric_name)
        aux_df = add_column_for_class_of_explanation_method(data=aux_df)
        df = pd.concat([df, aux_df], ignore_index=True)  # DataFrame.append was removed in pandas 2.0

    sigma = .5
    sns.set_theme('paper')
    sns.set(font_scale=1)
    with sns.axes_style("whitegrid"):
        g = sns.FacetGrid(df, row='class', col='SNR', height=6, ylim=(0, 1.05))
        g.map_dataframe(pt.RainCloud, x='Method', y=metric_name, data=df,
                        orient='v', bw=sigma, width_viol=.0)
        for ax in g.axes.flat:
            labels = ax.get_xticklabels()
            ax.set_xticklabels(labels, rotation=20)
        g.fig.subplots_adjust(bottom=0.15)

    file_name = '_'.join(['rain_cloud_plot', 'overview', metric_name, '.png'])
    output_path = join(config.output_dir_plots, file_name)
    save_figure(file_path=output_path, fig=g.fig, dpi=config.dpi)
Example No. 16
def create_path_matrix(host, days, ipv6=False):
    """ Generate matrix: [t:buckets,...] where buckets (r) contains
    r[16]=[x,y,z,...]; x, y, z are ints. It means that there were
    prefixes with netmask /16: one with AS-path length x, another y, ...

    :param str host: Host name to analyze
    :param days: List of Day obj. to analyze
    :param bool ipv6: IPv6 flag
    :returns: Bucket matrix
    """
    bucket_matrix={}

    for t in days:
        bgpfile=bgpdump_pickle(t, host, ipv6)
        if not bgpfile:
            common.d("bgp.create_path_matrix skipping time "+str(t)+"...")
            continue

        common.d("bgp.create_path_matrix processing time "+str(t)+"...")

        bgpdump=common.load_pickle(bgpfile)
        bucket_matrix[t]=gen_buckets(bgpdump, ipv6, bestonly=True)

    return bucket_matrix
Example No. 17
 def load_categories(self, fpath):
     logging.info("Loading categories")
     self.categories = common.load_pickle(fpath)
Example No. 18
def load_x(filename):
    return common.load_pickle(filename)
Example No. 19
def init():
    global vectorizer, km
    vectorizer = common.load_pickle('vectorizer.pickle')
    km = common.load_pickle('km.pickle')
    logger.info('Initialized')
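A short usage sketch for the two objects loaded by init(); it assumes vectorizer is a fitted scikit-learn text vectorizer and km a fitted KMeans model, which the pickle names suggest but the example does not confirm.

def predict_cluster(text):
    # Vectorize the input text and return the id of its nearest cluster.
    features = vectorizer.transform([text])
    return int(km.predict(features)[0])


init()
print(predict_cluster('example document'))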
Example No. 20
def predict(args, processor):
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    config = config_model(args)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, NERModel, args.output_dir, config, logger)
        test_data = []
        with open(str(args.data_dir / "test.json"), 'r') as f:
            idx = 0
            for line in f:
                tokens = []
                json_d = {}
                line = json.loads(line.strip())
                textlist = list(line['text'])
                for i, word in enumerate(textlist):
                    token = tokenizer.tokenize(word)
                    assert len(token) == 1
                    tokens.extend(token)
                assert len(tokens) < args.max_seq_len
                ntokens = []
                segment_ids = []
                label_ids = []
                ntokens.append("[CLS]")  # 句子开始设置CLS 标志
                segment_ids.append(0)
                for i, token in enumerate(tokens):
                    ntokens.append(token)
                    segment_ids.append(0)
                ntokens.append("[SEP]")
                segment_ids.append(0)
                # append("O") or append("[SEP]") not sure!
                input_ids = tokenizer.convert_tokens_to_ids(ntokens)
                input_len = len(input_ids)
                input_mask = [1] * len(input_ids)
                while len(input_ids) < args.max_seq_len:
                    input_ids.append(0)
                    input_mask.append(0)
                    segment_ids.append(0)
                raw_text = []
                raw_text.append('[CLS]')
                raw_text.extend(textlist)
                raw_text.append('[SEP]')
                assert len(raw_text) == len(ntokens)
                assert len(input_ids) == args.max_seq_len
                assert len(input_mask) == args.max_seq_len
                assert len(segment_ids) == args.max_seq_len

                json_d['id'] = idx
                json_d['input_ids'] = input_ids
                json_d['input_mask'] = input_mask
                json_d['segment_ids'] = segment_ids
                json_d['input_len'] = input_len
                json_d['text'] = raw_text
                idx += 1
                test_data.append(json_d)
        results = []
        train_data = processor.get_train_examples()
        test_train = load_pickle(args.data_dir / 'train_test.bin')
        for step, line in enumerate(test_data):
            a_input_ids = []
            a_input_mask = []
            a_label_ids = []
            a_input_lens = []
            a_segment_ids = []
            aux_sentence = [
                train_data[i] for i in test_train[step][:args.aug_num]
            ]
            for s in aux_sentence:
                a_input_ids.append(s['input_ids'])
                #                 a_label_ids.append(s['label_ids'])
                # address-information augmentation: replace all label info with the address label (all ones)
                a_label_ids.append(s['input_mask'])
                a_input_mask.append(s['input_mask'])
                a_input_lens.append(s['input_len'])
                a_segment_ids.append(s['segment_ids'])
            input_ids = line['input_ids']
            input_mask = line['input_mask']
            input_lens = line['input_len']
            segment_ids = line['segment_ids']
            batch = {
                'ori':
                ([input_ids], [input_mask], [[]], [input_lens], [segment_ids]),
                'aug': ([a_input_ids], [a_input_mask], [a_label_ids],
                        [a_input_lens], [a_segment_ids])
            }
            tags = model.evaluate_line(sess, batch)
            label_entities = get_entities(tags[0], args.id2label)
            json_d = {}
            json_d['id'] = step
            tags[0] = [args.id2label[idx] for idx in tags[0]]
            json_d['tag_seq'] = " ".join(tags[0])
            json_d['entities'] = label_entities
            results.append(json_d)
        print(" ")
        output_predic_file = str(args.output_dir / "test_prediction.json")
        output_submit_file = str(args.output_dir / "cluener_submit.json")
        with open(output_predic_file, "w") as writer:
            for record in results:
                writer.write(json.dumps(record) + '\n')
        test_text = []

        test_submit = []
        for x, y in zip(test_data, results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            # markers ([CLS]/[SEP]) were added to the text
            words = x['text']
            if len(entities) != 0:
                for subject in entities:
                    tag = subject[0]
                    start = subject[1]
                    end = subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
Example No. 21
def _load_results(result_paths: List[str]) -> List:
    output = list()
    for p in result_paths:
        output += [load_pickle(file_path=p)]
    return output
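A minimal usage sketch for _load_results, matching how Example No. 1 calls it (the paths below are placeholders):

result_paths = ['results/run_a.pkl', 'results/run_b.pkl']
results = _load_results(result_paths=result_paths)
print(len(results))  # one loaded object per input path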