Example #1
    def save(self, file_path):
        '''
        Save the vocab (word2idx / idx2word mappings) to a pickle file.
        :param file_path: destination path for the pickle file
        :return:
        '''
        mappings = {"word2idx": self.word2idx, 'idx2word': self.idx2word}
        save_pickle(data=mappings, file_path=file_path)
Example #2
def gen_candidate_database():
    """Extract features for the query and target images, rank the targets by
    cosine similarity for each query, and save the resulting candidate
    matching database to temp/candidate_matching_database.pickle."""
    from imagefeature import ImageFeature
    print("Candidate Matching Database Generation Start")

    common.prepare_clean_dir(Path("temp/"))

    IF = ImageFeature()

    query_features, query_pathes, orbs = [], [], []
    for img_file in sorted(Path("input/query/").glob("*")):
        query_pathes.append(img_file)
        x = IF.get_feature(img_file)
        query_features.append(x)
        print("Extracting Query Feature", img_file)

    target_class_names, target_features, target_pathes = [], [], []
    for folder in sorted(Path("input/target/").glob("*")):
        class_name = folder.stem
        for img_file in sorted(Path("input/target/%s/" % class_name).glob("*")):
            target_pathes.append(img_file)
            target_class_names.append(class_name)
            feature = IF.get_feature(img_file)
            target_features.append(feature)
            print("Extracting Target Feature", img_file)

    print("Calculating Similarities...")
    sims = cosine_similarity(query_features, target_features)

    candidate_matching_database = {}

    for query_index, row in enumerate(sims):

        query_file = query_pathes[query_index]
        candidate_matching_database[query_file] = {}

        args = np.argsort(row)
        args = args[::-1]

        for arg in args:
            target_path = target_pathes[arg]
            target_class_name = target_class_names[arg]

            if target_class_name not in candidate_matching_database[query_file]:
                candidate_matching_database[query_file][target_class_name] = []

            if len(candidate_matching_database[query_file][target_class_name]) < setting.MAX_NUMBER_ONE_CLASS:
                candidate_matching_database[query_file][target_class_name].append((target_path, row[arg]))

    common.save_pickle(Path("temp/candidate_matching_database.pickle"), candidate_matching_database)

    print("Candidate Matching Database Generation Finish")
Example #3
def analyze(n_preview=10):
    global vectorizer, km
    # Encode:
    logger.info('Encoding...')
    vectorizer = TfidfVectorizer(max_df=0.5,
                                 max_features=common.n_features,
                                 min_df=2,
                                 stop_words='english')
    common.X = vectorizer.fit_transform(common.doc_texts)
    common.save_pickle(vectorizer, 'vectorizer.pickle')
    common.vocab = np.array(vectorizer.get_feature_names())
    logger.info(f'X: {common.X.shape}')
    common.save_encoded_vocab()

    logger.info('Clustering...')
    # km = MiniBatchKMeans(n_clusters=common.n_topics, init=init_centroids(), init_size=1000, batch_size=1000,
    #                      verbose=0, random_state=common.random_seed)
    # km = MiniBatchKMeans(n_clusters=common.n_topics, verbose=1, random_state=1)
    km = KMeans(n_clusters=common.n_topics,
                init=init_centroids(),
                max_iter=3,
                verbose=1,
                random_state=2)

    # Analyze:
    common.doc_topics = km.fit_transform(common.X)  # the smaller, the closer
    common.doc_topics_reduced = np.argmin(common.doc_topics, axis=1)
    common.topics = km.cluster_centers_
    common.save_pickle(km, 'km.pickle')
    logger.info(f'doc_topics: {common.doc_topics.shape}')
    logger.info(f'topics: {common.topics.shape}')
    print()

    print('----------------')
    for i, topic_dist in enumerate(common.topics):
        top_words = common.vocab[np.argsort(topic_dist)[-10:][::-1]]
        print(f"Topic {i}: {' '.join(top_words)}")

    print()
    print('----------------')

    for i in range(n_preview):
        print(
            f'Article {i} (topic: {common.doc_topics_reduced[i]}), {common.doc_titles[i]}'
        )

    print()
    common.save_analyze_result()
Example #4
def load_and_cache_examples(args, processor, data_type='train'):
    # Load data features from cache or dataset file
    cached_examples_file = args.data_dir / 'cached_crf-{}_{}_{}'.format(
        data_type,
        args.arch,  # architecture
        str(args.task_name))
    if cached_examples_file.exists():
        logger.info("Loading features from cached file %s", cached_examples_file)
        examples = load_pickle(cached_examples_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        if data_type == 'train':
            examples = processor.get_aug_examples(args.data_dir / 'train_train.bin', args.aug_num, data_type)
        elif data_type == 'dev':
            examples = processor.get_aug_examples(args.data_dir / 'train_dev.bin', args.aug_num, data_type)
        logger.info("Saving features into cached file %s", cached_examples_file)
        save_pickle(examples, str(cached_examples_file))
    return examples
Example #5
def gen_bgpdump_pickle(infile, outfile, ipv6=False):
    """ Read Cisco show ip bgp output captured in infile
    and generate outfile (a pickle that contains the list of tuples
    that parse_cisco_bgp_file returns).

    infile: input filename (preferably the full path to the BGP text file)
    outfile: output filename
    ipv6: IPv6 indicator (needed for prefix normalization)
    """

    if os.path.isfile(outfile):
        return common.load_pickle(outfile)

    o = list(parse_cisco_bgp_file(infile, ipv6))

    common.save_pickle(o, outfile)

    return o
Example #6
def gen_bgpdump_pickle(infile, outfile, ipv6=False):
    """ Read Cisco show ip bgp output captured in a infile
    and generate outfile (pickle that contains list of tuples
    that parse_cisco_bgp_file returns).

    :param str infile: Input filename (prefferably full path to the BGP text file)
    :param str outfile: Output filename
    :param bool ipv6: IPv6 indicator (needed for prefix normalization)
    :returns: The parsed cisco bgp output either from pickle or from the primary source
    """

    if os.path.isfile(outfile):
        return common.load_pickle(outfile)

    o = list(parse_cisco_bgp_file(infile, ipv6))

    common.save_pickle(o, outfile)

    return o
Example #7
File: cisco.py  Project: tmshlvck/bgpcrunch
def gen_bgpdump_pickle(infile, outfile, ipv6=False):
    """ Read Cisco show ip bgp output captured in infile
    and generate outfile (a pickle that contains the list of tuples
    that parse_cisco_bgp_file returns).

    :param str infile: Input filename (preferably the full path to the BGP text file)
    :param str outfile: Output filename
    :param bool ipv6: IPv6 indicator (needed for prefix normalization)
    :returns: The parsed cisco bgp output either from pickle or from the primary source
    """

    if os.path.isfile(outfile):
        return common.load_pickle(outfile)
    
    o = list(parse_cisco_bgp_file(infile, ipv6))

    common.save_pickle(o, outfile)

    return o
Example #8
    def get_categories(self, save_path="", cache=True):
        """
        Find categories and return a dictionary with each category information
        """
        if (cache is True) and (self.categories is not None):
            logging.info("using cached categories dictionary...")
            return self.categories

        url = self.url["category"]
        page = common.get_page(url)
        common.sleep_random_between(1, 2)

        cat_container = page.find_all("div", class_="categories__container")

        cat = {}

        if len(cat_container) == 0:
            logging.info(
                "category container is empty, returning empty dictionary...")
            return cat

        for c in cat_container:
            name = c.h2.text
            link = c.h2.a["href"]
            cat_id = re.findall(self.patt["cat_id"], link)
            if len(cat_id) == 0:
                logging.info("could not find category id, passing...")
                continue
            cat_id = cat_id[0]

            sub = self.get_sub_cats(link)

            cat[cat_id] = {"name": name, "link": link, "sub": sub}

        if len(cat) != 0 and save_path != "":
            common.save_pickle(save_path, cat)

        self.categories = cat
        return cat
Example #9
def main(unused_argv):
    pp = pprint.PrettyPrinter(indent=2, compact=True)

    # Load training and eval data
    (train_x, train_y), (test_x, test_y) = common.load_original_mnist()

    print(train_x.shape, train_x.dtype, train_y.shape, train_y.dtype)
    print(test_x.shape, test_x.dtype, test_y.shape, test_y.dtype)

    def train_model(classifier, log_stats=True):
        start_time = time.time()

        # Train the model
        # profiler_hook = tf.train.ProfilerHook(save_steps=50, output_dir=MODEL_DIR + '/train')

        train_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"x": train_x},
            y=train_y,
            batch_size=TRAINING_BATCH_SIZE,
            num_epochs=None,
            shuffle=True)
        classifier.train(
            input_fn=train_input_fn,
            steps=TRAINING_STEPS,
            # hooks=[profiler_hook]
        )
        duration = round(time.time() - start_time, 3)

        if log_stats:
            print("Training duration: " + common.duration_to_string(duration))

        return duration

    def eval_model(classifier, log_stats=True):
        start_time = time.time()

        tensors_to_log = {
            # "probabilities": "softmax_tensor",
            "pred": "diff"
        }
        logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                                  every_n_iter=1)

        eval_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"x": test_x},
            y=test_y,
            batch_size=EVAL_BATCH_SIZE,
            shuffle=False)
        result = classifier.evaluate(
            input_fn=eval_input_fn,
            steps=EVAL_STEPS,
            # hooks=[logging_hook]
        )
        duration = round(time.time() - start_time, 3)

        if log_stats:
            print("Training duration: " + common.duration_to_string(duration))
            print("Eval result:", result)

        return result, duration

    model_stats_map = {}
    for conf_name, config in model_configs.items():

        # if config["skip"]:
        #     continue

        print("RUN CONFIG: %s" % conf_name)
        model_dir = os.path.join(MODEL_DIR, conf_name)

        # common.clean_dir(model_dir)

        mnist_classifier = tf.estimator.Estimator(model_fn=model_fn,
                                                  model_dir=model_dir,
                                                  params=config)

        eval_results = []
        total_train_duration = 0
        total_eval_duration = 0
        for _ in range(TRAINING_EPOCHS):
            # train_duration = train_model(mnist_classifier)
            # total_train_duration += train_duration

            eval_result, eval_duration = eval_model(mnist_classifier)
            eval_results.append(eval_result)
            total_eval_duration += eval_duration

        final_result = common.get_final_eval_result(eval_results)

        print("Eval results:")
        pp.pprint(eval_results)
        model_stats_map[conf_name] = {
            "model_details": model_details,
            "final_result": final_result,
            "total_train_duration":
            common.duration_to_string(total_train_duration),
            "total_eval_duration":
            common.duration_to_string(total_eval_duration),
        }
        common.save_pickle(
            model_stats_map[conf_name],
            os.path.join(model_details["model_dir"], "last_result.pkl"))
        common.save_json(
            model_stats_map[conf_name],
            os.path.join(model_details["model_dir"], "last_result.json"))

        print("Total training duration: " +
              common.duration_to_string(total_train_duration))
        print("Total eval duration: " +
              common.duration_to_string(total_eval_duration))

    print("Models results:")
    pp.pprint(model_stats_map)
Example #10
def save_records(X, X_filename):
    common.save_pickle(X_filename, X)
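
The examples above call save_pickle / common.save_pickle with differing argument orders (path first in some projects, object first in others), so the helper is project-specific rather than a standard-library function. Below is a minimal, hypothetical sketch of what such a common.save_pickle / common.load_pickle pair might look like; the path-first signature and the directory-creation behavior are assumptions for illustration, not the implementation used by any of the projects above.

# Hypothetical sketch only: the real common.save_pickle in the projects above
# may use a different argument order (several examples pass the object first).
import pickle
from pathlib import Path


def save_pickle(file_path, obj):
    """Serialize obj to file_path with pickle, creating parent directories."""
    file_path = Path(file_path)
    file_path.parent.mkdir(parents=True, exist_ok=True)
    with file_path.open("wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_pickle(file_path):
    """Load and return the object previously stored with save_pickle."""
    with Path(file_path).open("rb") as f:
        return pickle.load(f)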