Example #1
def eval_online(options):
    global features_matrix, labels_matrix, LABEL_SIZE
    cluster_dir = os.path.split(options.cluster_path)[0]
    if not utils.check_rebuild(cluster_dir,
                               descrip='cluster',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(cluster_dir):
        os.makedirs(cluster_dir)

    logger.info('eval case: cluster...')
    logger.info('\t save_path: {}'.format(options.cluster_path))
    logger.info('\t cluster: kmeans')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeat {} times'.format(options.repeated_times))
    logger.info('\t total labels size: {}'.format(options.label_size))

    if options.eval_workers > 1 and options.repeated_times > 1:
        # speed up by using multiple processes
        logger.info("\t allocating repeat_times to workers ...")
        if options.repeated_times <= options.eval_workers:
            times_per_worker = [1 for _ in range(options.repeated_times)]
        else:
            div, mod = divmod(options.repeated_times, options.eval_workers)
            times_per_worker = [div for _ in range(options.eval_workers)]
            for idx in range(mod):
                times_per_worker[idx] = times_per_worker[idx] + 1
        assert sum(
            times_per_worker
        ) == options.repeated_times, 'worker allocation failed: %d != %d' % (
            sum(times_per_worker), options.repeated_times)

        logger.info("\t using {} processes for evaling:".format(
            len(times_per_worker)))
        for idx, rep_times in enumerate(times_per_worker):
            logger.info("\t process-{}: repeat {} times".format(
                idx, rep_times))

    fr_total = open(options.cluster_path, 'w')
    fr_total.write('eval case: cluster...\n')
    fr_total.write('\t save_dir: {}\n'.format(cluster_dir))
    fr_total.write('\t cluster: kmeans\n')
    fr_total.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
    fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
    fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr_total.write('\t repeat {} times\n'.format(options.repeated_times))
    fr_total.write('\t total labels size: {}\n'.format(options.label_size))
    fr_total.write(
        '\t results(NMI):\n=============================================================\n'
    )
    fr_total.write('finish_time\tckpt\tNMI\n')

    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(
        options.label_path, multilabel_rule=options.multilabel_rule)
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))

    last_step = 0
    summary_writer = tf.summary.FileWriter(cluster_dir, tf.Graph())
    summary = tf.Summary()
    summary.value.add(tag='nmi', simple_value=0.)
    summary_writer.add_summary(summary, last_step)

    best_nmi = 0

    ckpt_dir = os.path.join(os.path.split(options.vectors_path)[0], 'ckpt')
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    while (not (ckpt and ckpt.model_checkpoint_path)):
        logger.info("\t model and vectors not exist, waiting ...")
        time.sleep(options.eval_interval)
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)

    reading = options.vectors_path + ".reading_cluster"
    writing = options.vectors_path + ".writing"
    while (options.eval_online):
        while True:
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                if os.path.exists(
                        os.path.join(
                            os.path.split(options.vectors_path)[0],
                            "RUN_SUCCESS")):
                    return
                time.sleep(options.eval_interval)
                continue
            # ready for reading
            logger.info("\t declare for reading ...")
            open(reading, "w")  # declare
            time.sleep(30)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue

            break
        logger.info("\t eval ckpt-{}.......".format(cur_step))
        time_start = time.time()
        logger.info('\t reading embedding vectors from file {}'.format(
            options.vectors_path))
        features_matrix, labels_list = utils.get_vectors(
            utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
        os.remove(reading)  # synchrolock for multi-process
        logger.info("\t done for reading ...")
        labels_matrix = np.array([item[0] for item in labels_list])
        LABEL_SIZE = options.label_size
        logger.info(
            '\t reading embedding vectors completed in {}s'.format(
                time.time() - time_start))
        logger.info('\t total labeled data size: {}'.format(
            np.size(features_matrix, axis=0)))
        logger.info('\t total labels size: {}'.format(options.label_size))

        # cluster
        fr = open(options.cluster_path + '.{}'.format(cur_step), 'w')
        fr.write('eval case: cluster...\n')
        fr.write('\t cluster: kmeans\n')
        fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write('\t repeat {} times\n'.format(options.repeated_times))
        fr.write('\t total labeled data size: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(i,
                                                  np.sum(labels_matrix == i)))

        if options.eval_workers > 1 and options.repeated_times > 1:
            # speed up by using multiple processes
            fr.write("\t using {} processes for evaluating:\n".format(
                len(times_per_worker)))
            for idx, rep_times in enumerate(times_per_worker):
                fr.write("\t process-{}: repeat {} times\n".format(
                    idx, rep_times))

            try:
                nmi_list = []
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_cluster_thread_body,
                                            times_per_worker):
                        nmi_list.extend(ret)
        except Exception:  # retry once if a worker process failed
                nmi_list = []
                with ProcessPoolExecutor(
                        max_workers=options.eval_workers) as executor:
                    for ret in executor.map(_cluster_thread_body,
                                            times_per_worker):
                        nmi_list.extend(ret)
            if len(nmi_list) != options.repeated_times:
                logger.warning(
                    "warning: eval unmatched repeated_times: {} != {}".format(
                        len(nmi_list), options.repeated_times))
        else:
            try:
                nmi_list = _cluster_thread_body(options.repeated_times)
            except Exception:  # retry once on failure
                nmi_list = _cluster_thread_body(options.repeated_times)

        fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
        summary = tf.Summary()

        mean_nmi = sum(nmi_list) / float(len(nmi_list))
        fr.write(
            'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
            .format(options.repeated_times, len(nmi_list)))
        fr.write('\t\t NMI = {}\n'.format(mean_nmi))
        fr.write('details:\n')
        for repeat in range(len(nmi_list)):
            fr.write('\t repeated {}/{}: NMI = {}\n'.format(
                repeat + 1, len(nmi_list), nmi_list[repeat]))
        fr.write('\neval case: cluster completed in {}s\n'.format(time.time() -
                                                                  time_start))
        fr.close()

        # fr_total.write('%.4f\n' % mean_nmi)
        fr_total.write('{}\n'.format(mean_nmi))
        fr_total.flush()
        summary.value.add(tag='nmi', simple_value=mean_nmi)
        summary_writer.add_summary(summary, cur_step)
        summary_writer.flush()
        logger.info(
            'cluster completed in {}s\n================================='.
            format(time.time() - time_start))

        # copy ckpt-files according to the best mean NMI.
        if mean_nmi > best_nmi:
            best_nmi = mean_nmi

            ckptIsExists = os.path.exists(
                os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
            if ckptIsExists:
                fr_best = open(os.path.join(cluster_dir, 'best_ckpt.info'),
                               'w')
            else:
                fr_best = open(os.path.join(cluster_dir, 'best_ckpt.info'),
                               'a')
                fr_best.write(
                    "Note: the model.ckpt-best files are the remains of the last best_ckpt!\n"
                    "the current best_ckpt model is lost, but the result is:\n"
                )
            fr_best.write("best_nmi: {}\n".format(best_nmi))
            fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
            fr_best.close()

            if ckptIsExists:
                sourceFile = os.path.join(
                    ckpt_dir, 'model.ckpt-%d.data-00000-of-00001' % cur_step)
                targetFile = os.path.join(
                    cluster_dir, 'model.ckpt-best.data-00000-of-00001')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.index' % cur_step)
                targetFile = os.path.join(cluster_dir, 'model.ckpt-best.index')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.meta' % cur_step)
                targetFile = os.path.join(cluster_dir, 'model.ckpt-best.meta')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)

        last_step = cur_step

    fr_total.close()
    summary_writer.close()

    return
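
Note: _cluster_thread_body is referenced above but not shown. A minimal sketch consistent with its call sites (it receives a repeat count and returns one NMI score per run, reading the module-level globals features_matrix, labels_matrix and LABEL_SIZE) could look like the following; the KMeans/NMI details are assumptions, not the original implementation.

from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score

def _cluster_thread_body(repeated_times):
    # hypothetical sketch: run kmeans `repeated_times` times on the global
    # features_matrix and score each run against the global labels_matrix
    nmi_list = []
    for _ in range(repeated_times):
        pred = KMeans(n_clusters=LABEL_SIZE).fit_predict(features_matrix)
        nmi_list.append(normalized_mutual_info_score(labels_matrix, pred))
    return nmi_list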
Example #2
def eval_once(options):
    global features_matrix, labels_matrix, LABEL_SIZE
    if not utils.check_rebuild(options.cluster_path,
                               descrip='cluster',
                               always_rebuild=options.always_rebuild):
        return
    logger.info('eval case: cluster...')
    logger.info('\t save_path: {}'.format(options.cluster_path))
    logger.info('\t cluster: kmeans')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t repeat {} times'.format(options.repeated_times))

    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(
        options.label_path, multilabel_rule=options.multilabel_rule)
    features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    labels_matrix = np.array([item[0] for item in labels_list])
    LABEL_SIZE = options.label_size
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))
    logger.info('\t total labeled data size: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('\t total labels size: {}'.format(options.label_size))

    # cluster
    fr = open(options.cluster_path, 'w')
    fr.write('eval case: cluster...\n')
    fr.write('\t save_path: {}\n'.format(options.cluster_path))
    fr.write('\t cluster: kmeans\n')
    fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
    fr.write('\t eval_online: {}\n'.format(options.eval_online))
    fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr.write('\t repeat {} times\n'.format(options.repeated_times))
    fr.write('\t total labeled data size: {}\n'.format(
        np.size(features_matrix, axis=0)))
    fr.write('\t total labels size: {}\n'.format(options.label_size))
    for i in range(options.label_size):
        fr.write('\t\t label {}: {}\n'.format(i, np.sum(labels_matrix == i)))

    if options.eval_workers > 1 and options.repeated_times > 1:
        # speed up by using multiple processes
        logger.info("\t allocating repeat_times to workers ...")
        if options.repeated_times <= options.eval_workers:
            times_per_worker = [1 for _ in range(options.repeated_times)]
        else:
            div, mod = divmod(options.repeated_times, options.eval_workers)
            times_per_worker = [div for _ in range(options.eval_workers)]
            for idx in range(mod):
                times_per_worker[idx] = times_per_worker[idx] + 1
        assert sum(
            times_per_worker
        ) == options.repeated_times, 'worker allocation failed: %d != %d' % (
            sum(times_per_worker), options.repeated_times)

        logger.info("\t using {} processes for evaling:".format(
            len(times_per_worker)))
        for idx, rep_times in enumerate(times_per_worker):
            logger.info("\t process-{}: repeat {} times".format(
                idx, rep_times))

        try:
            nmi_list = []  # one NMI score per repeat
            with ProcessPoolExecutor(
                    max_workers=options.eval_workers) as executor:
                for ret in executor.map(_cluster_thread_body,
                                        times_per_worker):
                    nmi_list.extend(ret)
        except Exception:  # retry once if a worker process failed
            nmi_list = []  # one NMI score per repeat
            with ProcessPoolExecutor(
                    max_workers=options.eval_workers) as executor:
                for ret in executor.map(_cluster_thread_body,
                                        times_per_worker):
                    nmi_list.extend(ret)

        if len(nmi_list) != options.repeated_times:
            logger.warning(
                "warning: eval unmatched repeated_times: {} != {}".format(
                    len(nmi_list), options.repeated_times))
    else:
        try:
            nmi_list = _cluster_thread_body(options.repeated_times)
        except Exception:  # retry once on failure
            nmi_list = _cluster_thread_body(options.repeated_times)

    mean_nmi = sum(nmi_list) / float(len(nmi_list))
    fr.write(
        'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
        .format(options.repeated_times, len(nmi_list)))
    fr.write('\t\t NMI = {}\n'.format(mean_nmi))
    fr.write('details:\n')
    for repeat in range(len(nmi_list)):
        fr.write('\t repeated {}/{}: NMI = {}\n'.format(
            repeat + 1, len(nmi_list), nmi_list[repeat]))
    fr.write('\neval case: cluster completed in {}s.'.format(time.time() -
                                                             time_start))
    fr.close()
    logger.info('eval case: cluster completed in {}s.'.format(time.time() -
                                                              time_start))

    return
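
The divmod-based allocation of repeat counts appears verbatim in Examples #1 and #2. Restated as a standalone helper for clarity (the function name is ours, not from the original code):

def allocate_repeats(repeated_times, eval_workers):
    # split `repeated_times` runs as evenly as possible across workers
    if repeated_times <= eval_workers:
        return [1] * repeated_times
    div, mod = divmod(repeated_times, eval_workers)
    times_per_worker = [div] * eval_workers
    for idx in range(mod):
        times_per_worker[idx] += 1  # spread the remainder over the first workers
    return times_per_worker

# e.g. allocate_repeats(10, 4) -> [3, 3, 2, 2]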
Example #3
def eval_once(options):
    global features_matrix, labels_matrix
    if not utils.check_rebuild(options.classify_path,
                               descrip='classify',
                               always_rebuild=options.always_rebuild):
        return
    logger.info('eval case: classify...')
    logger.info('\t save_path: {}'.format(options.classify_path))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))

    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(options.label_path)
    features_matrix, labels_list = utils.get_vectors(
        utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    mlb = MultiLabelBinarizer(range(options.label_size))
    labels_matrix = mlb.fit_transform(labels_list)
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))
    logger.info('\t total labeled data size: {}'.format(
        np.size(features_matrix, axis=0)))
    logger.info('\t total labels size: {}'.format(options.label_size))
    # number of evaluation repetitions (10 by default)
    repeated_times = options.repeated_times
    # split ratio
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [v / 10.0 for v in range(9, 0, -1)]

    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))

    train_ratio_fulllist = [
        train_ratio for train_ratio in train_ratio_list
        for _ in range(repeated_times)
    ]

    # classify
    fr = open(options.classify_path, 'w')
    fr.write('eval case: classify...\n')
    fr.write('\t save_path: {}\n'.format(options.classify_path))
    fr.write('\t classifier: LogisticRegression\n')
    fr.write('\t eval_online: {}\n'.format(options.eval_online))
    fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr.write('\t repeat {} times for each train_ratio in {}\n'.format(
        repeated_times, train_ratio_list))
    fr.write('\t total labeled data size: {}\n'.format(
        np.size(features_matrix, axis=0)))
    fr.write('\t total labels size: {}\n'.format(options.label_size))
    for i in range(options.label_size):
        fr.write('\t\t label {}: {}\n'.format(i, np.sum(labels_matrix[:, i])))

    if options.eval_workers > 1 and len(train_ratio_fulllist) > 1:
        # speed up by using multiple processes
        if len(train_ratio_fulllist) <= options.eval_workers:
            train_ratios_per_worker = [[train_ratio]
                                       for train_ratio in train_ratio_fulllist]
        else:
            div, mod = divmod(len(train_ratio_fulllist), options.eval_workers)
            train_ratios_per_worker = [
                train_ratio_fulllist[div * i:div * (i + 1)]
                for i in range(options.eval_workers)
            ]
            for idx, train_ratio in enumerate(
                    train_ratio_fulllist[div * options.eval_workers:]):
                train_ratios_per_worker[len(train_ratios_per_worker) - 1 -
                                        idx].append(train_ratio)
        logger.info("\t using {} processes for evaling:".format(
            len(train_ratios_per_worker)))
        for idx, train_ratios in enumerate(train_ratios_per_worker):
            logger.info("\t process-{}: {}".format(idx, train_ratios))
        ret_list = []  # (train_ratio, macro, micro)
        with ProcessPoolExecutor(max_workers=options.eval_workers) as executor:
            for ret in executor.map(_classify_thread_body,
                                    train_ratios_per_worker):
                ret_list.extend(ret)
    else:
        ret_list = _classify_thread_body(train_ratio_fulllist)

    ret_dict = {}
    for ret in ret_list:
        if ret[0] in ret_dict:
            ret_dict[ret[0]][0].append(ret[1])
            ret_dict[ret[0]][1].append(ret[2])
        else:
            ret_dict[ret[0]] = [[ret[1]], [ret[2]]]

    for train_ratio, macro_micro in sorted(ret_dict.items(),
                                           key=lambda item: item[0]):
        fr.write('\n' + '-' * 20 + '\n' +
                 'train_ratio = {}\n'.format(train_ratio))
        Macro_F1_list = macro_micro[0]
        Micro_F1_list = macro_micro[1]
        if len(Macro_F1_list) != repeated_times:
            logger.warning(
                "warning: train_ratio = {} eval unmatched repeated_times: {} != {}"
                .format(train_ratio, len(Macro_F1_list), repeated_times))
        mean_Macro_F1 = sum(Macro_F1_list) / float(len(Macro_F1_list))
        mean_Micro_F1 = sum(Micro_F1_list) / float(len(Micro_F1_list))
        fr.write(
            'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
            .format(repeated_times, len(Macro_F1_list)))
        fr.write('\t\t Macro_F1 = {}\n'.format(mean_Macro_F1))
        fr.write('\t\t Micro_F1 = {}\n'.format(mean_Micro_F1))
        fr.write('details:\n')
        for repeat in range(len(Macro_F1_list)):
            fr.write(
                '\t repeated {}/{}: Macro_F1 = {}, Micro_F1 = {}\n'.format(
                    repeat + 1, len(Macro_F1_list), Macro_F1_list[repeat],
                    Micro_F1_list[repeat]))
    fr.write('\neval case: classify completed in {}s'.format(time.time() -
                                                             time_start))
    fr.close()
    logger.info('eval case: classify completed in {}s'.format(time.time() -
                                                              time_start))
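
_classify_thread_body is likewise referenced but not shown. Its call sites suggest it takes a list of train ratios and returns one (train_ratio, Macro_F1, Micro_F1) tuple per ratio. A plausible sketch using the module-level globals, with every implementation choice an assumption:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

def _classify_thread_body(train_ratio_list):
    # hypothetical sketch: one-vs-rest logistic regression on the global
    # features_matrix / labels_matrix (multi-hot) for each requested ratio
    ret_list = []
    for train_ratio in train_ratio_list:
        X_train, X_test, y_train, y_test = train_test_split(
            features_matrix, labels_matrix, train_size=train_ratio)
        clf = OneVsRestClassifier(LogisticRegression()).fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        ret_list.append((train_ratio,
                         f1_score(y_test, y_pred, average='macro'),
                         f1_score(y_test, y_pred, average='micro')))
    return ret_list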
Example #4
def eval_online(options):
    global features_matrix, labels_matrix
    classify_dir = os.path.split(options.classify_path)[0]
    if not utils.check_rebuild(classify_dir,
                               descrip='classify',
                               always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(classify_dir):
        os.makedirs(classify_dir)
    logger.info('eval case: classify...')
    logger.info('\t save_dir: {}'.format(classify_dir))
    logger.info('\t classifier: LogisticRegression')
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))
    logger.info('\t eval_workers: {}'.format(options.eval_workers))
    logger.info('\t total labels size: {}'.format(options.label_size))

    # number of evaluation repetitions (10 by default)
    repeated_times = options.repeated_times
    # split ratio
    if options.train_ratio > 0:
        train_ratio_list = [options.train_ratio]
    else:
        train_ratio_list = [v / 10.0 for v in range(9, 0, -1)]

    logger.info('\t repeat {} times for each train_ratio in {}'.format(
        repeated_times, train_ratio_list))

    train_ratio_fulllist = [
        train_ratio for train_ratio in train_ratio_list
        for _ in range(repeated_times)
    ]
    if options.eval_workers > 1 and len(train_ratio_fulllist) > 1:
        # speed up by using multiple processes
        if len(train_ratio_fulllist) <= options.eval_workers:
            train_ratios_per_worker = [[train_ratio]
                                       for train_ratio in train_ratio_fulllist]
        else:
            div, mod = divmod(len(train_ratio_fulllist), options.eval_workers)
            train_ratios_per_worker = [
                train_ratio_fulllist[div * i:div * (i + 1)]
                for i in range(options.eval_workers)
            ]
            for idx, train_ratio in enumerate(
                    train_ratio_fulllist[div * options.eval_workers:]):
                train_ratios_per_worker[len(train_ratios_per_worker) - 1 -
                                        idx].append(train_ratio)
        logger.info("\t using {} processes for evaling:".format(
            len(train_ratios_per_worker)))
        for idx, train_ratios in enumerate(train_ratios_per_worker):
            logger.info("\t process-{}: {}".format(idx, train_ratios))

    fr_total = open(options.classify_path, 'w')
    fr_total.write('eval case: classify...\n')
    fr_total.write('\t save_dir: {}\n'.format(classify_dir))
    fr_total.write('\t classifier: LogisticRegression\n')
    fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
    fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr_total.write('\t eval_workers: {}\n'.format(options.eval_workers))
    fr_total.write('\t repeat {} times for each train_ratio in {}\n'.format(
        repeated_times, train_ratio_list))
    fr_total.write('\t total labels size: {}\n'.format(options.label_size))
    fr_total.write(
        '\t results(Macro_F1,Micro_F1):\n=============================================================\n'
    )
    fr_total.write(
        'finish_time\tckpt\t\t0.1\t0.2\t0.3\t0.4\t0.5\t0.6\t0.7\t0.8\t0.9\n')

    time_start = time.time()
    logger.info('\t reading labeled data from file {}'.format(
        options.label_path))
    id_list_total, labels_list_total = utils.get_labeled_data(
        options.label_path)
    logger.info('\t reading labeled data completed in {}s'.format(time.time() -
                                                                  time_start))

    last_step = 0
    summary_writer = tf.summary.FileWriter(classify_dir, tf.Graph())
    summary = tf.Summary()
    for train_ratio in train_ratio_list:
        summary.value.add(tag='macro_train_{}'.format(train_ratio),
                          simple_value=0.)
        summary.value.add(tag='micro_train_{}'.format(train_ratio),
                          simple_value=0.)
    summary_writer.add_summary(summary, last_step)

    best_micro = 0

    ckpt_dir = os.path.join(os.path.split(options.vectors_path)[0], 'ckpt')
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    while (not (ckpt and ckpt.model_checkpoint_path)):
        logger.info("\t model and vectors not exist, waiting ...")
        time.sleep(options.eval_interval)
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)

    reading = options.vectors_path + ".reading_classify"
    writing = options.vectors_path + ".writing"
    while (options.eval_online):
        while True:
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            ## synchrolock for multi-process:
            # while(not(cur_step > last_step and os.path.exists(options.vectors_path) and
            #                       time.time() - os.stat(options.vectors_path).st_mtime > 200)):
            #     time.sleep(options.eval_interval)
            #     ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            #     cur_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            # os.utime(options.vectors_path, None)
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                if os.path.exists(
                        os.path.join(
                            os.path.split(options.vectors_path)[0],
                            "RUN_SUCCESS")):
                    return
                time.sleep(options.eval_interval)
                continue
            # ready for reading
            logger.info("\t declare for reading ...")
            open(reading, "w")  # declare
            time.sleep(30)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(
                    options.vectors_path)) or os.path.exists(writing):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue

            break
        logger.info("\t eval ckpt-{}.......".format(cur_step))
        time_start = time.time()
        logger.info('\t reading embedding vectors from file {}'.format(
            options.vectors_path))
        features_matrix, labels_list = utils.get_vectors(
            utils.get_KeyedVectors(options.vectors_path), id_list_total,
            labels_list_total)
        os.remove(reading)  # synchrolock for multi-process
        logger.info("\t done for reading ...")
        mlb = MultiLabelBinarizer(range(options.label_size))
        labels_matrix = mlb.fit_transform(labels_list)
        logger.info('\t reading embedding vectors completed in {}s'.format(
            time.time() - time_start))
        logger.info('\t total labeled data size: {}'.format(
            np.size(features_matrix, axis=0)))
        logger.info('\t total labels size: {}'.format(options.label_size))

        # classify
        fr = open(options.classify_path + '.{}'.format(cur_step), 'w')
        fr.write('eval case: classify...\n')
        fr.write('\t classifier: LogisticRegression\n')
        fr.write('\t eval_workers: {}\n'.format(options.eval_workers))
        fr.write('\t repeat {} times for each train_ratio in {}\n'.format(
            repeated_times, train_ratio_list))
        fr.write('\t total labeled data size: {}\n'.format(
            np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(i, np.sum(labels_matrix[:,
                                                                          i])))

        if options.eval_workers > 1 and len(train_ratio_fulllist) > 1:
            fr.write("\t using {} processes for evaling:\n".format(
                len(train_ratios_per_worker)))
            for idx, train_ratios in enumerate(train_ratios_per_worker):
                fr.write("\t process-{}: {}\n".format(idx, train_ratios))
            ret_list = []  # (train_ratio, macro, micro)
            with ProcessPoolExecutor(
                    max_workers=options.eval_workers) as executor:
                for ret in executor.map(_classify_thread_body,
                                        train_ratios_per_worker):
                    ret_list.extend(ret)
        else:
            ret_list = _classify_thread_body(train_ratio_fulllist)

        fr_total.write('%s ckpt-%-9d: ' % (time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
        summary = tf.Summary()

        ret_dict = {}
        for ret in ret_list:
            if ret[0] in ret_dict:
                ret_dict[ret[0]][0].append(ret[1])
                ret_dict[ret[0]][1].append(ret[2])
            else:
                ret_dict[ret[0]] = [[ret[1]], [ret[2]]]

        for train_ratio, macro_micro in sorted(ret_dict.items(),
                                               key=lambda item: item[0]):
            fr.write('\n' + '-' * 20 + '\n' +
                     'train_ratio = {}\n'.format(train_ratio))
            Macro_F1_list = macro_micro[0]
            Micro_F1_list = macro_micro[1]
            if len(Macro_F1_list) != repeated_times:
                logger.warning(
                    "warning: train_ratio = {} eval unmatched repeated_times: {} != {}"
                    .format(train_ratio, len(Macro_F1_list), repeated_times))
            mean_Macro_F1 = sum(Macro_F1_list) / float(len(Macro_F1_list))
            mean_Micro_F1 = sum(Micro_F1_list) / float(len(Micro_F1_list))
            fr.write(
                'expected repeated_times: {}, actual repeated_times: {}, mean results as follows:\n'
                .format(repeated_times, len(Macro_F1_list)))
            fr.write('\t\t Macro_F1 = {}\n'.format(mean_Macro_F1))
            fr.write('\t\t Micro_F1 = {}\n'.format(mean_Micro_F1))
            fr.write('details:\n')
            for repeat in range(len(Macro_F1_list)):
                fr.write(
                    '\t repeated {}/{}: Macro_F1 = {}, Micro_F1 = {}\n'.format(
                        repeat + 1, len(Macro_F1_list), Macro_F1_list[repeat],
                        Micro_F1_list[repeat]))
            fr_total.write('%.4f, %.4f    ' % (mean_Macro_F1, mean_Micro_F1))
            summary.value.add(tag='macro_train_{}'.format(train_ratio),
                              simple_value=mean_Macro_F1)
            summary.value.add(tag='micro_train_{}'.format(train_ratio),
                              simple_value=mean_Micro_F1)

        fr.write(
            '\neval case: classify completed in {}s\n'.format(time.time() -
                                                              time_start))
        fr.close()
        fr_total.write('\n')
        fr_total.flush()
        summary_writer.add_summary(summary, cur_step)
        summary_writer.flush()
        logger.info(
            'classify completed in {}s\n================================='.
            format(time.time() - time_start))

        # copy ckpt-files according to the last mean_Micro_F1 (train_ratio 0.9).
        if mean_Micro_F1 > best_micro:
            best_micro = mean_Micro_F1

            ckptIsExists = os.path.exists(
                os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
            if ckptIsExists:
                fr_best = open(os.path.join(classify_dir, 'best_ckpt.info'),
                               'w')
            else:
                fr_best = open(os.path.join(classify_dir, 'best_ckpt.info'),
                               'a')
                fr_best.write(
                    "Note: the model.ckpt-best files are the remains of the last best_ckpt!\n"
                    "the current best_ckpt model is lost, but the result is:\n"
                )
            fr_best.write("best_micro(for ratio 0.9): {}\n".format(best_micro))
            fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
            fr_best.close()

            if ckptIsExists:
                sourceFile = os.path.join(
                    ckpt_dir, 'model.ckpt-%d.data-00000-of-00001' % cur_step)
                targetFile = os.path.join(
                    classify_dir, 'model.ckpt-best.data-00000-of-00001')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.index' % cur_step)
                targetFile = os.path.join(classify_dir,
                                          'model.ckpt-best.index')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir,
                                          'model.ckpt-%d.meta' % cur_step)
                targetFile = os.path.join(classify_dir, 'model.ckpt-best.meta')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)

        last_step = cur_step

    fr_total.close()
    summary_writer.close()
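
The `.writing` / `.reading_*` marker files act as an ad-hoc file lock between the training process and the online evaluators. The writer side is not shown in these examples; under the protocol the readers assume, it would look roughly like this (function name and polling interval are assumptions):

import glob
import os
import time

def write_vectors_safely(vectors_path, save_fn, poll_interval=5):
    # declare intent to write so evaluators stop starting new reads
    writing = vectors_path + ".writing"
    open(writing, "w").close()
    try:
        # wait until no evaluator holds a `.reading_*` marker
        while glob.glob(vectors_path + ".reading_*"):
            time.sleep(poll_interval)
        save_fn(vectors_path)  # caller-supplied routine that rewrites the file
    finally:
        os.remove(writing)  # undeclare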
Example #5
    def __init__(self,
                 net,
                 using_label=False,
                 feature_type="random",
                 shuffled=True,
                 label_path=None,
                 label_size=None,
                 feature_path=None,
                 feature_size=None):

        self._nodes_size = net.get_nodes_size()
        self._using_label = using_label
        self._feature_type = feature_type
        self._shuffled = shuffled
        self._label_size = label_size
        self._feature_size = feature_size

        # generate adjacency matrix
        adj_matrix = np.zeros((self._nodes_size, self._nodes_size),
                              dtype=np.int32)
        for x, y in net.edges:
            adj_matrix[x, y] = 1
            # adj_matrix[y, x] = 1
        # generate normalized laplacian matrix
        self._laplacian_matrix = self.preprocess_adj(adj_matrix)

        # generate labels (multi-hot encoding)
        if self._using_label:
            self._nodes_labels = np.zeros((self._nodes_size, self._label_size),
                                          dtype=np.int32)
            id_list, labels_list = utils.get_labeled_data(label_path)
            assert len(
                id_list
            ) == self._nodes_size, "error: not all nodes are labeled, %d != %d" % (
                len(id_list), self._nodes_size)
            for idx in range(len(id_list)):
                self._nodes_labels[id_list[idx], labels_list[idx]] = 1
        else:
            self._adj_matrix = adj_matrix  # adj_matrix will be the target label

        # generate features (additional attribute features are future work)
        if self._feature_type == "attribute":
            # for future work
            self._nodes_features = utils.get_features(feature_path)
            assert self._nodes_features.shape[
                0] == self._nodes_size, "error: %d != %d" % (
                    self._nodes_features.shape[0], self._nodes_size)
            assert self._nodes_features.shape[
                1] == self._feature_size, "error: %d != %d" % (
                    self._nodes_features.shape[1], self._feature_size)
        elif self._feature_type == "random":
            self._nodes_features = np.random.uniform(
                size=[self._nodes_size, self._feature_size])
        elif self._feature_type == "degree":
            assert self._feature_size == 1, "error: %d != 1" % self._feature_size
            self._nodes_features = np.zeros(
                (self._nodes_size, self._feature_size), dtype=np.float32)
            for idx in range(self._nodes_size):
                self._nodes_features[idx][0] = net.get_degrees(idx)
        elif self._feature_type == "adjacency":
            assert self._feature_size == self._nodes_size, "error: %d != %d" % (
                self._feature_size, self._nodes_size)
            self._nodes_features = adj_matrix
        else:
            logger.error("error! invalid feature_type: {}".format(
                self._feature_type))
            raise ValueError(
                "invalid feature_type: {}".format(self._feature_type))

        self._nodes_order = np.arange(self._nodes_size)
        if self._shuffled:
            np.random.shuffle(self._nodes_order)
        self._epochs_completed = 0
        self._index_in_epoch = 0
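
preprocess_adj is called in __init__ but not shown; since the comment calls its output a normalized Laplacian, a GCN-style renormalization D^-1/2 (A + I) D^-1/2 is one plausible reading (an assumption, sketched here as a standalone function):

import numpy as np

def preprocess_adj(adj_matrix):
    # add self-loops, then symmetrically normalize: D^-1/2 (A+I) D^-1/2
    adj = adj_matrix + np.eye(adj_matrix.shape[0])
    deg = adj.sum(axis=1)                # self-loops guarantee deg >= 1
    d_inv_sqrt = np.diag(deg ** -0.5)
    return d_inv_sqrt @ adj @ d_inv_sqrt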
Example #6
def eval_once(options):
    # visual_dir, visual_file = os.path.split(options.visualization_path)
    if not utils.check_rebuild(options.visualization_path, descrip='visualization', always_rebuild=options.always_rebuild):
        return
    # print logger
    logger.info('eval case: visualization...')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}'.format(options.isdirected))
    logger.info('\t label_path = {}'.format(options.label_path))
    logger.info('\t label_size = {}'.format(options.label_size))
    logger.info('\t eval_node_type: {}'.format(options.eval_node_type))
    logger.info('\t save_path: {}\n'.format(options.visualization_path))
    logger.info('\t method: t-SNE')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t marker_size: {}'.format(options.marker_size))
    logger.info('\t eval_online: {}'.format(options.eval_online))


    # get embedding vectors and markersize
    logger.info('\t reading labeled data from file {}'.format(options.label_path))
    time_start = time.time()
    id_list, labels_list = utils.get_labeled_data(options.label_path, type=options.eval_node_type,
                                                  multilabel_rule=options.multilabel_rule,
                                                  type_filepath=os.path.join(options.data_dir,
                                                                             options.data_name + ".nodes"))
    id_list, features_matrix, labels_list = utils.get_vectors(utils.get_KeyedVectors(options.vectors_path), id_list, labels_list)
    labels_matrix = np.array([item[0] for item in labels_list])
    logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))
    logger.info('\t total labeled data size: {}'.format(np.size(features_matrix, axis=0)))
    logger.info('\t the labeled data embedding_dimension: {}'.format(np.size(features_matrix, axis=1)))
    logger.info('\t total labels size: {}'.format(options.label_size))
    for i in range(options.label_size):
        logger.info('\t\t label {}: {}'.format(i, np.sum(labels_matrix == i)))

    fr = open(options.visualization_path, 'w')
    fr.write('eval case: visualization...\n')
    fr.write('\t data_dir = {}\n'.format(options.data_dir))
    fr.write('\t data_name = {}\n'.format(options.data_name))
    fr.write('\t isdirected = {}\n'.format(options.isdirected))
    fr.write('\t label_path = {}\n'.format(options.label_path))
    fr.write('\t label_size = {}\n'.format(options.label_size))
    fr.write('\t eval_node_type: {}\n'.format(options.eval_node_type))
    fr.write('\t save_path: {}\n\n'.format(options.visualization_path))
    fr.write('\t method: t-SNE\n')
    fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
    fr.write('\t marker_size: {}\n'.format(options.marker_size))
    fr.write('\t eval_online: {}\n'.format(options.eval_online))
    fr.write('\t total labeled data size: {}\n'.format(np.size(features_matrix, axis=0)))
    fr.write('\t the labeled data embedding_dimension: {}\n'.format(np.size(features_matrix, axis=1)))
    fr.write('\t total labels size: {}\n'.format(options.label_size))
    for i in range(options.label_size):
        fr.write('\t\t label {}: {}\n'.format(i, np.sum(labels_matrix == i)))

    figure_name = "visualization_" + str(np.size(features_matrix, axis=1))
    figure_path = os.path.join(os.path.split(options.visualization_path)[0], figure_name)
    CCD = plot_embedding_in_2D(Markersize=options.marker_size,
                               features_matrix=features_matrix,
                               labels_matrix=labels_matrix,
                               label_size=options.label_size,
                               figure_path=figure_path)

    fr.write('\n figure_path: {}\n'.format(figure_path))
    fr.write(' clustering_center_distance_sim: {}\n'.format(CCD))
    fr.write('\neval case: visualization completed in {}s\n ======================'.format(time.time() - time_start))
    fr.close()
    logger.info('eval case: visualization completed in {}s\n ======================'.format(time.time() - time_start))
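
plot_embedding_in_2D is used by both visualization examples but not shown. The sketch below reconstructs the evident parts (t-SNE to 2D, one scatter color per label, save to figure_path); the returned CCD is only a placeholder here, since the exact clustering_center_distance_sim definition is not given.

import matplotlib
matplotlib.use("Agg")  # render without a display
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial.distance import pdist
from sklearn.manifold import TSNE

def plot_embedding_in_2D(Markersize, features_matrix, labels_matrix,
                         label_size, figure_path):
    coords = TSNE(n_components=2).fit_transform(features_matrix)
    plt.figure()
    for label in range(label_size):
        mask = labels_matrix == label
        plt.scatter(coords[mask, 0], coords[mask, 1], s=Markersize,
                    label=str(label))
    plt.legend(markerscale=2, fontsize='small')
    plt.savefig(figure_path + ".png")
    plt.close()
    # assumed stand-in for CCD: mean pairwise distance between label centers
    centers = np.array([coords[labels_matrix == label].mean(axis=0)
                        for label in range(label_size)])
    return float(pdist(centers).mean())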
Example #7
def eval_online(options):
    visual_dir = os.path.split(options.visualization_path)[0]
    if not utils.check_rebuild(visual_dir, descrip='visualization', always_rebuild=options.always_rebuild):
        return
    if not os.path.exists(visual_dir):
        os.makedirs(visual_dir)

    # print logger
    logger.info('eval case: visualization...')
    logger.info('\t data_dir = {}'.format(options.data_dir))
    logger.info('\t data_name = {}'.format(options.data_name))
    logger.info('\t isdirected = {}'.format(options.isdirected))
    logger.info('\t label_path = {}'.format(options.label_path))
    logger.info('\t label_size = {}'.format(options.label_size))
    logger.info('\t eval_node_type: {}'.format(options.eval_node_type))
    logger.info('\t save_dir: {}\n'.format(visual_dir))
    logger.info('\t method: t-SNE')
    logger.info('\t multilabel_rule: {}'.format(options.multilabel_rule))
    logger.info('\t marker_size: {}'.format(options.marker_size))
    logger.info('\t eval_online: {}'.format(options.eval_online))
    logger.info('\t eval_interval: {}s'.format(options.eval_interval))


    logger.info('\t reading labeled data from file {}'.format(options.label_path))
    # get embedding vectors and markersize
    time_start = time.time()
    id_list_total, labels_list_total = utils.get_labeled_data(options.label_path, type=options.eval_node_type,
                                                              multilabel_rule=options.multilabel_rule,
                                                              type_filepath=os.path.join(options.data_dir,
                                                                                         options.data_name + ".nodes"))
    logger.info('\t reading labeled data completed in {}s'.format(time.time() - time_start))

    logger.info('\t total labeled data size: {}'.format(len(id_list_total)))
    logger.info('\t total labels size: {}'.format(options.label_size))


    fr_total = open(options.visualization_path, 'w')
    fr_total.write('eval case: visualization...\n')
    fr_total.write('\t data_dir = {}\n'.format(options.data_dir))
    fr_total.write('\t data_name = {}\n'.format(options.data_name))
    fr_total.write('\t isdirected = {}\n'.format(options.isdirected))
    fr_total.write('\t label_path = {}\n'.format(options.label_path))
    fr_total.write('\t label_size = {}\n'.format(options.label_size))
    fr_total.write('\t eval_node_type: {}\n'.format(options.eval_node_type))
    fr_total.write('\t save_dir: {}\n\n'.format(visual_dir))
    fr_total.write('\t method: t-SNE\n')
    fr_total.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
    fr_total.write('\t marker_size: {}\n'.format(options.marker_size))
    fr_total.write('\t eval_online: {}\n'.format(options.eval_online))
    fr_total.write('\t eval_interval: {}s\n'.format(options.eval_interval))
    fr_total.write('\t total labeled data size: {}\n'.format(len(id_list_total)))
    fr_total.write('\t total labels size: {}\n'.format(options.label_size))
    fr_total.write('\t results(CCD-clustering_center_distance_sim):\n'
                   '=============================================================\n')
    fr_total.write('finish_time\tckpt\tCCD\n')


    last_step = 0
    summary_writer = tf.summary.FileWriter(visual_dir, tf.Graph())
    summary = tf.Summary()
    summary.value.add(tag='CCD', simple_value=0.)
    summary_writer.add_summary(summary, last_step)

    best_CCD = 0

    ckpt_dir = os.path.join(os.path.split(options.vectors_path)[0], 'ckpt')
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    while (not (ckpt and ckpt.model_checkpoint_path)):
        logger.info("model and vectors not exist, waiting...")
        time.sleep(options.eval_interval)
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)

    reading = options.vectors_path + ".reading_visualization_{}".format(options.eval_node_type)
    writing = options.vectors_path + ".writing"

    while (options.eval_online):
        while True:
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(options.vectors_path)) or os.path.exists(writing):
                if os.path.exists(os.path.join(os.path.split(options.vectors_path)[0], "RUN_SUCCESS")):
                    return
                time.sleep(options.eval_interval)
                continue
            # ready for reading
            logger.info("\t declare for reading ...")
            open(reading, "w")  # declare
            time.sleep(30)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            cur_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            if cur_step <= last_step or (not os.path.exists(options.vectors_path)) or os.path.exists(writing):
                os.remove(reading)  # undeclare
                logger.info("\t confliction! undeclare and waiting ...")
                time.sleep(options.eval_interval)
                continue
            break
        logger.info("\t eval ckpt-{}.......".format(cur_step))
        time_start = time.time()
        logger.info('\t reading embedding vectors from file {}'.format(options.vectors_path))
        id_list, features_matrix, labels_list = utils.get_vectors(utils.get_KeyedVectors(options.vectors_path),
                                                                  id_list_total, labels_list_total)
        os.remove(reading)  # synchrolock for multi-process
        logger.info("\t done for reading ...")
        labels_matrix = np.array([item[0] for item in labels_list])
        logger.info('\t reading embedding vectors completed in {}s'.format(time.time() - time_start))
        logger.info('\t total labeled data size: {}'.format(np.size(features_matrix, axis=0)))
        logger.info('\t total labels size: {}'.format(options.label_size))
        for i in range(options.label_size):
            logger.info('\t\t label {}: {}'.format(i, np.sum(labels_matrix == i)))

        # visualization
        fr = open(options.visualization_path + '.{}'.format(cur_step), 'w')
        fr.write('eval case: visualization...\n')
        fr.write('\t data_dir = {}\n'.format(options.data_dir))
        fr.write('\t data_name = {}\n'.format(options.data_name))
        fr.write('\t isdirected = {}\n'.format(options.isdirected))
        fr.write('\t label_path = {}\n'.format(options.label_path))
        fr.write('\t label_size = {}\n'.format(options.label_size))
        fr.write('\t eval_node_type: {}\n'.format(options.eval_node_type))
        fr.write('\t method: t-SNE\n')
        fr.write('\t multilabel_rule: {}\n'.format(options.multilabel_rule))
        fr.write('\t marker_size: {}\n'.format(options.marker_size))
        fr.write('\t eval_online: {}\n'.format(options.eval_online))
        fr.write('\t eval_interval: {}s\n'.format(options.eval_interval))
        fr.write('\t total labeled data size: {}\n'.format(np.size(features_matrix, axis=0)))
        fr.write('\t total labels size: {}\n'.format(options.label_size))
        for i in range(options.label_size):
            fr.write('\t\t label {}: {}\n'.format(i, np.sum(labels_matrix == i)))

        fr_total.write('%s ckpt-%-9d: ' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), cur_step))
        summary = tf.Summary()

        figure_name = "visualization_" + str(np.size(features_matrix, axis=1)) + '.{}'.format(cur_step)
        figure_path = os.path.join(visual_dir, figure_name)
        CCD = plot_embedding_in_2D(Markersize=options.marker_size,
                                   features_matrix=features_matrix,
                                   labels_matrix=labels_matrix,
                                   label_size=options.label_size,
                                   figure_path=figure_path)

        fr.write('\n figure_path: {}\n'.format(figure_path))
        fr.write(' clustering_center_distance_sim: {}\n'.format(CCD))
        fr.write('\neval case: visualization completed in {}s\n ======================'.format(time.time() - time_start))
        fr.close()

        fr_total.write('%.4f\n' % CCD)
        fr_total.flush()
        summary.value.add(tag='CCD', simple_value=CCD)
        summary_writer.add_summary(summary, cur_step)
        summary_writer.flush()
        logger.info('visualization completed in {}s\n================================='.format(time.time() - time_start))

        # copy ckpt-files according to the best CCD.
        if CCD > best_CCD:
            best_CCD = CCD

            ckptIsExists = os.path.exists(os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step))
            if ckptIsExists:
                fr_best = open(os.path.join(visual_dir, 'best_ckpt.info'), 'w')
            else:
                fr_best = open(os.path.join(visual_dir, 'best_ckpt.info'), 'a')
                fr_best.write("Note:the model.ckpt-best is the remainings of last best_ckpt!\n"
                              "the current best_ckpt model is loss, but the result is:\n")
            fr_best.write("best_CCD: {}\n".format(best_CCD))
            fr_best.write("best_ckpt: ckpt-{}\n".format(cur_step))
            fr_best.close()

            if ckptIsExists:
                sourceFile = os.path.join(ckpt_dir, 'model.ckpt-%d.data-00000-of-00001' % cur_step)
                targetFile = os.path.join(visual_dir, 'model.ckpt-best.data-00000-of-00001')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir, 'model.ckpt-%d.index' % cur_step)
                targetFile = os.path.join(visual_dir, 'model.ckpt-best.index')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
                sourceFile = os.path.join(ckpt_dir, 'model.ckpt-%d.meta' % cur_step)
                targetFile = os.path.join(visual_dir, 'model.ckpt-best.meta')
                if os.path.exists(targetFile):
                    os.remove(targetFile)
                shutil.copy(sourceFile, targetFile)
        last_step = cur_step

    fr_total.close()
    summary_writer.close()
    return
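
Finally, the utils helpers these examples depend on (get_KeyedVectors, get_vectors) are external too. Assuming gensim word2vec-format embedding files, a sketch consistent with the three-value call sites of Examples #6 and #7 follows; note that Examples #1 to #4 unpack only two values, so the real utils presumably offers a second variant.

import numpy as np
from gensim.models import KeyedVectors

def get_KeyedVectors(vectors_path):
    # hypothetical: embeddings stored in word2vec text format, keyed by node id
    return KeyedVectors.load_word2vec_format(vectors_path, binary=False)

def get_vectors(kv, id_list, labels_list):
    # keep only ids present in the embedding vocabulary, preserving alignment
    kept_ids, features, kept_labels = [], [], []
    for node_id, labels in zip(id_list, labels_list):
        key = str(node_id)
        if key in kv:
            kept_ids.append(node_id)
            features.append(kv[key])
            kept_labels.append(labels)
    return kept_ids, np.array(features), kept_labels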