Example #1
import ast

# CAE, AutomaticSegmentation, Clustering, FeatureExtraction, and SvmClassifier
# are assumed to be project-local imports.

def main(args):
    #-----------------------------------------------------#
    #             2D/3D Convolutional Autoencoder         #
    #-----------------------------------------------------#
    if args.program == 'CAE':
        cae = CAE(input_dir=args.data_dir,
                  patch_size=ast.literal_eval(args.patch_size),
                  batch_size=args.batch_size,
                  test_size=args.test_size,
                  prepare_batches=args.prepare_batches)

        cae.prepare_data(args.sampler_type, args.max_patches, args.resample,
                         ast.literal_eval(args.patch_overlap),
                         args.min_lab_vox, args.label_prob, args.load_data)
        # Train from scratch only when no pretrained model directory is given.
        if args.model_dir is None:
            cae.train(args.epochs)
        cae.predict(args.model_dir)

    #-----------------------------------------------------#
    #               Patient classification                #
    #-----------------------------------------------------#
    """
    if args.program=='AutSeg':
        asg = AutomaticSegmentation(    model_name=args.model_name,
                                        patch_size=args.patch_size,
                                        patch_overlap=args.patch_overlap,
                                        input_dir=args.data_dir, 
                                        model_dir=args.model_dir   )
        asg.run()
        asg.run_postprocessing()

"""
    if args.program == 'CLUS':
        clustering = Clustering(num_iters=args.iterations,
                                num_clusters=args.num_clusters,
                                input_dir=args.data_dir)
        clustering.run()

    if args.program == 'FeEx':
        fe = FeatureExtraction(model_name=args.model_name,
                               patch_size=ast.literal_eval(args.patch_size),
                               patch_overlap=ast.literal_eval(
                                   args.patch_overlap),
                               num_clusters=args.num_clusters,
                               cluster_selection=args.cluster_selection,
                               resample=args.resample,
                               encoded_layer_num=args.encoded_layer_num,
                               model_dir=args.model_dir,
                               input_dir=args.data_dir)
        fe.run(batch_size=20)

    if args.program == 'SVM':
        svm = SvmClassifier(feature_dir=args.feature_dir,
                            ffr_dir=args.ffr_dir,
                            ffr_filename=args.ffr_filename,
                            input_dir=args.data_dir,
                            ffr_cut_off=args.ffr_cut_off,
                            test_size=args.test_size)
        svm.train()
        svm.predict()
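main(args) expects an argparse-style namespace. Below is a minimal sketch of how the entry point might be wired; the flag names mirror the attributes used above, but the subset shown, the defaults, and the types are assumptions:

import argparse

def parse_args():
    # Hypothetical CLI wiring; only some of the flags used by main() are shown.
    p = argparse.ArgumentParser()
    p.add_argument('--program', choices=['CAE', 'CLUS', 'FeEx', 'SVM'], required=True)
    p.add_argument('--data_dir', required=True)
    p.add_argument('--model_dir', default=None)
    p.add_argument('--patch_size', default='(32, 32, 32)')    # parsed later with ast.literal_eval
    p.add_argument('--patch_overlap', default='(0, 0, 0)')    # parsed later with ast.literal_eval
    p.add_argument('--batch_size', type=int, default=32)
    p.add_argument('--test_size', type=float, default=0.2)
    p.add_argument('--epochs', type=int, default=100)
    p.add_argument('--num_clusters', type=int, default=2)
    return p.parse_args()

if __name__ == '__main__':
    main(parse_args())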
Example #2
    def generate(self, keys, url):
        # Assumes: import time; from concurrent.futures import ThreadPoolExecutor.
        # json_work is a project helper that reads ("r") or writes ("w") a JSON file.
        json_work("other_files/work_file.json", "w", [])  # reset the work file

        print(f'Keys received: {len(keys)}')

        if len(keys) > 0:
            # Generate pre-templates from the keys, keeping unique stems only.
            self.generate_pretmp(keys)
            print(f'Keys after deduplication: {len(self.work_file)}')
            time.sleep(2)
            if len(self.work_file) > 0:
                with ThreadPoolExecutor(5) as executor:
                    for _ in executor.map(self.template_generated,
                                          self.work_file):
                        pass
                work = json_work("other_files/work_file.json", "r")
                if len(work) > 0:
                    # Sort the new templates by basic frequency and persist them.
                    gen_data = sorted(work,
                                      key=lambda x: x["frequency"]["basic"],
                                      reverse=True)
                    json_work("other_files/work_file.json", "w", gen_data)
                    # Merge into main.json and keep the combined list sorted.
                    gen_data += json_work("other_files/main.json", "r")
                    gen_data = sorted(gen_data,
                                      key=lambda x: x["frequency"]["basic"],
                                      reverse=True)
                    json_work("other_files/main.json", "w", gen_data)
                    print(f"url {url} processed")
                    clustering = Clustering(
                        json_work("other_files/work_file.json", "r"), url)
                    clustering.run()
            else:
                print("Перехожу к следующему url")
        return
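The snippet relies on a json_work helper that is not shown. Here is a minimal sketch consistent with the call sites above (mode "r" returns the parsed file, mode "w" overwrites it with data); the actual project implementation may differ:

import json

def json_work(path, mode, data=None):
    # "r": return the parsed JSON contents; "w": overwrite the file with `data`.
    if mode == "r":
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)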
Example #3
# X2Blocker, X4Blocker, Clustering, create_pairs, and get_scores are assumed
# to be project-local imports.
def run_pipeline(data,
                 y,
                 dataset_id,
                 evaluate=False,
                 verbose=False,
                 optimize_method=False,
                 **kwargs):
    """
    It performs the basic logic pipeline for getting the data, creates blocking and clustering
    and stores the matching pairs.

    :param data: pd.DataFrame, the input dataset
    :param y: pd.DataFrame, the dataset with the actual pairs
    :param dataset_id: int, the id of the input dataset
    :param evaluate: boolean, if evaluation should be run
    :param verbose: boolean, if logging
    :return: pd.DataFrame, with the predicted matching pairs
    """
    cluster_n = kwargs.get('cluster_num', 0)
    distance_threshold = kwargs.get('distance_threshold', 0)
    method = kwargs.get('clustering_method', 'agglomerative')
    encoding = kwargs.get('encoding', 'use')

    # blocking
    if 'title' in data.columns:  # Instantiate correct blocker based on dataset
        blocker = X2Blocker()
        # X3 is identified by "source" appearing in the first instance_id
        if optimize_method and ("source" in data.instance_id[0]):
            method = 'cosine'
    elif 'name' in data.columns:
        blocker = X4Blocker()
        data["title"] = data["name"]
        if optimize_method:
            method = 'agglomerative'
            cluster_n = 0
    else:
        raise ValueError("Please add a valid dataset id")

    blocker.fit(data=data)
    # blocks is of type [[instance_id]]
    # list of lists of instance_ids belonging to same group
    # transform returns modified data frame for further use
    blocks, data = blocker.transform()

    # apply clustering to each block to get matching pairs
    clusters = []

    print("Method: ", method)
    print("Encoding: ", encoding, "\n")
    cls = Clustering(method=method,
                     cluster_n=cluster_n,
                     distance_threshold=distance_threshold,
                     encoding=encoding)
    for block in blocks:
        if len(block) > 1:
            # filter data based on the instance_ids present in the block
            block_df = data[data['instance_id'].isin(block)]
            clusters_l = cls.run(block_df)
            clusters.extend(clusters_l)

    # create pairs from clusters
    pairs_pred_df = create_pairs(clusters)

    dataset_scores = dict()
    if evaluate:
        # run performance evaluation
        dataset_scores = get_scores(actual=y, pred=pairs_pred_df)
        if verbose:
            print('Precision: {:.3f}'.format(
                dataset_scores['precision_score']))
            print('Recall: {:.3f}'.format(dataset_scores['recall_score']))
            print('F1 score: {:.3f}'.format(dataset_scores['f1_score']))

        dataset_scores['cluster_n'] = cluster_n
        dataset_scores['method'] = method
        dataset_scores['dataset'] = dataset_id
        dataset_scores['encoding'] = encoding
        dataset_scores['threshold'] = distance_threshold

    return pairs_pred_df, dataset_scores
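create_pairs is not shown above. A plausible sketch, assuming each cluster is a list of instance_ids and every unordered pair within a cluster counts as a predicted match; the column names are guesses, not taken from the original project:

from itertools import combinations

import pandas as pd

def create_pairs(clusters):
    # Emit every unordered pair of instance_ids within each cluster.
    # Column names are assumptions.
    pairs = [(a, b)
             for cluster in clusters
             for a, b in combinations(sorted(cluster), 2)]
    return pd.DataFrame(pairs, columns=['left_instance_id', 'right_instance_id'])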