def __init__(self, client, pool, directory):
    self.result_set = None
    self.linked_pbar = None
    self.client = client
    self.pool = pool
    self.directory = directory
    self.files = natural_sort(filter(is_supported_file, glob.glob(os.path.join(directory, '*'))))
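
These snippets all rely on a natural_sort helper imported from a project-specific utils module that is not shown here. Below is a minimal sketch of such a helper, assuming the usual approach of splitting names into text and number chunks; the real utils.natural_sort in these projects may differ.

import re

def natural_sort(items):
    # Split each name into alternating text/number chunks so that, e.g.,
    # 'img10.vtp' sorts after 'img2.vtp' instead of before it.
    def key(name):
        return [int(chunk) if chunk.isdigit() else chunk.lower()
                for chunk in re.split(r'(\d+)', name)]
    return sorted(items, key=key)
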
def tag_forcedandexisting_art(availableart, forcedart, existingart):
    typeinsert = {}
    for exacttype, artlist in sorted(forcedart.iteritems(), key=lambda arttype: natural_sort(arttype[0])):
        arttype = mediainfo.get_basetype(exacttype)
        if arttype not in availableart:
            availableart[arttype] = artlist
        else:
            for image in artlist:
                match = next((available for available in availableart[arttype] if available['url'] == image['url']), None)
                if match:
                    if 'title' in image and 'title' not in match:
                        match['title'] = image['title']
                    match['second provider'] = image['provider'].display
                else:
                    typeinsert[arttype] = typeinsert[arttype] + 1 if arttype in typeinsert else 0
                    availableart[arttype].insert(typeinsert[arttype], image)

    typeinsert = {}
    for exacttype, existingurl in existingart.iteritems():
        arttype = mediainfo.get_basetype(exacttype)
        if arttype in availableart:
            match = next((available for available in availableart[arttype] if available['url'] == existingurl), None)
            if match:
                match['preview'] = existingurl
                match['existing'] = True
            else:
                typeinsert[arttype] = typeinsert[arttype] + 1 if arttype in typeinsert else 0
                image = {'url': existingurl, 'preview': existingurl, 'title': exacttype,
                    'existing': True, 'provider': SortedDisplay('current', L(CURRENT_ART))}
                availableart[arttype].insert(typeinsert[arttype], image)
Example #3
def write_motion(mesh_dir,  start_point, intpl_num, output_dir, num_cycle, duration, debug=False, mode='displacement'):
    fns = utils.natural_sort(glob.glob(os.path.join(mesh_dir, "*.vtp")))
    total_num_phase = len(fns)
    total_steps = num_cycle * total_num_phase * (intpl_num + 1) + 1
    initialized = False
    time_pts = np.linspace(0, num_cycle * duration, total_steps)
    
    poly_template = io_utils.read_vtk_mesh(fns[start_point])
    
    displacements = move_mesh(fns, start_point, intpl_num, num_cycle)
    if debug:
        import vtk
        debug_dir = os.path.join(output_dir,"Debug")
        try:
            os.makedirs(debug_dir)
        except Exception as e:
            print(e)
        coords = vtk_to_numpy(poly_template.GetPoints().GetData())
        poly = vtk.vtkPolyData()
        poly.DeepCopy(poly_template)
        for ii in range(displacements.shape[-1]):
            poly.GetPoints().SetData(numpy_to_vtk(displacements[:,:,ii]+coords))
            fn_debug = os.path.join(debug_dir, "debug%05d.vtp" %ii)
            io_utils.write_vtk_polydata(poly, fn_debug)

    node_ids = vtk_to_numpy(poly_template.GetPointData().GetArray('GlobalNodeID'))
    face_ids = vtk_to_numpy(poly_template.GetCellData().GetArray('ModelFaceID'))
    #write time steps and node numbers
    for face in np.unique(face_ids):
        if mode=='displacement':
            fn = os.path.join(output_dir, '%d_displacement.dat' % face)
        elif mode=='velocity':
            fn = os.path.join(output_dir, '%d_velocity.dat' % face)
        else:
            raise ValueError('Unsupported boundary type {}; should be displacement or velocity.'.format(mode))
        face_poly = utils.threshold_polydata(poly_template, 'ModelFaceID', (face,face))
        f = open(fn, 'w')
        f.write('{} {} {}\n'.format(3, total_steps,face_poly.GetNumberOfPoints()))
        for t in time_pts:
            f.write('{}\n'.format(t))
        #f.write('{}\n'.format(face_poly.GetNumberOfPoints()))
        # GlobalNodeID values for the points on this face only
        face_node_ids = vtk_to_numpy(face_poly.GetPointData().GetArray('GlobalNodeID'))
        node_id_index = find_index_in_array(node_ids, face_node_ids)
        dt = time_pts[1] - time_pts[0]
        for i in node_id_index:
            disp = displacements[i, :, :]
            f.write('{}\n'.format(node_ids[i]))
            for j in range(total_steps):
                if mode == 'displacement':
                    f.write('{} {} {}\n'.format(disp[0, j], disp[1, j], disp[2, j]))
                elif mode == 'velocity':
                    f.write('{} {} {}\n'.format(disp[0, j] / dt, disp[1, j] / dt, disp[2, j] / dt))
        f.close()
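
The .dat files produced above have a simple layout: a header line '3 total_steps num_points', then one time value per step, then, for each node, its GlobalNodeID followed by one x y z row per step. A minimal reader sketch for that layout follows; read_motion_dat is a hypothetical helper, not part of the original code.

import numpy as np

def read_motion_dat(fn):
    # Parse a .dat file written by write_motion back into arrays.
    with open(fn) as f:
        ndim, total_steps, num_points = [int(v) for v in f.readline().split()]
        time_pts = np.array([float(f.readline()) for _ in range(total_steps)])
        node_ids = np.zeros(num_points, dtype=int)
        values = np.zeros((num_points, ndim, total_steps))
        for p in range(num_points):
            node_ids[p] = int(f.readline())
            for j in range(total_steps):
                values[p, :, j] = [float(v) for v in f.readline().split()]
    return time_pts, node_ids, values
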
Example #4
    def all_files_in_folder(self, dir_path, type, minify=True):
        type = type.lower()
        minifier = Minfier()
        if type not in self.allowed_ext:
            return

        result = ''
        for f in natural_sort(os.listdir(dir_path)):
            with open(os.path.join(dir_path, f), 'r', encoding='utf-8') as s:
                fdata = s.read()
            if minify:
                fdata = minifier.minify(fdata, type)
            result += fdata

        with open(os.path.join(dir_path, 'joined.' + type),
                  'w',
                  encoding='utf-8') as f:
            f.write(result)
    def load_model_input(self, input_file, synthetic=False, limit_n=-1, verbose=False):
        """ Load everything that a clustering model requires """

        # if this is a directory, process all files inside
        if os.path.isdir(input_file):

            # find all the .txt and csv files in input_dir
            input_dir = input_file
            filelist = []
            types = ('*.csv', '*.txt')
            starting_dir = os.getcwd() # save the initial dir to restore
            os.chdir(input_dir)
            for files in types:
                filelist.extend(glob.glob(files))
            filelist = utils.natural_sort(filelist)
            self.file_list = filelist
            
            # load the files
            file_id = 0
            data_list = []
            all_features = []
            for file_path in filelist:
                features, corr_mat = self.load_features(file_path, file_id, synthetic=synthetic)
                file_id += 1
                if limit_n > -1:
                    print "Using only " + str(limit_n) + " features from " + file_path
                    features = features[0:limit_n]
                data = PeakData(features, file_path, corr_mat=corr_mat)
                all_features.extend(features)
                data_list.append(data)
                sys.stdout.flush()
            os.chdir(starting_dir)                                
            return data_list
                    
        else:   
                     
            # process only a single file
            file_id = 0
            features, corr_mat = self.load_features(input_file, file_id, synthetic=synthetic)
            if limit_n > -1:
                features = features[0:limit_n]
            data = PeakData(features, input_file, corr_mat=corr_mat)
            return data
    def load_model_input(self, input_file, corr_rt_window, verbose=False):
        """ Load everything that a clustering model requires """

        # if this is a directory, process all files inside
        # input_file = os.path.abspath(input_file)
        # print os.path.isdir(input_file)

        if os.path.isdir(input_file):

            print 'Loading files from', input_file

            # find all the .txt and csv files in input_dir
            input_dir = input_file
            filelist = []
            types = ('*.csv', '*.txt')
            starting_dir = os.getcwd()  # save the initial dir to restore
            os.chdir(input_dir)
            for files in types:
                filelist.extend(glob.glob(files))
            filelist = utils.natural_sort(filelist)
            self.file_list = filelist

            # load the files
            file_id = 0
            data_list = []
            all_features = []
            for file_path in filelist:
                full_path = os.path.abspath(file_path)
                features, corr_adjacency = self.load_features(
                    full_path, file_id, corr_rt_window)
                file_id += 1
                data = PeakData(features, file_path, corr_mat=corr_adjacency)
                all_features.extend(features)
                data_list.append(data)
                sys.stdout.flush()
            os.chdir(starting_dir)
            return data_list

        else:

            print input_file, 'must be a directory containing the input file'
Example #8
def main():
    results_dir = 'results'
    base_path = '/home/maurice/iwgn_multivariate/{}'.format(results_dir)
    plot_path = os.path.join(base_path, 'plots')
    if not os.path.exists(base_path):
        sys.exit('Need a results dir to read from.')
    if not os.path.exists(plot_path):
        os.mkdir(plot_path)

    # Run evaluation for each dimension separately.
    for data_dim in [2, 4, 10]:
        # Set up dataframe to hold all run info, and get run names.
        df = pd.DataFrame()
        model_run_names = [
            n for n in os.listdir(base_path)
            if (('dim' + str(data_dim) in n) and ('run' in n))
        ]
        if len(model_run_names) == 0:
            print('No model run names found for dim{}'.format(data_dim))
            continue

        # For each model in the desired set, add the performance data to an array.
        for model_run_name in model_run_names:
            run_path = os.path.join(base_path, model_run_name)
            filename = os.path.join(run_path, 'perf.txt')

            # Extract performance info from the log file.
            run_performance = np.loadtxt(
                filename,
                dtype={
                    'names': ('model_type', 'model_subtype', 'dim', 'run',
                              'step', 'g_loss', 'mmd2_v', 'energy_v', 'kl_v',
                              'mmd2_t', 'energy_t', 'kl_t', 'time'),
                    'formats': ('|S24', '|S24', np.int64, np.int64, np.int64,
                                np.float64, np.float64, np.float64, np.float64,
                                np.float64, np.float64, np.float64, np.float64)
                },
                delimiter=',',
                skiprows=0)
            if len(run_performance.shape) == 0:
                run_performance = np.atleast_1d(run_performance)
            assert isinstance(run_performance, np.ndarray), (
                'file info must '
                'come as list, probably need more than one result per run.')

            # Add row to df.
            for row in run_performance:
                # Add row to dataframe.
                model = '{}_{}'.format(row['model_type'], row['model_subtype'])
                row_df = pd.DataFrame([[
                    model, row['model_type'], row['model_subtype'], row['dim'],
                    row['run'], row['step'], row['g_loss'], row['mmd2_v'],
                    row['energy_v'], row['kl_v'], row['mmd2_t'],
                    row['energy_t'], row['kl_t'], row['time']
                ]],
                                      columns=[
                                          'model', 'model_type',
                                          'model_subtype', 'dim', 'run',
                                          'step', 'g_loss', 'mmd2_v',
                                          'energy_v', 'kl_v', 'mmd2_t',
                                          'energy_t', 'kl_t', 'time'
                                      ])
                df = pd.concat([df, row_df], ignore_index=True)

        ##############################################
        ##############################################
        # PLOT PERFORMANCE ASSOCIATED WITH BEST VALIDATION DISCREPANCY

        # For each model, find run that performed best, based on validation.
        model_names = df.model.unique()
        performance_per_model = []

        for model_name in model_names:
            # Get all runs for that model, e.g. 10 runs.
            runs = natural_sort(
                [n for n in model_run_names if n.startswith(model_name)])

            # For each run, compute average g_loss and discrepancy measure in a
            # tail of available log steps. A run_outcomes array will store
            # g_loss and measure per run.
            tail = 2
            run_outcomes = np.zeros((len(runs), 2))
            for i, run_name in enumerate(runs):
                # Subset dataframe for this run.
                dim = int(run_name.split('_')[2].replace('dim', ''))
                run_num = int(run_name.split('_')[3].replace('run', ''))
                df_run_subset = df.loc[df['model'] == model_name]
                df_run_subset = df_run_subset.loc[df['dim'] == dim]
                df_run_subset = df_run_subset.loc[df['run'] == run_num]

                # Get tail performance for this run.
                validation_loss_ = np.mean(
                    df_run_subset[measure_str_v][-tail:])
                reported_loss_ = np.mean(df_run_subset[measure_str_t][-tail:])

                # Store performance for this run.
                run_outcomes[i] = [validation_loss_, reported_loss_]

            # For this model, get best validation_loss_ among runs, and store
            # associated name and reported_loss. Also store average
            # reported_loss among runs.
            best_validation_run_idx = np.argmin(run_outcomes[:, 0])
            best_validation_name = runs[best_validation_run_idx]
            best_validation_reported_loss = \
                run_outcomes[best_validation_run_idx, 1]
            avg_model_loss = np.mean(run_outcomes[:, 1])

            # Store final info for this model.
            performance_per_model.append([
                best_validation_name, best_validation_reported_loss,
                avg_model_loss
            ])

        # Plot performance per model.
        fig = plt.figure()
        for model_perf in performance_per_model:
            n = best_validation_name = model_perf[0]
            best_validation_reported_loss = model_perf[1]
            avg_model_loss = model_perf[2]

            model_name = n.split('_dim')[0]
            dim = int(n.split('_')[2].replace('dim', ''))
            run_num = int(n.split('_')[3].replace('run', ''))
            df_run_subset = df.loc[df['model'] == model_name]
            df_run_subset = df_run_subset.loc[df['dim'] == dim]
            df_run_subset = df_run_subset.loc[df['run'] == run_num]

            # Plot the test loss for the best validation model.
            plt.plot(df_run_subset.step,
                     df_run_subset[measure_str_t],
                     label=model_name)

        plt.xlabel('Step')
        plt.ylabel(measure)
        plt.legend()
        plt.savefig(
            os.path.join(base_path, 'plots',
                         'best_perf_dim{}.png'.format(data_dim)))
        plt.close()

        print(df.shape)
        for p in performance_per_model:
            print(('best_validation_reported_loss, avg_model_loss: '
                   '{:.5f}, {:.5f}, model: {}').format(p[1], p[2], p[0]))

        ####################################
        # PLOT JUST CE, JUST MMD, and ALL.

        #for measure in ['mmd', 'energy', 'kl']:
        #run_boxplots(df, measure, data_dim, case='special')
        #run_boxplots(df, measure, data_dim, case='ce')
        #run_boxplots(df, measure, data_dim, case='ce_iw_vs_up')
        #run_boxplots(df, measure, data_dim, case='no_cgan')

        email = 0
        if email:
            # The CE results.
            os.system(('echo "{}" | mutt [email protected] -s '
                       '"test_upsample" '
                       '-a results/plots/boxplot_ce_d{}_loss_energy.png '
                       '-a results/plots/boxplot_ce_d{}_loss_kl.png '
                       '-a results/plots/boxplot_ce_d{}_loss_mmd.png '
                       '-a results/plots/boxplot_ce_d{}_time.png').format(
                           os.getcwd(), data_dim, data_dim, data_dim,
                           data_dim))
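
For reference, the perf.txt rows parsed by np.loadtxt above are comma-separated with 13 fields matching the named dtype: model_type, model_subtype, dim, run, step, g_loss, mmd2_v, energy_v, kl_v, mmd2_t, energy_t, kl_t, time. A minimal sketch of writing one such row follows; the values are placeholders that only illustrate the expected format, not real results.

# Hypothetical perf.txt row in the 13-field, comma-separated format that
# np.loadtxt above expects; all values here are placeholders.
with open('perf.txt', 'a') as f:
    f.write('model_a,subtype_b,2,0,10000,0.123,0.010,0.020,0.030,0.011,0.021,0.031,42.0\n')
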
Example #9
def sequential_input(params, global_step=None, eval=False):
    """
    Input fn that reads tfrecords encoded with a fixed chunk size (== n_ctx + 1), and that either:

        - has the number of documents for each tfrecord file encoded in the title in the format
          <name>_<n_documents>.tfrecords.

          OR

        - has a fixed number of documents per tfrecord file.

    If the filename pattern above isn't matched, we assume that each tfrecord file holds the same number of documents as the first tfrecord read.
    If this isn't the case, it may result in errors or in some samples being missed.

    This means we can calculate the number of samples we've seen so far using the global step,
    and can use dataset.skip() to iterate through the list of filenames, as opposed to the whole dataset, which is incredibly inefficient.

    If training is starting and stopping often, as with TPU pre-emption, reading the whole dataset sequentially appears to improve model
    performance, as it results in less repeated data.
    """
    if not eval:
        assert global_step is not None
    logging.warning(
        "Changing batch size with sequential_input() will result in some data being skipped or repeated. Please ensure your batch size stays constant throughout training.")
    batch_size = params['eval_batch_size' if eval else 'train_batch_size']

    filenames = []
    for dataset_config in params['dataset_configs'].values():  # iterate through each dataset and read params
        path_key = 'path' if not eval else 'eval_path'
        path = dataset_config[path_key]
        filenames.extend(
            tf.io.gfile.glob(path))  # then glob all files that fit the pattern specified in dataset_configs

    filenames = natural_sort(filenames)
    shuffle_filenames = params.get("shuffle_input_filenames", True)
    if shuffle_filenames:
        seed = params.get('seed', 1)  # shuffle deterministically
        random.seed(seed)
        random.shuffle(filenames)

    dataset = tf.data.Dataset.from_tensor_slices(filenames).repeat()  # repeat filenames to infinity

    if not eval:
        # skip forward first in the filenames list, then skip the remaining amount in the parsed tfrecords files
        skip_idx, remainder = _get_skip_index(filenames, n_batches=global_step * params[
            "train_batch_size"])  # TODO: fix for > 1 epoch
        dataset = dataset.skip(skip_idx)  # skip to skip idx

        # read tfrecord examples and skip remainder
        dataset = dataset.apply(tf.data.TFRecordDataset)
        dataset = dataset.skip(remainder)
    else:
        # shuffle filenames if in eval mode
        dataset = dataset.shuffle(len(filenames))
        dataset = dataset.apply(tf.data.TFRecordDataset)

    # parse the tokenized data from the tfrecord files and shuffle
    dataset = dataset.map(_parse_function, num_parallel_calls=1)
    dataset = dataset.map(partial(autoregressive_sample_text, params), num_parallel_calls=1)

    # batch data and repeat to infinity
    dataset = dataset.batch(batch_size, drop_remainder=True).prefetch(params["iterations"] * 2)
    return dataset.repeat()
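
The docstring above relies on a <name>_<n_documents>.tfrecords naming convention to know how many documents each file holds. Below is a minimal sketch of pulling that count out of a filename; the helper name is hypothetical, and the actual skip logic lives in _get_skip_index.

import os
import re

def n_documents_from_filename(filename, default=None):
    # Matches names like 'openwebtext_12345.tfrecords' and returns 12345;
    # falls back to default when the filename does not follow the convention.
    match = re.search(r'_(\d+)\.tfrecords$', os.path.basename(filename))
    return int(match.group(1)) if match else default
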
Example #10
if __name__ == '__main__':
    from utils import natural_sort
    start = time.time()
    parser = argparse.ArgumentParser()
    
    parser.add_argument('--input_dir', help="Path to the surface meshes")
    parser.add_argument('--output_dir', help="Path to the volume meshes")
    parser.add_argument('--model_out', help="Name format of surface")
    parser.add_argument('--edge_size', type=float, help="Maximum edge size of the volumetric mesh.")
    parser.add_argument('--phase', default=-1, type=int, help="Id of the phase to generate volume mesh")
    args = parser.parse_args()
    
    input_dir = args.input_dir

    if args.phase == -1:
        try:
            volume_fn = np.load(os.path.join(input_dir, "volume.npy"))
            phase = volume_fn[:,0][int(np.argmax(volume_fn[:,1]))]
        except Exception:
            print("Mesh volumes not found, the first model will be meshed")
            phase = 0
    else:
        phase = args.phase
    surface_fns = natural_sort(glob.glob(os.path.join(args.input_dir, '*.vtp')))
    poly_fn = surface_fns[int(phase)]
    
    create_volume_mesh(poly_fn, args.edge_size, args.output_dir)
    end = time.time()
    print("Time spent in volume_mesh_main.py: ", end-start)
    print("Mesh generated for ", poly_fn)
Example #11
def main():

    # Request data from all parties simultaneously
    num_threads = 3
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        parties = ['ensembl', 'ucsc', 'genomaize']
        for result in pool.map(pool_processing, parties):
            party = result[0]
            if party == 'genomaize':
                zea_mays_centromeres = result[1]
            else:
                party_list.append(result)

    logger.info('')
    logger.info('UCSC databases not mapped to GenBank assembly IDs:')
    logger.info(', '.join(unfound_dbs))
    logger.info('')

    # Third parties (e.g. UCSC, Ensembl) can have data for the same organism.
    # Convert any such duplicate data into a non-redundant (NR) organism map.
    nr_org_map = {}
    seen_orgs = {}
    for party, org_map in party_list:
        logger.info('Iterating organisms from ' + party)
        for org in org_map:
            logger.info('\t' + org)
            if org in seen_orgs:
                logger.info('Already saw ' + org)
                continue
            nr_org_map[org] = org_map[org]

    manifest = {}

    for org in nr_org_map:

        asm_data = sorted(nr_org_map[org], reverse=True)[0]
        genbank_accession, db, bands_by_chr = asm_data

        manifest[org] = [genbank_accession, db]

        if org == 'drosophila-melanogaster':
            bands_by_chr = patch_telomeres(bands_by_chr)

        # Assign cytogenetic arms for each band
        if org == 'zea-mays':
            bands_by_chr = merge_centromeres(bands_by_chr,
                                             zea_mays_centromeres)
        else:
            bands_by_chr = parse_centromeres(bands_by_chr)

        # Collapse chromosome-to-band dict, making it a list of strings
        band_list = []
        chrs = natural_sort(list(bands_by_chr.keys()))
        for chr in chrs:
            bands = bands_by_chr[chr]
            for band in bands:
                band_list.append(chr + ' ' + ' '.join(band))

        # Write actual cytoband data to file,
        # e.g. ../data/bands/native/anopheles-gambiae.js
        with open(output_dir + org + '.js', 'w') as f:
            f.write('window.chrBands = ' + str(band_list))

    logger.info('')

    # How long did each part take?
    logger.info('time_ucsc:')
    logger.info(time_ucsc)
    logger.info('time_ncbi:')
    logger.info(time_ncbi)
    logger.info('time_ensembl:')
    logger.info(time_ensembl)

    return manifest
Example #12
        #ASSUMING increment is 1
        fn_poly = os.path.join(output_dir, os.path.basename(moving_im_fn)+'.vtp')
        new_lvmodel.write_surface_mesh(fn_poly)
        volume.append([(index + 1) % TOTAL_PHASE, new_lvmodel.get_volume()])

    np.save(os.path.join(output_dir, "volume.npy"), volume)
    return

if __name__=='__main__':
    start = time.time()
    import argparse
    parser = argparse.ArgumentParser()
    
    parser.add_argument('--image_dir', help='Path to the ct/mr images or segmentation results')
    parser.add_argument('--mask_dir', help='Path to the mask file')
    parser.add_argument('--surface_dir', help='Path to the unregistered surface meshes')
    parser.add_argument('--output_dir', help='Path to the registered surface meshes')
    parser.add_argument('--start_phase', type=int, help='Phase ID of the surface mesh used as the registration target')
    parser.add_argument('--edge_size', type=float, help='Maximum edge size of the surface mesh')
    parser.add_argument('--image_file_extension', default='nii.gz', help='Extension of the images or segmentation results')
    args = parser.parse_args()
    
    #
    image_fns = natural_sort(glob.glob(os.path.join(args.image_dir, '*.'+args.image_file_extension)))
    mask_fns = natural_sort(glob.glob(os.path.join(args.mask_dir, '*.'+args.image_file_extension)))
    surface_fns = natural_sort(glob.glob(os.path.join(args.surface_dir, '*.vtp')))
    lvmodel = LeftVentricle(io_utils.read_vtk_mesh(surface_fns[args.start_phase]), edge_size=args.edge_size )
    registration(lvmodel, args.start_phase, image_fns, args.output_dir, mask_fns)
    end = time.time()
    print("Time spent in elastix_main.py: ", end-start)