Example no. 1
    def __init__(self, image_repository, path_columns, savefile,
                 registrator=None, n_processes=mp.cpu_count(), debug=False):

        self.debug = debug
        print('initializing analysis...',
              '\timage repository:\t{}'.format(image_repository),
              '\tpath columns:\t{}'.format(path_columns),
              '\tsavefile:\t{}'.format(savefile),
              '\tprocesses:\t{}'.format(n_processes),
              '\tmeasurements:\t{}'.format(list(MEASUREMENTS.keys())),
              '\tdenoising methods:\t{}'.format(list(METHODS.keys())), sep='\n')

        self.methods = METHODS.copy()
        self.measurements = MEASUREMENTS.copy()

        self.savefile = savefile
        self.image_repository = image_repository
        self.path_columns = path_columns
        self.pool = mp.Pool(n_processes)
        self.registrator = registrator if isinstance(registrator, Registrator) else Registrator(verbose=debug, graphic=debug)
        self.denoising = None

        # make save dir if it does not exist
        save_path = os.path.dirname(self.savefile)
        if not os.path.isdir(save_path):
            os.makedirs(save_path)

        print('done!\n')
Example no. 2
    def run_workers(self, _num_work, _type, _configs, _args):
        """
        Starts the Pool of Workers and executes them.

        The method blocks until all workers have completed. However, it also starts a background update-thread which
        publishes information about progress.

        :param _num_work:   Number of workers to initialise
        :param _type:       The worker type to run
        :param _configs:    Common configuration passed to every worker; may be None
        :param _args:       Per-worker arguments. Must be a list equal in length to _num_work, or None
        :return:            Result of the Aggregator
        """
        # Reset Everything
        self._reset(_num_work)
        _args = _args if _args is not None else [
            None for _ in range(_num_work)
        ]

        # Prepare the Progress Bar: will automatically handle None
        self.__progress = ProgressBar(100 * _num_work, sink=self.__sink.Obj)

        # Create List of Worker Objects, and initialise thread
        _workers = [_type(_i + 1, self) for _i in range(_num_work)]
        self.__thread.start()

        # Start Pool and aggregate results
        if self.NumProc > 0:
            with mp.Pool(processes=self.NumProc) as pool:
                processes = [
                    pool.apply_async(self.__computer,
                                     args=(_workers[_i], (_configs,
                                                          _args[_i])))
                    for _i in range(_num_work)
                ]
                aggregated = self._aggregate_results(
                    [result.get() for result in processes])
        else:
            r_q = queue.Queue()
            threads = [
                threading.Thread(target=self.__threader,
                                 args=(_workers[_i], (_configs, _args[_i]),
                                       r_q)) for _i in range(_num_work)
            ]
            for thr in threads:
                thr.start()
                thr.join()
            results = []
            while not r_q.empty():
                results.append(r_q.get())
                r_q.task_done()
            aggregated = self._aggregate_results(results)

        # Inform and join thread
        self.Queue.put([0, -1])
        self.__thread.join()

        # Return the aggregated information
        return aggregated
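The docstring above describes the dispatch pattern used here: one apply_async call per worker, each receiving the shared config plus its own argument, with a thread-based fallback when no processes are requested. Below is a minimal, self-contained sketch of the same pattern; run_all and _compute are hypothetical stand-ins, and the original class machinery (self.__computer, aggregation, the progress thread) is omitted.

import multiprocessing as mp
import queue
import threading

def _compute(worker_id, common, arg):
    # hypothetical stand-in for the original __computer: combine the shared
    # config with this worker's own argument
    return worker_id, common + arg

def run_all(num_work, common, args_list, num_proc):
    # one task per worker; fall back to threads when num_proc == 0
    if num_proc > 0:
        with mp.Pool(processes=num_proc) as pool:
            handles = [pool.apply_async(_compute, args=(i + 1, common, args_list[i]))
                       for i in range(num_work)]
            return [h.get() for h in handles]
    r_q = queue.Queue()
    threads = [threading.Thread(target=lambda i=i: r_q.put(_compute(i + 1, common, args_list[i])))
               for i in range(num_work)]
    for thr in threads:
        thr.start()
    for thr in threads:
        thr.join()
    return [r_q.get() for _ in range(num_work)]

if __name__ == '__main__':
    print(run_all(4, 10, [1, 2, 3, 4], num_proc=2))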
Example no. 3
def iterate_in_parallel(method, nproc=1, iterkeys=None, **params):
    ''' Evaluate a given method for a given parameter set.

        params is a dict and some of its values are allowed to be iterable.
        The method is expected to return a dict with the SAME KEYS for every parameter combination in one iteration.
        An exception occurring in the method is not handled here and will stop the iteration.
    '''
    # find the parameters to be iterated through
    iterkeys2 = [key for key in params if hasattr(params[key], "__iter__")]

    if iterkeys is None:
        iterkeys = iterkeys2
    elif set(iterkeys) <= set(iterkeys2):
        for key in iterkeys:
            iterkeys2.remove(key)
        iterkeys = iterkeys + iterkeys2
    else:
        print("I'm ignoring your iterkeys.")
        iterkeys = iterkeys2

    # create stamp of the input
    stamp = dict(params)
    stamp["iterkeys"] = iterkeys
    stamp["method"] = method.__name__

    # create list of params instances to be mapped
    iterator = combinations(params, iterkeys)

    # create the function to be mapped with
    # NOTE: a locally defined function like this cannot be pickled by the standard
    # pickler, so the multiprocessing.Pool branch below will typically fail;
    # move f to module level (or use a picklable wrapper) if nproc > 1 is needed
    def f(params):
        return method(**params)

    # map iterator using mpi4py
    # FIXME: doesn't work if some dolfin function are used, e.g. Function.extrapolate
    if MPI.COMM_WORLD.Get_size() > 1:
        result = mpimap(f, iterator)
    # map iterator using multiprocessing.Pool
    # FIXME: this approach of distributing across multiple processors is inconvenient
    #        since a single error kills the whole simulation.
    #        (not necessarily, error can be catched and displayed by method)
    #        also it's not supposed to be appropriate for HPC architectures
    elif nproc > 1:
        pool = mp.Pool(nproc)
        result = pool.map(f, iterator)
        pool.close()
        pool.join()
    # map in serial
    else:
        result = map(f, iterator)

    return join_dicts(result), stamp
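As a usage sketch (hedged: toy_method is made up, and the call relies on this module's combinations/join_dicts helpers behaving as described above), iterating one parameter while keeping another fixed looks like this:

def toy_method(a, b):
    # must return a dict with the same keys for every parameter combination
    return {"sum": a + b, "product": a * b}

# 'a' is iterated over, 'b' stays fixed; nproc=1 keeps everything in one process,
# which also sidesteps the pickling caveat noted in the code above.
result, stamp = iterate_in_parallel(toy_method, nproc=1, a=[0.1, 0.2, 0.4], b=2.0)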
Example no. 4
def main(patientFile, procedureFile, observationFile, conditionFile, coreNum):

    patientRecs, IDs = util.combineDatasets(patientFile, procedureFile,
                                            observationFile, conditionFile)

    # each worker gets the full record set plus its own slice of the IDs
    patientRecCopy = list(repeat(patientRecs, coreNum))
    splitIDs = np.array_split(list(IDs), coreNum)

    pooler = mp.Pool(coreNum)

    with open('AggregatePatientData.csv', 'a') as fout:
        for result in pooler.starmap(util.AggregateQuantValues,
                                     zip(patientRecCopy, splitIDs)):
            result.to_csv(fout, index=False, header=False)

    pooler.close()
    pooler.join()
Example no. 5
def gen_programs(program_len, num_programs, args):
    """
    Generates the specified number of programs of the given length. These are the exact steps performed:
    1. Generate <num_programs> programs using gen_program_worker in a process pool
    2. Generate examples for each program by executing gen_examples_worker in a process pool.
       Discard programs for which the required amount of examples could not be generated.
    3. Return a dictionary of the form {program: examples}
    """
    progress_counter = multiprocessing.Value('i', 0)
    gen_prog_pool = multiprocessing.Pool(processes=args.num_workers,
                                         initializer=init_gen_prog_worker,
                                         initargs=(progress_counter,
                                                   num_programs, program_len))

    input_type_combinations = get_input_type_combinations(params.num_inputs)
    programs = gen_prog_pool.map(gen_program_worker, input_type_combinations)
    # release the worker processes; gen_programs is called once per program length
    gen_prog_pool.close()
    gen_prog_pool.join()
    print('')

    # Flatten
    programs = [item for sublist in programs for item in sublist]
    programs = list(set(programs))

    # Generate examples and filter out null programs
    progress_counter.value = 0
    valid_counter = multiprocessing.Value('i', len(programs))
    gen_examples_pool = multiprocessing.Pool(
        processes=args.num_workers,
        initializer=init_gen_examples_worker,
        initargs=(progress_counter, valid_counter, len(programs),
                  args.num_examples, args.num_example_tries))

    res = gen_examples_pool.map(gen_examples_worker, programs)
    gen_examples_pool.close()
    gen_examples_pool.join()
    print('')
    examples = dict(zip(programs, res))
    examples = {k: v for k, v in examples.items() if v}
    return examples
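The initializer/initargs technique used above to hand each worker process a shared progress counter is easy to isolate. A minimal self-contained sketch of that pattern; the _init_worker/_work names are illustrative, not from this codebase:

import multiprocessing

_counter = None  # set in each worker by the initializer

def _init_worker(counter):
    global _counter
    _counter = counter

def _work(x):
    # increment the shared counter under its lock, then do the actual work
    with _counter.get_lock():
        _counter.value += 1
    return x * x

if __name__ == '__main__':
    progress = multiprocessing.Value('i', 0)
    with multiprocessing.Pool(processes=4, initializer=_init_worker,
                              initargs=(progress,)) as pool:
        results = pool.map(_work, range(10))
    print(results, progress.value)  # progress.value == 10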
Example no. 6
def ParallelRxnBuildWriteOutRd2(LabeledList, BuildFunction, RoundNum):

    #No chunks for round 2
    SplitList = numpy.array_split(LabeledList, 16)

    #Fill new list of labeled cpds
    NewCpdList = []

    #Build multiprocessing pool object with num of cpus
    pooler = mp.Pool(16)

    #Set up the new file
    with open('TempPandaDF.csv', 'w') as fp:

        #For each result in the Build function using an item from SplitList
        for result in pooler.imap(BuildFunction, SplitList):

            #Building new cpd list
            NewCpds = NewLabeledCpdList(result)
            NewCpdList.extend(NewCpds)

            #Each result is a Pandas Object, so write it to csv
            result.to_csv(fp, index=False, header=False)

    pooler.close()
    pooler.join()

    #Unique items
    NewCpdList = list(set(NewCpdList))

    #Write File, optional
    #CpdFile='LabeledCpds_FromRound_{0}.csv'.format(RoundNum) #Change
    #with open(CpdFile,'w') as output:
    #    writer=csv.writer(output,lineterminator='\n')
    #    for val in NewCpdList:
    #        writer.writerow([val])

    #Return CpdList for next round
    return NewCpdList
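The core of the function above is the imap streaming pattern: split the input, let the pool process one chunk at a time, and write each pandas result to disk as soon as it arrives. A self-contained sketch of just that pattern; process_chunk and the output file name are illustrative:

import multiprocessing as mp
import numpy
import pandas as pd

def process_chunk(chunk):
    # stand-in for BuildFunction: turn a chunk of inputs into a DataFrame
    return pd.DataFrame({'value': chunk * 2})

if __name__ == '__main__':
    chunks = numpy.array_split(numpy.arange(100), 16)
    with mp.Pool(4) as pooler, open('TempDemo.csv', 'w') as fp:
        # results are written as they arrive instead of being held in memory
        for result in pooler.imap(process_chunk, chunks):
            result.to_csv(fp, index=False, header=False)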
Example no. 7
def BuildPathsCoreParallel(ResultDictionaryNum, metabname, userinputname,
                           StopDictNum):

    #The starting set of reactants that led to the product isotopomer of interest
    StartingSet = BuildOneRoundPath(ResultsList[ResultDictionaryNum],
                                    metabname)

    if len(StartingSet) > 0:

        #Starter path matrix
        SeedPath = AddOnPath(StartingSet, ResultsList[ResultDictionaryNum - 1])

        if SeedPath is not None:
            try:
                #Go one more path matrix
                SeedPath = AddOnPath(SeedPath,
                                     ResultsList[ResultDictionaryNum - 2])

                pooler = mp.Pool(16)

                #Append ('a') rather than overwrite ('w') so that looping through
                #shorter path lengths keeps adding to the same file
                OutFile = '{0}_Paths_{1}Rxns.csv'.format(userinputname,
                                                         StopDictNum)

                #Retry with progressively fewer splits: if a pass fails, some of
                #these paths cannot be connected at that split size
                for NumSplits in (32, 16, 8, 4, 2):
                    try:
                        SeedPathSplit = list(numpy.array_split(SeedPath,
                                                               NumSplits))
                        StopDictRepeat = list(
                            repeat(ResultDictionaryNum - 2, NumSplits))

                        with open(OutFile, 'a') as fp:
                            for result in pooler.starmap(
                                    AddOnPathParallel3,
                                    zip(SeedPathSplit, StopDictRepeat)):
                                result.to_csv(fp, index=False, header=False)

                        pooler.close()
                        pooler.join()
                        gc.collect()
                        break

                    except:
                        #Try the next split size
                        continue

                else:
                    #Every parallel attempt failed: fall back to the serial builder
                    try:
                        Output = BuildPathsCoreNonParallel(
                            ResultDictionaryNum - 2, SeedPath, metabname)

                        with open(OutFile, 'a') as fp:
                            Output.to_csv(fp, index=False, header=False)

                        pooler.close()
                        pooler.join()
                        gc.collect()

                    except:
                        print('Paths cannot be built')
                        pooler.close()
                        pooler.join()
                        gc.collect()

                        return

            except:
                print('Isotopomer Failed')
                pooler.close()
                pooler.join()
                gc.collect()
                return
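The retry logic above is a "first split size that works" loop, which Python expresses with for/else: the else branch runs only when the loop never hit break. A tiny illustration of the idiom, with made-up names:

def first_that_works(candidates, attempt):
    # try each candidate in order; fall through to else only if all of them raise
    for candidate in candidates:
        try:
            result = attempt(candidate)
        except Exception:
            continue  # this candidate failed, try the next one
        break         # success: skip the else clause
    else:
        raise RuntimeError('every candidate failed')
    return result

# first_that_works([0, 2, 4], lambda n: 10 / n) skips 0 (ZeroDivisionError) and returns 5.0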
Example no. 8
def main():
    """
    Generates programs. These are the basic steps performed:

    D = {}
    for 1 <= i <= max_train_len:
       1. P = Generate programs of length i
       2. E = Generate examples for the generated programs
       3. Discard programs in P that are equivalent to any program in D
       4. D += (P, E)

    for j in test_lengths:
      Sample num_test programs
      Discard all programs of equal length in D which are equivalent.

    Note:
        1. Step 3 of the first loop greatly increases the richness of the dataset. It ensures that
           our programs are unlikely to have shorter equivalents.
        2. It is recommended to use --cache to load a dataset cache. The algorithm then continues generating
           for lengths larger than the maximum length in the cache. This allows incremental dataset generation and
           also helps with the generation of shorter programs, where generation is slow due to randomness. Furthermore,
           we can (and should!) include virtually all programs of length <= 3, to ensure the dataset is meaningful.
        3. During test sampling we only compare against programs of the same length, for efficiency. This is because
           the data generation algorithm already guarantees that no longer or shorter program is equivalent.
        4. Since the pruning is done after program generation, rather than during it, the number of programs generated
           in each iteration is NOT args.num_train. This is purely an implementation detail: discarding while
           generating would require all processes to read and write the same dictionary in parallel. Pruning during
           generation would be a good future improvement, as it would avoid having to tune num_train by trial and error.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--num_train', type=int, required=True)
    parser.add_argument('--num_test', type=int, required=True)
    parser.add_argument('--train_output_path', type=str, required=True)
    parser.add_argument('--test_output_path', type=str, required=True)
    parser.add_argument('--max_train_len', type=int, required=True)
    parser.add_argument('--min_train_len', default=5, type=int,
                        required=False)  # me
    parser.add_argument('--test_lengths',
                        type=str,
                        required=True,
                        help="List of test lengths to generate")
    parser.add_argument('--num_workers', type=int, default=8)
    parser.add_argument('--num_examples',
                        type=int,
                        default=params.num_examples)
    parser.add_argument(
        '--num_example_tries',
        type=int,
        default=200,
        help='total number of attempts to generate examples per program')
    parser.add_argument(
        '--cache',
        type=str,
        default=None,
        help="Dataset cache from which to continue generating programs")
    args = parser.parse_args()

    test_lens = set([int(x) for x in args.test_lengths.split()])

    if args.min_train_len != -1:
        examples = {}
        min_len = args.min_train_len - 1  # as following loops start from +1
    else:
        if args.cache:
            examples = load_cache(args.cache)
            min_len = max([len(k) for k in examples])
        else:
            examples = {}
            min_len = 0

    for program_len in range(min_len + 1, args.max_train_len + 1):
        num_programs = args.num_train + args.num_test
        if program_len in KNOWN_TRAIN_SIZES:
            num_programs = min(num_programs, KNOWN_TRAIN_SIZES[program_len])

        print("Generating programs of length %d (current dataset size: %d)" %
              (program_len, len(examples)))
        new_examples = gen_programs(program_len, num_programs, args)

        existing_programs = list(examples.keys())
        counter = multiprocessing.Value('i', 0)
        new_programs = list(new_examples.keys())
        discard_pool = multiprocessing.Pool(
            processes=args.num_workers,
            initializer=init_discard_identical_worker,
            initargs=(existing_programs, counter, len(new_programs)))
        new_program_parts = [
            new_programs[i::args.num_workers] for i in range(args.num_workers)
        ]

        new_example_parts = [{p: new_examples[p]
                              for p in programs}
                             for programs in new_program_parts]
        res = discard_pool.map(discard_identical_worker, new_example_parts)
        print('')
        for d in res:
            examples.update(d)

    train_programs = list(examples.keys())
    print("Finished generation. Total programs: %d" % len(train_programs))

    # Generate test programs (they're not equivalent to all shorter programs so only same length needs to be considered)
    for test_len in test_lens:
        test_programs = []
        test_candidates = [
            x for x in train_programs if len(x.statements) == test_len
        ]
        train_programs = [
            x for x in train_programs if len(x.statements) != test_len
        ]

        random.shuffle(test_candidates)
        indices_to_discard = set()
        for i, program in enumerate(test_candidates):
            if len(test_programs) >= args.num_test:
                break
            if i in indices_to_discard:
                continue

            print("\rCreating test programs for length %d... %d\\%d" %
                  (test_len, len(test_programs), args.num_test),
                  end="")

            test_programs.append(program)
            indices_to_discard.add(i)

            # compare against the remaining candidates, tracking absolute indices
            for j, other in enumerate(test_candidates[i + 1:], start=i + 1):
                if j in indices_to_discard:
                    continue
                if constraint.is_same(program, other, examples[program]):
                    indices_to_discard.add(j)
        print('')

        print("Removed %d programs" % len(indices_to_discard))
        train_programs += [
            test_candidates[i] for i in range(len(test_candidates))
            if i not in indices_to_discard
        ]

        output_path = args.test_output_path + '_' + str(test_len)
        print('Writing %d test programs to %s' %
              (len(test_programs), output_path))
        with open(output_path, 'w') as f:
            write_programs_to_file(f, test_programs, examples)

    print('Writing %d train programs to %s' %
          (len(train_programs), args.train_output_path))
    with open(args.train_output_path, 'w') as f:
        write_programs_to_file(f, train_programs, examples)
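The deduplication work above is spread across workers by stride slicing (new_programs[i::args.num_workers]), which deals the programs out round-robin. A quick illustration of that split:

def round_robin_split(items, n_parts):
    # items[i::n_parts] takes every n_parts-th element starting at offset i
    return [items[i::n_parts] for i in range(n_parts)]

# round_robin_split(list(range(10)), 3) -> [[0, 3, 6, 9], [1, 4, 7], [2, 5, 8]]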
Example no. 9
    
    #Clean hideous lists
    for enz in range(len(PathDF)):
        #.at avoids chained-indexing assignment, which can silently write to a copy
        PathDF.at[enz, 'Circadian Enzymes'] = list(set(re.split(
            r'\,', re.sub(r'\\|\'|\[|\]|\"| ', '',
                          str(PathDF['Circadian Enzymes'][enz])))))

    return PathDF

#Read in the results of the isotopologue file of interest
rxnlength=int(re.findall(r'(\d+)Rxns', 'Serine M+3_Paths_9Rxns.csv')[0])
MetabDF=pd.read_csv('Serine M+3_Paths_9Rxns.csv',header=None,error_bad_lines=False, names=list(range(rxnlength*2+1)))

#Clean up serine
ParaNum = 2
PathMatrixSplit=list(np.array_split(MetabDF,ParaNum)) #this could give an error if the number of rows is < ParaNum
    
pooler=mp.Pool(ParaNum)

Call = list(np.repeat('Filter', ParaNum))
#In case it exists already
if 'Trimmed_Paths.csv' in os.listdir():
    os.remove('Trimmed_Paths.csv')

with open('Trimmed_Paths.csv','a') as fp: #originally 'w' - but append for looping through shorter path lengths
    #written out to file as it is being built
    #for result in pooler.imap(MatchCircECHitsAndGetGibbsFilter,PathMatrixSplit):
    for result in pooler.starmap(MatchCircECHitsAndGetGibbs,zip(PathMatrixSplit, Call)):
        #Each result is a Pandas Object, so write it to csv
        result.to_csv(fp,index=False,header=False)

pooler.close()
pooler.join()
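The "clean hideous lists" step earlier in this example strips backslashes, quotes, brackets and spaces from a stringified list and then splits on commas to recover the unique enzyme identifiers. A short demonstration with a made-up cell value:

import re

# hypothetical raw cell: a list that was stored as its string representation
raw = "['2.7.1.1', '1.1.1.27', '2.7.1.1']"

cleaned = re.sub(r'\\|\'|\[|\]|\"| ', '', raw)   # -> "2.7.1.1,1.1.1.27,2.7.1.1"
enzymes = list(set(re.split(r'\,', cleaned)))    # -> unique EC numbers
print(sorted(enzymes))                           # ['1.1.1.27', '2.7.1.1']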