Example #1
class MapParallel(PipelineBlock):
    def __init__(self, function, n_processes=None):
        self.function = _MapFunctionClosure(function)
        self.pool = Pool(processes=n_processes)

    def run(self, input_data):
        return self.pool.imap(self.function, input_data, chunksize=1)
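The wrapper above depends on PipelineBlock and _MapFunctionClosure, which are not shown; the closure object presumably exists because Pool workers can only receive picklable callables (a plain lambda or nested function would fail to pickle). A minimal standalone sketch of the same Pool.imap pattern, using only the standard library and a hypothetical square worker:

from multiprocessing import Pool

def square(x):
    # the worker must be a module-level function so it can be pickled
    return x * x

if __name__ == '__main__':
    with Pool(processes=4) as pool:
        # imap returns a lazy iterator; results arrive in input order
        for result in pool.imap(square, range(10), chunksize=1):
            print(result)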
Example #2
    def _itergroundings(self, simplify=False, unsatfailure=False):
        global global_bpll_grounding
        global_bpll_grounding = self
        if self.multicore:
            pool = Pool(maxtasksperchild=1)
            try:
                for gndresult in pool.imap(with_tracing(create_formula_groundings), self.formulas):
                    for fidx, stat in gndresult:
                        for (varidx, validx, val) in stat:
                            self._varidx2fidx[varidx].add(fidx)
                            self._addstat(fidx, varidx, validx, val)
                        checkmem()
                    yield None
            except CtrlCException as e:
                pool.terminate()
                raise e
            pool.close()
            pool.join()
        else:
            for gndresult in imap(create_formula_groundings, self.formulas):
                for fidx, stat in gndresult:
                    for (varidx, validx, val) in stat:
                        self._varidx2fidx[varidx].add(fidx)
                        self._addstat(fidx, varidx, validx, val)
                yield None
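Stripped of the grounding logic, the shutdown pattern above reduces to the sketch below; ground is a hypothetical stand-in for create_formula_groundings, and the project-specific CtrlCException is broadened to a bare BaseException so the pool is also terminated on Ctrl-C:

from multiprocessing import Pool

def ground(formula):
    # hypothetical worker standing in for create_formula_groundings
    return formula.upper()

if __name__ == '__main__':
    pool = Pool(maxtasksperchild=1)
    try:
        for result in pool.imap(ground, ['f1', 'f2', 'f3']):
            print(result)
        pool.close()          # normal path: no more tasks, workers exit cleanly
    except BaseException:
        pool.terminate()      # on any error (incl. Ctrl-C), kill the workers
        raise
    finally:
        pool.join()           # wait for worker shutdown in both cases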
Example #3
    def _itergroundings(self, simplify=True, unsatfailure=True):
        # generate all groundings
        if not self.formulas:
            return
        global global_fastConjGrounding
        global_fastConjGrounding = self
        batches = list(rndbatches(self.formulas, 20))
        batchsizes = [len(b) for b in batches]
        if self.verbose:
            bar = ProgressBar(width=100, steps=sum(batchsizes), color='green')
            i = 0
        if self.multicore:
            pool = Pool()
            try:
                for gfs in pool.imap(with_tracing(create_formula_groundings), batches):
                    if self.verbose:
                        bar.inc(batchsizes[i])
                        bar.label(str(cumsum(batchsizes, i + 1)))
                        i += 1
                    for gf in gfs:
                        yield gf
            except Exception as e:
                logger.error('Error in child process. Terminating pool...')
                pool.close()
                raise e
            finally:
                pool.terminate()
                pool.join()
        else:
            for gfs in imap(create_formula_groundings, batches):
                if self.verbose:
                    bar.inc(batchsizes[i])
                    bar.label(str(cumsum(batchsizes, i + 1)))
                    i += 1
                for gf in gfs:
                    yield gf
Example #4
class FilterParallel(PipelineBlock):
    def __init__(self, function, n_process=None):
        self.function = self._construct_filter_function(function)
        self.pool = Pool(processes=n_process)

    def _construct_filter_function(self, function):
        return _FilterFunctionClosure(function)

    def run(self, input_data):
        return self.pool.imap(self.function, input_data, chunksize=1)
Example #5
def crawl():
    pool = Pool(cpu_count() - 2)
    image_list, num_images = load_image_list(args.list_file)
    print('Loaded {} images'.format(num_images))
    cleaned_image_list, cleaned_num_images = clean_image_list(image_list)
    print('{} images to crawl'.format(cleaned_num_images))
    pbar = get_progress_bar(cleaned_num_images)

    for i, _ in enumerate(pool.imap(crawl_job, cleaned_image_list), 1):
        pbar.update(i)
    pbar.finish()
    Image.save_image_list(image_list, args.image_cache)
    Landmark.save_all(args.landmark_cache)
    logging.info('All done')
Example #6
def main():
    print('Starting.')
    args = parse_args()
    pool = Pool()
    runs = find_runs(args.source_folder, args.target_folder)
    runs = report_source_versions(runs)
    samples = read_samples(runs)
    # noinspection PyTypeChecker
    results = pool.imap(partial(compare_sample,
                                scenarios_reported=Scenarios.OTHER_CONSENSUS_CHANGED),
                        samples,
                        chunksize=50)
    scenario_summaries = defaultdict(list)
    i = None
    all_consensus_distances = []
    report_count = 0
    for i, (report, scenarios, consensus_distances) in enumerate(results):
        if report:
            report_count += 1
            if report_count > 100:
                break
        print(report, end='')
        all_consensus_distances.extend(consensus_distances)
        for key, messages in scenarios.items():
            scenario_summaries[key] += scenarios[key]
    for key, messages in sorted(scenario_summaries.items()):
        if messages:
            sample_names = {message.split()[0] for message in messages}
            summary = [key, len(messages), 'changes']
            body = ''.join(messages).rstrip('.')
            if body:
                summary.extend(['in', len(sample_names), 'samples'])
            print(*summary, end='.\n')
            print(body, end='')

    distance_data = pd.DataFrame(all_consensus_distances)
    non_zero_distances = distance_data[distance_data['distance'] != 0]
    region_names = sorted(non_zero_distances['region'].unique())
    names_iter = iter(region_names)
    for page_num, region_group in enumerate(zip_longest(names_iter, names_iter, names_iter), 1):
        group_distances = distance_data[distance_data['region'].isin(region_group)]
        plot_distances(group_distances,
                       'consensus_distances_{}.svg'.format(page_num),
                       'Consensus Distances Between Previous and v' + MICALL_VERSION)
        plot_distances(group_distances,
                       'consensus_diffs_{}.svg'.format(page_num),
                       'Consensus Differences Between Previous and v' + MICALL_VERSION,
                       'pct_diff')
    print('Finished {} samples.'.format(i))
Example #7
class FoldParallel(PipelineBlock):
    def __init__(self, function, n_process=None):
        self.function = function
        self.pool = Pool(processes=n_process)

    def _construct_fold_function(self, function):
        return _FoldFunctionClosure(function)

    def run(self, input_data):
        batch_function = self._construct_fold_function(self.function)
        return self._fold_stream(self.pool.imap(batch_function, input_data, chunksize=1))

    def _fold_stream(self, input_data):
        input_iter = iter(input_data)
        x = next(input_iter)
        for element in input_iter:
            x = self.function(x, element)
        return x
Example #8
    def _train_base(self, compute_vector, entity_word_seqs):
        pool = Pool()

        entities = {}
        vectors = []

        def idx_seqs():
            for idx, (entity, seq) in enumerate(entity_word_seqs):
                entities[entity] = idx
                yield seq

        for vec in pool.imap(compute_vector, idx_seqs()):
            vectors.append(vec)

            if len(vectors) % 1000 == 0:
                logging.info("Computed %d vectors", len(vectors))

        self.entities = entities
        self.vectors = np.asarray(vectors)
Example #9
def raw_line_map(
    filename, line_length, func, start=0, stop=-1, threads=1, pass_teletext=True, pass_rejects=False, show_speed=True
):

    if show_speed:
        s = SpeedMonitor()

    if threads > 0:
        p = Pool(threads)
        map_func = lambda x, y: p.imap(x, y, chunksize=1000)
    else:
        map_func = itertools.imap

    for l in map_func(func, raw_line_reader(filename, line_length, start, stop)):
        if show_speed:
            s.tally(l.is_teletext)
        if l.is_teletext:
            if pass_teletext:
                yield l
        else:
            if pass_rejects:
                yield l
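The lambda above asks imap for chunks of 1000 lines, whereas several of the other examples use chunksize=1. A rough sketch of the trade-off, with a hypothetical cheap work function (absolute timings depend on the machine): larger chunks amortize inter-process communication, smaller chunks give finer scheduling and earlier first results.

import time
from multiprocessing import Pool

def work(x):
    # deliberately cheap task, so serialization/IPC overhead dominates
    return x + 1

if __name__ == '__main__':
    with Pool(4) as pool:
        for chunksize in (1, 1000):
            start = time.time()
            total = sum(pool.imap(work, range(100000), chunksize=chunksize))
            print(chunksize, round(time.time() - start, 2))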
Example #10
    try:
        first = int(sys.argv[2], 10)
        count = int(sys.argv[3], 10)
        skip = int(sys.argv[4], 10)
    except:
        first = 0
        count = 10000000
        skip = 1


    if not os.path.isdir(path+'/t42/'):
        os.makedirs(path+'/t42/')


    if 1:
        p = Pool(multiprocessing.cpu_count())
        it = p.imap(process_file, list_files(path+'/vbi/', path+'/t42/', first, count, skip), chunksize=1)
        for i in it:
            pass

    else: # single thread mode for debugging
        def doit():
            map(process_file, list_files(path+'/vbi/', path+'/t42/', first, count, skip))
        cProfile.run('doit()', 'myprofile')
        p = pstats.Stats('myprofile')
        p.sort_stats('cumulative').print_stats(50)
Example #11
    def featurize_dataset(self, dataset: Dataset):
        logger.info(
            f"Loading dataset {dataset.key} and {self.split.key} split")
        data = dataset.load_x()
        for required_field in ['product', 'substrates']:
            if required_field not in data:
                raise NotImplementedError(
                    f"Need to have field '{required_field}' in the dataset")

        split = self.split.load(dataset.dir)
        feat_dir = self.dir(dataset.feat_dir)

        metadata = dataset.load_metadata()
        reaction_type_given = False
        if 'reaction_type_id' in metadata:
            rtypes = metadata['reaction_type_id'].values
            ntypes = len(np.unique(rtypes))
            logger.info(f'Found {ntypes} unique reaction types in the dataset')
            reaction_type_given = True
            data['reaction_type'] = rtypes

        if not os.path.exists(feat_dir):
            os.makedirs(feat_dir)

        if 'max_n_nodes' in dataset.meta_info:
            max_n_nodes = dataset.meta_info['max_n_nodes']
        else:
            max_n_nodes = 1024
        logger.info("Max. number of nodes: {}".format(max_n_nodes))

        # we do not featurize test set for training
        all_inds = np.argwhere(split['test'] == 0).flatten()

        # shuffle indices for featurization in multiple threads
        np.random.shuffle(all_inds)

        data_len = len(data)
        samples_len = data_len * self.max_n_steps

        chunk_size = int(len(all_inds) / self.n_jobs)
        chunk_ends = [chunk_size * i for i in range(self.n_jobs + 1)]
        chunk_ends[-1] = len(all_inds)
        chunk_inds = [
            all_inds[chunk_ends[i]:chunk_ends[i + 1]]
            for i in range(len(chunk_ends) - 1)
        ]

        logger.info(f'Finding all possible values of atom and bond properties '
                    f'on {len(all_inds)} reactions using {self.n_jobs} chunks')
        parallel_args = []
        for i, ch_inds in enumerate(chunk_inds):
            new_x = dict((k, x.values[ch_inds]) for k, x in data.items())
            parallel_args.append((i, new_x, tqdm))

        prop_dict = {'atom': {}, 'bond': {}}
        if self.n_jobs == 1:
            chunk_results = [find_properties_parallel(parallel_args[0])]
        else:
            pool = Pool(self.n_jobs)
            chunk_results = pool.imap(find_properties_parallel, parallel_args)

        for chunk_prop_dict in chunk_results:
            for type_key in prop_dict.keys():
                for key, values in chunk_prop_dict[type_key].items():
                    if key not in prop_dict[type_key]:
                        prop_dict[type_key][key] = set()
                    prop_dict[type_key][key].update(values)

        # add some 'special' atom/bond feature values
        prop_dict['atom']['is_supernode'].update([0, 1])
        prop_dict['atom']['is_edited'].update([0, 1])
        prop_dict['atom']['is_reactant'].update([0, 1])
        prop_dict['bond']['bond_type'].update(['supernode', 'self'])
        prop_dict['bond']['is_edited'].update([0, 1])

        atom_feat_counts = ', '.join([
            '{:s}: {:d}'.format(key, len(values))
            for key, values in prop_dict['atom'].items()
        ])
        logger.info(f'Found atom features: {atom_feat_counts}')

        bond_feat_counts = ', '.join([
            '{:s}: {:d}'.format(key, len(values))
            for key, values in prop_dict['bond'].items()
        ])
        logger.info(f'Found bond features: {bond_feat_counts}')

        # make a dictionary for conversion of atom/bond features to OH numbers
        prop2oh = {'atom': {}, 'bond': {}}
        props = {'atom': {}, 'bond': {}}
        for type_key, prop_values in prop_dict.items():
            for prop_key, values in prop_values.items():
                sorted_vals = list(
                    sorted(values,
                           key=lambda x: x if isinstance(x, int) else 0))
                props[type_key][prop_key] = sorted_vals
                oh = dict((k, i + 1) for i, k in enumerate(sorted_vals))
                prop2oh[type_key][prop_key] = oh

        # save 'prop2oh' dictionary
        with open(get_prop2oh_vocab_path(feat_dir), 'w') as fp:
            json.dump(
                {
                    'atom': props['atom'],
                    'bond': props['bond'],
                    'atom_2oh': prop2oh['atom'],
                    'bond_2oh': prop2oh['bond']
                },
                fp,
                indent=2)

        atom_feature_keys = [
            k for k in ORDERED_ATOM_OH_KEYS if k in prop2oh['atom']
        ]
        bond_feature_keys = [
            k for k in ORDERED_BOND_OH_KEYS if k in prop2oh['bond']
        ]
        action_vocab = {
            'prop2oh': prop2oh,
            'atom_feature_keys': atom_feature_keys,
            'bond_feature_keys': bond_feature_keys,
            'atom_feat_ind': dict(
                (k, i) for i, k in enumerate(atom_feature_keys)),
            'bond_feat_ind': dict(
                (k, i) for i, k in enumerate(bond_feature_keys))
        }

        parallel_args = []
        chunk_save_paths = []
        for i, ch_inds in enumerate(chunk_inds):
            new_x = dict((k, x.values[ch_inds]) for k, x in data.items())
            is_train = split['train'][ch_inds].values
            chunk_save_path = os.path.join(feat_dir, f'chunk_result_{i}')
            chunk_save_paths.append(chunk_save_path)
            parallel_args.append(
                (i, samples_len, ch_inds, new_x, max_n_nodes, tqdm,
                 self.max_n_steps, is_train, reaction_type_given, self.forward,
                 self.action_order, action_vocab, chunk_save_path))

        logger.info(
            f'Featurizing {len(all_inds)} reactions with {self.n_jobs} threads'
        )
        logger.info(f"Number of generated paths (train+valid): {data_len}")
        logger.info(
            f"Upper bound for number of generated samples: {samples_len} ({data_len} * {self.max_n_steps})"
        )

        if self.n_jobs == 1:
            chunk_results = [featurize_parallel(parallel_args[0])]
        else:
            # leave one job for merging results
            pool = Pool(max(self.n_jobs - 1, 1))
            chunk_results = pool.imap(featurize_parallel, parallel_args)

        logger.info(f"Merging featurized data from {self.n_jobs} chunks")

        nodes_mat = sparse.csr_matrix(([], ([], [])),
                                      shape=(samples_len, max_n_nodes))
        adj_mat = sparse.csr_matrix(([], ([], [])),
                                    shape=(samples_len, max_n_nodes**2))

        n_sample_data = 6 if reaction_type_given else 5
        sample_data_mat = sparse.csr_matrix(([], ([], [])),
                                            shape=(samples_len, n_sample_data))
        meta = []

        # vocabulary of actions
        actions_vocab = []
        action2ind = {}
        action_inds = []
        action_tuples = []
        sample_inds = []

        for ch_inds, result_code, chunk_save_path in tqdm(
                zip(chunk_inds, chunk_results, chunk_save_paths),
                desc='merging reactions from chunks',
                total=self.n_jobs):
            sample_data_path = os.path.join(chunk_save_path, 'sample_data.npz')
            sample_data_mat += sparse.load_npz(sample_data_path)

            nodes_mat_path = os.path.join(chunk_save_path, 'nodes_mat.npz')
            nodes_mat += sparse.load_npz(nodes_mat_path)

            adj_mat_path = os.path.join(chunk_save_path, 'adj_mat.npz')
            adj_mat += sparse.load_npz(adj_mat_path)

            meta_save_path = os.path.join(chunk_save_path, 'metadata.csv')
            chunk_meta = pd.read_csv(meta_save_path)
            meta.append(chunk_meta)

            actions_save_path = os.path.join(chunk_save_path, 'actions.txt')
            chunk_action_tuples = []
            for line in open(actions_save_path, 'r'):
                action = eval(line.strip())
                chunk_action_tuples.append(action)

            for sample_ind, action in chunk_action_tuples:
                if action in action2ind:
                    action_inds.append(action2ind[action])
                else:
                    action_ind = len(actions_vocab)
                    action2ind[action] = action_ind
                    actions_vocab.append(action)
                    action_tuples.append(action)
                    action_inds.append(action_ind)
                sample_inds.append(sample_ind)

            # remove temporary chunk files
            shutil.rmtree(chunk_save_path)
            logger.info(
                f"Merged chunk {len(meta)} (unparsed samples: {result_code}/{len(ch_inds)})"
            )

        logger.info("Concatenating metadata")
        meta = pd.concat(meta)

        logger.info("Saving found actions")
        sample_data_mat[sample_inds, 0] = action_inds
        with open(get_actions_vocab_path(feat_dir), 'w') as fp:
            json.dump(action_tuples, fp)
        logger.info(f"Found {len(action_tuples)} reaction actions")

        n_samples = meta['n_samples']
        logger.info(
            f"Number of steps: max: {np.max(n_samples)}, avg: {np.mean(n_samples)}"
        )

        logger.info("Saving featurized data")
        meta.to_csv(get_metadata_path(feat_dir))
        sparse.save_npz(get_sample_data_path(feat_dir), sample_data_mat)
        sparse.save_npz(get_nodes_path(feat_dir), nodes_mat)
        sparse.save_npz(get_adj_path(feat_dir), adj_mat)

        n_saved_reacs = len(np.unique(meta['reaction_ind']))

        logger.info(
            f"Saved {n_saved_reacs}/{len(all_inds)} reactions ({n_saved_reacs / len(all_inds) * 100}%)"
        )
        logger.info(
            f"Saved {len(meta)} paths (avg. {len(meta) / n_saved_reacs} paths per reaction)"
        )

        logger.info("Saving featurization metadata")
        meta_info = {
            'description':
            'Graph representation of molecules with discrete node and edge features for MEGAN',
            'features': ['atom', 'bond'],
            'features_type': ['atom', 'bond'],
            'max_n_nodes': max_n_nodes,
            'format': 'sparse'
        }
        meta_path = self.meta_info_path(dataset.feat_dir)
        with open(meta_path, 'w') as fp:
            json.dump(meta_info, fp, indent=2)
Example #12
    def match_plans(self):
        pool = Pool(multiprocessing.cpu_count() - 1)
        plans = list(reversed(list(tqdm(pool.imap(match_plan, list(reversed(self.data))), total=len(self.data)))))

        self.data = [d.set_plan(p) for d, d_plans in zip(self.data, plans) for p in d_plans]
        return self
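None of the examples above closes its pool through a context manager; since Python 3.3, multiprocessing.Pool supports one and terminates the workers when the block exits. A hedged sketch of the progress-bar pattern from Example #12 in that style, with match_plan as a hypothetical placeholder worker:

import multiprocessing
from multiprocessing import Pool
from tqdm import tqdm

def match_plan(item):
    # hypothetical stand-in for the project-specific worker
    return [item]

if __name__ == '__main__':
    data = list(range(100))
    n_procs = max(multiprocessing.cpu_count() - 1, 1)
    with Pool(n_procs) as pool:
        # tqdm needs total= because imap yields from an iterator of unknown length
        plans = list(tqdm(pool.imap(match_plan, data), total=len(data)))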