Example #1
    def download(self, index_path, txt_dir):
        # Save to txt dir
        self.txt_dir = txt_dir
        if not os.path.exists(self.txt_dir):
            os.makedirs(self.txt_dir)

        # Count Total Urls to Process
        with open(index_path, 'r') as fin:
            num_urls = sum(1 for line in fin)

        def iter_path_generator(index_path):

            with open(index_path, 'r') as fin:
                reader = csv.reader(fin,
                                    delimiter=',',
                                    quotechar='\"',
                                    quoting=csv.QUOTE_ALL)
                for url_idx, row in enumerate(reader, 1):
                    form_type, company_name, cik, date_filed, filename = row
                    url = os.path.join(SEC_GOV_URL,
                                       filename).replace("\\", "/")
                    yield (url_idx, url)

        def download_job(obj):
            url_idx, url = obj

            fname = '_'.join(url.split('/')[-2:])

            fname, ext = os.path.splitext(fname)
            htmlname = fname + '.html'

            text_path = os.path.join(self.txt_dir, fname + '.txt')

            if os.path.exists(text_path):
                print("Already exists, skipping {}...".format(url))
                sys.stdout.write("\033[K")
            else:
                print("Total: {}, Downloading & Parsing: {}...".format(
                    num_urls, url_idx))
                sys.stdout.write("\033[K")

                r = requests.get(url)
                try:
                    # Parse html with Beautiful Soup
                    soup = BeautifulSoup(r.content, "html.parser")
                    text = soup.get_text("\n")

                    # Process Text
                    text = self._process_text(text)
                    text_path = os.path.join(self.txt_dir, fname + '.txt')

                    # Write to file
                    with codecs.open(text_path, 'w', encoding='utf-8') as fout:
                        fout.write(text)
                except BaseException as e:
                    print("{} parsing failed: {}".format(url, e))

        ncpus = cpu_count() if cpu_count() <= 8 else 8
        pool = ProcessPool(ncpus)
        pool.map(download_job, iter_path_generator(index_path))
Example #2
def mlp():
    if not os.path.exists(tar_dir):
        os.makedirs(tar_dir)

    iterator = glob(os.path.join(src_dir, '*.txt'))

    ncpus = cpu_count() if cpu_count() <= 8 else 8
    pool = ProcessPool(ncpus)
    pool.map(preprocess_job, iterator)
Example #3
    def test00_stress(self):
        ids = range(self.client_count)
        pool = Pool(nodes=self.client_count)

        # invoke queries
        pool.map(query_crud, graphs, ids)

        # make sure we did not crash
        conn = self.env.getConnection()
        conn.ping()
        conn.close()
    def test05_index_delete(self):
        def create_drop_index(graph_id):
            env = Env(decodeResponses=True)
            redis_con = env.getConnection()
            for _ in range(1, 100):
                pipe = redis_con.pipeline()
                pipe.execute_command("GRAPH.QUERY", f"x{graph_id}", "CREATE (a:L), (n:L), (n)-[:T]->(a)")
                pipe.execute_command("GRAPH.QUERY", f"x{graph_id}", "CREATE INDEX FOR ()-[n:T]-() ON (n.p)")
                pipe.execute()
                redis_con.execute_command("GRAPH.DELETE", f"x{graph_id}")
        pool = Pool(nodes=10)
        pool.map(create_drop_index, range(1, 100))
Example #5
    def fix_blob_paths(self):
        num_workers = multiprocessing.cpu_count() - 1
        logger.debug(
            "Processing blob paths with a process pool of {0} nodes".format(
                num_workers))
        pool = ProcessPool(num_workers)

        for origin_prefix, target_prefix in self.blobs_that_need_fixing.iteritems():
            blobs_under_prefix = self.block_blob_service.list_blobs(
                self.container_name, prefix=origin_prefix)
            arguments = [(origin_prefix, target_prefix, blob)
                         for blob in blobs_under_prefix]
            pool.map(self._fix_blob_path, arguments)
Example #6
    def get_city_states(self):
        """
        Creates city states from start time to end time
        :param:
        :return:
        """
        city_states = []
        start_time = self.start_time
        end_time = self.end_time

        # Create array of time slice values between the start and end time
        business_days = self.config['city_state_creator']['business_days']
        business_hours_start = self.config['city_state_creator'][
            'business_hours_start']
        business_hours_end = self.config['city_state_creator'][
            'business_hours_end']
        index = pd.date_range(start=start_time,
                              end=end_time,
                              freq=str(self.time_unit_duration) + 'min')

        # Filter only the required days and hours
        index = index[index.day_name().isin(business_days)]
        index = index[(index.hour >= business_hours_start)
                      & (index.hour <= business_hours_end)]
        time_slice_starts = index - timedelta(minutes=self.time_slice_duration / 2)
        time_slice_ends = index + timedelta(minutes=self.time_slice_duration / 2)

        # Create arguments dictionary for parallelization
        self.parallel_args = self.create_parallel_args(index,
                                                       time_slice_starts,
                                                       time_slice_ends)

        # Create city states
        manager = Manager()
        city_states = manager.dict()
        N = len(index.values)

        # Create parallel pool
        self.logger.info("Creating parallelization pool")
        pool = ProcessPool(nodes=25)
        pool.map(self.get_city_state, ([city_states, t] for t in xrange(N)))
        pool.close()
        pool.join()
        pool.clear()
        self.logger.info("Finished creating city states")

        return dict(city_states)
    def retrieve_fields(self):

        ip, reqlink, reqtype, response, virtualm, keytemp = [], [], [], [], [], []
        bytes, avg_time, count, uniq_vis, total_vis = 0, 0, 0, 0, len(self.keys)
        for key in self.keys:
            keytemp.append(str(key))
        # creating an ordered dictionary containing log data retrieved from column_family
        log = self.cass_conn.multiget(keytemp)
        # starting a pool of 5 worker processes
        pool = Pool()
        pool.ncpus = 5
        for item in log.values():
            # append each field to its respective list
            ip.append(item['host'])
            reqlink.append(item['request_link'])
            reqtype.append(item['request_type'])
            response.append(str(item['response_code']))
            virtualm.append(item['virtual_machine'])
            if item['byte_transfer'] != '-':
                bytes += item['byte_transfer']
            if item['response_time'] != '-':
                avg_time += item['response_time']
                count += 1
        avg_time = avg_time/count
        # using the pool of workers to get results
        results = pool.map(self.unique_count, [ip, reqtype, reqlink, response, virtualm])
        pool.close()
        pool.join()
        uniq_vis = len(results[0][0])
        return self.time, results[0][0], results[0][1], results[1][0], results[1][1], results[2][0], results[2][1], results[3][0], results[3][1], results[4][0], results[4][1], bytes, avg_time, uniq_vis, total_vis
def pathos_mp_batch_evaluator(
    func,
    arguments,
    n_cores=N_CORES,
    error_handling="continue",
    unpack_symbol=None,
):
    """Batch evaluator based on pathos.multiprocess.ProcessPool

    This uses a patched but older version of python multiprocessing that replaces
    pickling with dill and can thus handle decorated functions.

    Args:
        func (Callable): The function that is evaluated.
        arguments (Iterable): Arguments for the functions. Their interpretation
            depends on the unpack_symbol argument.
        n_cores (int): Number of cores used to evaluate the function in parallel.
            Values below one are interpreted as one. If only one core is used, the
            batch evaluator disables everything that could cause problems, i.e. in that
            case func and arguments are never pickled and func is executed in the main
            process.
        error_handling (str): Can take the values "raise" (raise the error and stop all
            tasks as soon as one task fails) and "continue" (catch exceptions and set
            the output of failed tasks to the exception object without raising it).
            KeyboardInterrupt and SystemExit are always raised.
        unpack_symbol (str or None): Can be "**", "*" or None. If None, func just takes
            one argument. If "*", the elements of arguments are positional arguments for
            func. If "**", the elements of arguments are keyword arguments for func.


    Returns:
        list: The function evaluations.

    """
    if not pathos_is_available:
        raise NotImplementedError(
            "To use the pathos_mp_batch_evaluator, install pathos with "
            "conda install -c conda-forge pathos.")

    _check_inputs(func, arguments, n_cores, error_handling, unpack_symbol)
    n_cores = int(n_cores)

    reraise = error_handling == "raise"

    @unpack(symbol=unpack_symbol)
    @catch(default="__traceback__", reraise=reraise)
    def internal_func(*args, **kwargs):
        return func(*args, **kwargs)

    if n_cores <= 1:
        res = [internal_func(arg) for arg in arguments]
    else:
        p = ProcessPool(nodes=n_cores)
        try:
            res = p.map(internal_func, arguments)
        except Exception as e:
            p.terminate()
            raise e

    return res
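The docstring above already pins down the calling convention; here is a minimal usage sketch (squared_gap and kwargs_list are hypothetical, and it assumes the surrounding module's helpers such as unpack, catch and _check_inputs are importable alongside the function) showing unpack_symbol="**" spreading each dict of arguments as keyword arguments:

def squared_gap(a, b):
    return (a - b) ** 2

kwargs_list = [{"a": 1, "b": 2}, {"a": 3, "b": 0}]

if __name__ == "__main__":
    results = pathos_mp_batch_evaluator(
        func=squared_gap,           # hypothetical example function
        arguments=kwargs_list,      # each dict is unpacked as keyword arguments
        n_cores=2,
        error_handling="continue",  # a failing task would return its exception object
        unpack_symbol="**",
    )
    print(results)  # expected: [1, 9]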
Example #9
def get_full_content(json_data, num_cores):

	def param_generator(json_data):
		for data in json_data:
			yield data['id'], data['url']

	def tag_and_write_job(param):
		num_x, news_url = param
		logging.info("Processing news #{}: {}".format(num_x, news_url))

		news_content = GetUrlContent(news_url)

		## store all news (might be used for word2vec) ##
		with open('OpinionAnalysis/data/news_corpus.txt', 'a') as fp:
			fp.write('*\n')
			fp.write(news_content)

		return {'id':num_x, 'content':news_content}

	pool = ProcessPool(num_cores)
	new_json_data = pool.map(tag_and_write_job, param_generator(json_data))

	df = pd.DataFrame(json_data)
	df_new = pd.DataFrame(new_json_data)
	ret_df = df.merge(df_new, left_on='id', right_on='id')
	ret_json = json.loads(ret_df.to_json(orient='records'))

	return ret_json
Example #10
def load_data_from_files_raw(
    data_files: Iterable[Path],
    # humm that is not very nice type signature... need to create interface for that
    parse_callback: Callable[..., Tuple[str, int, Iterable[T_Single]]],  # type: ignore
    parallelize: bool,
    *args,
) -> Dict[str, Tuple[int, Iterable[T_Single]]]:
    tasks_as_args = [[data_file, *args] for data_file in data_files]

    if parallelize:
        pool = ProcessPool()

        # needed that hack to work... issues with serialization of classes
        # doesn't work with basic multiprocessing so needed pathos
        def cb(x):
            return parse_callback(*x)

        per_file_results = list(pool.map(cb, tasks_as_args))
    else:
        per_file_results = [parse_callback(*task_args) for task_args in tasks_as_args]  # type: ignore

    lang_samples_iter: Dict[str, Tuple[int, List[Iterable[T_Single]]]] = {}
    for (lang, lg, samples_iter) in per_file_results:
        if lang not in lang_samples_iter:
            lang_samples_iter[lang] = (0, [])
        (lg0, iters) = lang_samples_iter[lang]
        iters.append(samples_iter)
        lang_samples_iter[lang] = (lg0 + lg, iters)

    lang_samples: Dict[str, Tuple[int, Iterable[T_Single]]] = {}
    for (lang, (lg, iters)) in lang_samples_iter.items():
        lang_samples[lang] = (lg, itertools.chain(*iters))

    return lang_samples
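The "serialization of classes" comment above is why pathos is used here: the standard library's multiprocessing.Pool pickles its payload and cannot handle locally defined functions or closures, whereas pathos' ProcessPool serializes with dill and can. A minimal, self-contained sketch of that difference (toy closure, not part of this codebase):

from pathos.pools import ProcessPool

def make_scaler(factor):
    # a locally defined closure like this would fail to pickle with multiprocessing.Pool
    def scale(x):
        return factor * x
    return scale

if __name__ == "__main__":
    pool = ProcessPool(nodes=2)
    print(pool.map(make_scaler(3), [1, 2, 3]))  # [3, 6, 9]
    pool.close()
    pool.join()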
Example #11
    def update_qfunction(self):
        if self.TWIN_Q:
            self.i = (self.i + 1) % 2

        if self.theta_q is None:  # generate critic network if none exist
            n = len(self.state_action_basis(self.state, self.action))
            if self.TWIN_Q:
                m = 2  # generate 2 q networks
            else:
                m = 1
            self.theta_q = np.random.normal(0, 0.3, (n, m))
        self.q_predicted = self.theta_q[:, self.i] @ self.xu_k  # recorded for analysis
        self.q_observed = self.r + self.BETA * self.theta_q[:, self.i] @ self.xu_k1  # recorded for analysis

        if len(self.memory) > self.BATCH_SIZE:
            batch = random.sample(self.memory, self.BATCH_SIZE)

            pool = ProcessPool(nodes=self.config['simulation']['n_nodes'])
            batch_y = np.array(pool.map(self.process_exp, batch))
            batch_phi = np.array([
                self.state_action_basis(exp['state'], exp['action'])
                for exp in batch
            ])

            clf = Ridge(alpha=0.01)
            clf.fit(batch_phi, batch_y)
            temp_theta = clf.coef_
            self.theta_q[:, self.i] = self.ALPHA_q * temp_theta + (
                1 - self.ALPHA_q) * self.theta_q.flatten()
Example #12
    def gen_operator_data(self, space, Nx, M, num, representation):
        print("Generating operator data...", flush=True)
        features = space.random(num)
        # Generate outputs
        x = np.linspace(0, self.T, num=self.Nx)[:, None]
        sensor_values = self.random_process(space.eval_u(features, x,
                                                         self.M))  # exp(b)
        p = ProcessPool(nodes=config.processes)
        s_values = np.array(p.map(self.eval_s, sensor_values))

        # Generate inputs
        sensors = np.linspace(0, self.T, num=Nx)[:, None]
        if representation == "samples":
            sensor_values = self.random_process(
                space.eval_u(features, sensors, M))
        elif representation == "KL":
            sensor_values = space.eval_KL_bases(features, sensors, M)
            # sensor_values = self.random_process(sensor_values)
        res = [
            make_triple(sensor_values[i], x, s_values[i], self.npoints_output)
            for i in range(num)
        ]
        res = np.vstack(res)
        m = sensor_values.shape[1]
        return [res[:, :m], res[:, m:-1]], res[:, -1:]
Example #13
def main(args):

    setup = experiment_setups.parse(args.setup)
    processes = setup.processes
    max_quantifier_length = setup.max_quantifier_length
    model_size = setup.model_size

    file_util = FileUtil(
        fileutil.base_dir(setup.dest_dir, setup.name, max_quantifier_length,
                          model_size))

    folderName = "{0}/{1}_length={2}_size={3}".format(setup.dest_dir,
                                                      setup.name,
                                                      max_quantifier_length,
                                                      model_size)

    processpool = ProcessPool(nodes=processes)

    expressions = file_util.load_dill('expressions.dill')

    complexities = processpool.map(
        lambda ex: setup.measure_expression_complexity(
            ex, max_quantifier_length), expressions)

    file_util.dump_dill(complexities, 'expression_complexities.dill')

    processpool.close()
    processpool.join()

    print('Complexity Measuring finished.')
Example #14
    def count(self,
              name='e1',
              meta='count',
              nodes=None,
              debug=False,
              parallel=False):
        """
        count number of points in the neighborhood
        """
        self.estimates[name] = {}
        self.estimates[name]['vname'] = None
        self.estimates[name][meta] = meta

        if nodes is None:
            nodes = self.nodes

        def f(i):
            # update data selected around target point
            self.search.update([self.x0[i], self.y0[i], self.z0[i]])
            if debug:
                return np.sum(
                    self.search.test), self.search.row_id[self.search.test]
            else:
                return np.sum(self.search.test), None

        # apply the estimator to each target
        if parallel:
            pool = ProcessPool()
            self.estimates[name]['estimate'] = np.array(pool.map(f, nodes))
        else:
            self.estimates[name]['estimate'] = np.array(list(map(f, nodes)))
    def land_routine(self):
        while self.took_off:
            pool = ProcessPool()
            r = pool.map(self.client_land, self.agent_names)
            rospy.loginfo('Landing responses:')
            rospy.loginfo(r)
            self.took_off = not all(r)
        return True
Example #16
def run_concurrent(env, queries, f):
    pool = Pool(nodes=CLIENT_COUNT)

    # invoke queries
    result = pool.map(f, graphs, queries)

    # validate all process return true
    env.assertTrue(all(result))
Example #17
    def test_04_concurrent_delete(self):
        pool = Pool(nodes=CLIENT_COUNT)

        # invoke queries
        assertions = pool.map(delete_graph, graphs)

        # Exactly one thread should have successfully deleted the graph.
        self.env.assertEquals(assertions.count(True), 1)
def run_concurrent(queries, f):
    pool = Pool(nodes=CLIENT_COUNT)
    manager = pathos_multiprocess.Manager()

    barrier = manager.Barrier(CLIENT_COUNT)
    barriers = [barrier] * CLIENT_COUNT

    # invoke queries
    return pool.map(f, queries, barriers)
Example #19
    def download(self, index_path):

        def iter_path_generator(index_path):
            with open(index_path,'r') as fin:
                reader = csv.reader(fin,delimiter=',',quotechar='\"',quoting=csv.QUOTE_ALL)
                for row in reader:
                    form_type, company_name, cik, date_filed, filename = row
                    url = os.path.join(SEC_GOV_URL,filename)
                    yield url

        def download_job(url):
            fname = '_'.join(url.split('/')[-2:])

            fname, ext = os.path.splitext(fname)
            htmlname = fname + '.html'

            text_path = os.path.join(self.txt_dir,fname + '.txt')

            if os.path.exists(text_path):
                print("Already exists, skipping {}".format(url))
            else:
                print("Downloading & Parsing {}".format(url))

                r = requests.get(url)
                try:
                    # Parse html with Beautiful Soup
                    soup = BeautifulSoup( r.content, "html.parser" )
                    text = soup.get_text("\n")

                    # Process Text
                    text = self._process_text(text)

                    text_path = os.path.join(self.txt_dir,fname + '.txt')
                    # Write to file
                    with codecs.open(text_path,'w',encoding='utf-8') as fout:
                        fout.write(text)
                except BaseException as e:
                    print("{} parsing failed: {}".format(url,e))


        ncpus = cpu_count() if cpu_count() <= 8 else 8
        pool = ProcessPool(ncpus)
        pool.map(download_job, iter_path_generator(index_path))
Example #20
    def eval_s_space(self, space, features, x):
        """For a list of functions in `space` represented by `features`
        and a list `x`, compute the corresponding list of outputs.
        """
        def f(feature, xi):
            return self.eval_s(lambda t: space.eval_u_one(feature, t), xi[0])

        p = ProcessPool(nodes=config.processes)
        res = p.map(f, features, x)
        return np.array(list(res))
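Note that pathos pools accept several iterables at once and pair them element-wise, which is what lets p.map(f, features, x) above walk the two lists in lockstep. A standalone toy sketch of that behaviour (not tied to this class):

from pathos.pools import ProcessPool

if __name__ == "__main__":
    p = ProcessPool(nodes=2)
    # the function receives one element from each iterable per call: f(2, 10), f(3, 20), f(4, 30)
    print(p.map(lambda a, b: a * b, [2, 3, 4], [10, 20, 30]))  # [20, 60, 120]
    p.close()
    p.join()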
Example #21
    def extract(self):
        def text_gen(txt_dir):
            # Yields markup & name
            for fname in os.listdir(txt_dir):
                if not fname.endswith('.txt'):
                    continue
                yield fname

        def parsing_job(fname):
            print("Parsing: {}".format(fname))
            # Read text
            filepath = os.path.join(self.txt_dir, fname)
            with codecs.open(filepath, 'rb', encoding='utf-8') as fin:
                text = fin.read()

            name, ext = os.path.splitext(fname)
            # Parse MDA part

            msg = ""
            mda, end = self.parse_mda(text)
            # Parse second time if first parse results in index
            if mda and len(mda.encode('utf-8')) < 1000:
                mda, _ = self.parse_mda(text, start=end)

            if mda:  # Has value
                msg = "SUCCESS"
                mda_path = os.path.join(self.mda_dir, name + '.mda')
                with codecs.open(mda_path, 'w', encoding='utf-8') as fout:
                    fout.write(mda)
            else:
                msg = "MDA NOT FOUND"
            print("{},{}".format(name, msg))
            return name + '.txt', msg  #

        ncpus = cpu_count() if cpu_count() <= 8 else 8
        pool = ProcessPool(ncpus)

        _start = time.time()
        parsing_failed = pool.map( parsing_job, \
                                   text_gen(self.txt_dir) )
        _end = time.time()

        print("MDA parsing time taken: {} seconds.".format(_end - _start))

        # Write failed parsing list
        count = 0
        parsing_log = 'parsing.log'
        with open(parsing_log, 'w') as fout:
            print("Writing parsing results to {}".format(parsing_log))
            for name, msg in parsing_failed:
                fout.write('{},{}\n'.format(name, msg))
                if msg != "SUCCESS":
                    count = count + 1

        print("Number of failed text:{}".format(count))
Example #22
    def test_10_write_starvation(self):
        # make sure the write query does not starve
        # when issuing a large number of read queries
        # alongside a single write query.
        # we don't want the write query to have to wait for
        # too long; consider the following sequence:
        # R, W, R, R, R, R, R, R, R...
        # if writes are starved, our write query might have to wait
        # for all queued read queries to complete while holding the
        # Redis global lock, and this will hurt performance
        #
        # this test issues a similar sequence of queries and
        # validates that the write query wasn't delayed too much

        self.graph = Graph(self.conn, GRAPH_ID)
        pool = Pool(nodes=CLIENT_COUNT)

        Rq = "UNWIND range(0, 10000) AS x WITH x WHERE x = 9999 RETURN 'R', timestamp()"
        Wq = "UNWIND range(0, 1000) AS x WITH x WHERE x = 27 CREATE ({v:1}) RETURN 'W', timestamp()"
        Slowq = "UNWIND range(0, 100000) AS x WITH x WHERE (x % 73) = 0 RETURN count(1)"

        # issue a number of slow queries, this will give us time to fill up
        # RedisGraph internal threadpool queue
        queries = [Slowq] * CLIENT_COUNT * 5
        nulls = [None] * CLIENT_COUNT * 5

        # issue queries asynchronously
        pool.imap(thread_run_query, queries, nulls)

        # create a long sequence of read queries
        queries = [Rq] * CLIENT_COUNT * 10
        nulls = [None] * CLIENT_COUNT * 10

        # inject a single write query close to the beginning of the sequence
        queries[CLIENT_COUNT] = Wq

        # invoke queries
        # execute queries in parallel
        results = pool.map(thread_run_query, queries, nulls)

        # count how many queries completed before the write query
        count = 0
        write_ts = results[CLIENT_COUNT]["result_set"][0][1]
        for result in results:
            row = result["result_set"][0]
            ts = row[1]
            if ts < write_ts:
                count += 1

        # make sure write query wasn't starved
        self.env.assertLessEqual(count, len(queries) * 0.3)

        # delete the key
        self.conn.delete(GRAPH_ID)
def runGOanalysis(clusters, processes=10):
    df = pd.DataFrame()
    pool = ProcessPool(nodes=processes)
    newDf = pool.map(_runGOanalysis, clusters)
    pool.close()
    pool.join()
    df = pd.concat([df, newDf], axis=0)
    geneIndex = pd.read_excel(os.path.join(goaResultDir, 'EntrezIndex.xlsx'), index_col=0)
    geneIndex = pd.concat([geneIndex, newEntrez], axis=0)
    geneIndex.to_excel(os.path.join(goaResultDir, 'EntrezIndex.xlsx'))
    return(geneIndex)
Example #24
def liste_machines_allumees(machines):
    machinesTemp = []
    pool = Pool(nodes=10)
    machinesTemp = pool.map(recherche, machines)

    # keep only the machines that are powered on
    machinesAllumees = []
    for val in machinesTemp:
        if val is not None:
            machinesAllumees.append(val)

    return machinesAllumees
Example #25
def main(args):
    setup = experiment_setups.parse(args.setup)

    file_util = FileUtil(
        fileutil.run_dir(setup.dest_dir, setup.name,
                         setup.max_quantifier_length, setup.model_size,
                         args.name))

    languages = language_loader.load_languages(file_util)

    universe = generator.generate_simplified_models(setup.model_size)

    pool = ProcessPool(nodes=setup.processes)

    if setup.inf_strat == 'exact':
        informativeness_measurer = InformativenessMeasurer(len(universe))
    elif setup.inf_strat == 'simmax':
        informativeness_measurer = SimMaxInformativenessMeasurer(universe)
    else:
        raise ValueError('{0} is not a valid informativeness strategy.'.format(
            setup.inf_strat))

    if setup.comp_strat == 'wordcount':
        complexity_measurer = WordCountComplexityMeasurer(setup.max_words)
    elif setup.comp_strat == 'wordcomplexity':
        complexity_measurer = SumComplexityMeasurer(setup.max_words, 1)
    else:
        raise ValueError('{0} is not a valid complexity strategy.'.format(
            setup.comp_strat))

    informativeness = pool.map(informativeness_measurer, languages)
    complexity = pool.map(complexity_measurer, languages)

    file_util.dump_dill(informativeness,
                        'informativeness_{0}.dill'.format(setup.inf_strat))
    file_util.dump_dill(complexity,
                        'complexity_{0}.dill'.format(setup.comp_strat))

    print("measure.py finished.")
Example #26
def register_stack_to_template(frames, template, regfn, njobs=4, **fnargs):
    """
    Given stack of frames (or a FSeq obj) and a template image, 
    align every frame to template and return a list of functions,
    which take an image and return warped image, aligned to template.
    """
    if njobs > 1:
        pool = ProcessPool(nodes=njobs) 
        out = pool.map(partial(regfn, template=template, **fnargs), frames)
        #pool.close()
    else:
        out = np.array([regfn(img, template, **fnargs) for img in frames])
    return out
Example #27
    def eval_u(self, ys, sensors):
        """For a list of functions represented by `ys`,
        compute a list of a list of function values at a list `sensors`.
        """
        if self.interp == "linear":
            return np.vstack([np.interp(sensors, np.ravel(self.x), y).T for y in ys])
        p = ProcessPool(nodes=config.processes)
        res = p.map(
            lambda y: interpolate.interp1d(
                np.ravel(self.x), y, kind=self.interp, copy=False, assume_sorted=True
            )(sensors).T,
            ys,
        )
        return np.vstack(list(res))
Example #28
    def eval_KL_bases(self, ls, sensors, M):
        def helper(l):
            grf = GRF_KL(
                self.T,
                kernel=self.kernel,
                length_scale=l[0],
                num_eig=M,
                N=self.N,
                interp=self.interp,
            )
            return np.ravel(grf.bases(sensors))

        p = ProcessPool(nodes=config.processes)
        return np.vstack(p.map(helper, ls))
Example #29
def main():

    # Backend can be either `spoon-core` or `gumtree-spoon`
    backend = 'spoon-core'
    pool = ProcessPool(nodes=n_jobs)

    bugs = ManySStuBs4J(DATASET).bugs
    REPAIR_RESULT.touch()

    # Retrieving already processed bugs
    with open(REPAIR_RESULT, newline='') as f:
        reader = csv.reader(f)
        processed = {(x[0], x[-1]) for x in reader}

    for bug in tqdm(bugs, total=len(bugs)):

        fixed_file = bug.fixed_file_line_dir / bug.file_name

        # Skip if the bug has already been processed.
        # This is only for resuming after an interruption; it won't prevent duplicates.
        if (str(bug.buggy_file_line_dir), bug.bug_type) in processed:
            continue

        patch_output = REPAIR_OUTPUT / bug.buggy_file_line_dir

        if not patch_output.exists():
            continue

        try:
            with open(INPUT / fixed_file) as file:
                fixed_line = file.readlines()[bug.fix_line_num - 1].strip()
        except Exception as e:
            print(e, file=sys.stderr)
            print(INPUT / fixed_file, file=sys.stderr)
            continue

        patch_files = [path_dir / bug.file_name
                       for path_dir in sorted(patch_output.iterdir(), key=lambda x: int(x.name))]
        pfnum = len(patch_files)
        comp_res = pool.map(compare, patch_files, [bug.bug_line_num] * pfnum,
                            [fixed_line] * pfnum, [backend] * pfnum)

        patch_result = [str(bug.buggy_file_line_dir), repr(comp_res),
                        str(bug.fixed_file_line_dir), bug.file_name,
                        bug.project_name, bug.bug_type]

        with open(REPAIR_RESULT, 'a', newline='') as result_csv_file:
            csv_writer = csv.writer(result_csv_file)
            csv_writer.writerow(patch_result)
Example #30
def multiProcessLit(genes, directory, rettype="full", processes=13):
    """
    Runs NLP on literature with a Pathos parallel processing pool.

    Parameters
    ----------
    genes: list of genes
    directory: directory containing literature for NLP
    rettype (optional): str, "full" for full texts or "abstract" for abstracts (default "full")
    processes (optional): int, number of parallel processes (default 13)
    """
    resultDirs = [
        "Sentences",
        "Categories",
        "Functions",
        "Regions",
        "CellTypes",
        "NTs",
        "Physio",
    ]
    for item in resultDirs:
        if not os.path.exists(
                os.path.join(clusterDirectory, item + "_" + cluster + "/")):
            os.mkdir(os.path.join(clusterDirectory,
                                  item + "_" + cluster + "/"))
    pathList = []
    for item in genes:
        if rettype == "full":
            fullpath = os.path.join(directory, "papers/" + item + "/")
        elif rettype == "abstract":
            fullpath = os.path.join(directory, "abstracts/" + item + "/")
        textFile = os.path.join(fullpath, "CombinedFullTexts.txt")
        pathList.append(textFile)
    pool = ProcessPool(nodes=processes)
    pool.map(multiProcessTextMinimal, pathList)
    pool.close()
    pool.join()
Example #31
def apply_warps(warps, frames, njobs=4):
    """
    returns result of applying warps for given frames (one warp per frame)
    """
    if njobs > 1 :
        pool = ProcessPool(nodes=njobs)
        out = pool.map(parametric_warp, frames, warps)
        #pool.close()
        out = np.array(out)
    else:
        out = np.array([parametric_warp(f,w) for f,w in itt.izip(frames, warps)])
    if isinstance(frames, fseq.FrameSequence):
        out = fseq.open_seq(out)
        out.meta = frames.meta
    return out
Example #32
def load_data_from_files(
    data_files: Iterable[Path],
    data_params: DatasetParams,
    query_tokenizer: TokenizerRecordable,
    per_code_language_tokenizers: Dict[str, TokenizerRecordable],
    # humm that is not very nice type signature... need to create interface for that
    parse_callback: Callable[[
        Path, DatasetParams, TokenizerRecordable, Dict[str,
                                                       TokenizerRecordable]
    ], Tuple[str, int, Iterable[T]]],
    parallelize: bool = True,
) -> Dict[str, Tuple[int, Iterable[T]]]:
    """
    Load data from many files using 

    Directly adapted from original repo
    """
    tasks_as_args = [[
        data_file, data_params, query_tokenizer, per_code_language_tokenizers
    ] for data_file in data_files]

    if parallelize:
        pool = ProcessPool()

        # needed that hack to work... issues with serialization of classes
        # doesn't work with basic multiprocessing so needed pathos
        def cb(x):
            return parse_callback(*x)

        per_file_results = list(pool.map(cb, tasks_as_args))
    else:
        per_file_results = [
            parse_callback(*task_args) for task_args in tasks_as_args
        ]  # type: ignore

    lang_samples_iter: Dict[str, Tuple[int, List[Iterable[T]]]] = {}
    for (lang, lg, samples_iter) in per_file_results:
        if lang not in lang_samples_iter:
            lang_samples_iter[lang] = (0, [])
        (lg0, iters) = lang_samples_iter[lang]
        iters.append(samples_iter)
        lang_samples_iter[lang] = (lg0 + lg, iters)

    lang_samples: Dict[str, Tuple[int, Iterable[T]]] = {}
    for (lang, (lg, iters)) in lang_samples_iter.items():
        lang_samples[lang] = (lg, itertools.chain(*iters))

    return lang_samples
Example #33
def main(param2val):  # param2val will be different on each machine

    params = Params.from_param2val(param2val)
    print(params)

    research_data_path = Path(param2val['project_path']).parent
    wiki_param_path = research_data_path / 'CreateWikiCorpus' / 'runs' / params.wiki_param_name
    if not wiki_param_path.exists():
        raise FileNotFoundError('{} does not exist'.format(
            params.wiki_param_name))

    # load text file
    path_to_articles = list(wiki_param_path.glob('**/bodies.txt'))[0]

    # make generator that iterates over docs in chunks (to use with spacy.nlp.pipe)
    # note: because params.max_num_docs is the number of total docs requested across all jobs,
    # "docs_in_job" is the number of docs needed to process in this job
    docs_in_job = params.max_num_docs // params.num_machines
    f = itertools.islice(path_to_articles.open('r'), docs_in_job)
    texts = [
        doc
        for doc in zip(*(f, ) * configs.MultiProcessing.num_texts_per_process)
    ]
    num_texts = len(texts)
    print('Number of text chunks: {}'.format(num_texts))

    # count in multiple processes
    pool = ProcessPool(configs.MultiProcessing.num_workers)
    max_num_docs_per_worker = params.max_num_docs // configs.MultiProcessing.num_workers
    results = pool.map(
        make_w2dfs,
        texts,  # first arg
        [params.pos] * num_texts,  # second arg
        [max_num_docs_per_worker] * num_texts,  # third arg
        [params.min_frequency] * num_texts  # fourth arg
    )
    flat_results = [w2df for chunk in results for w2df in chunk]
    print('Num w2dfs: {}'.format(len(flat_results)))

    # save pickled w2dfs to wiki_param_path
    w2dfs_file_name = 'w2dfs_{}_{}.pkl'.format(params.max_num_docs, params.pos)
    full_path = wiki_param_path / w2dfs_file_name
    with full_path.open('wb') as f:
        pickle.dump(flat_results, f)
    print('Saved w2dfs to {}'.format(full_path))

    return [
    ]  # Ludwig package requires a list (empty, or containing pandas series objects)
Example #34
def test_mp():
    # instantiate and configure the worker pool
    from pathos.pools import ProcessPool
    pool = ProcessPool(nodes=4)

    _result = list(map(pow, [1,2,3,4], [5,6,7,8])) 

    # do a blocking map on the chosen function
    result = pool.map(pow, [1,2,3,4], [5,6,7,8])
    assert result == _result

    # do a non-blocking map, then extract the result from the iterator
    result_iter = pool.imap(pow, [1,2,3,4], [5,6,7,8])
    result = list(result_iter)
    assert result == _result

    # do an asynchronous map, then get the results
    result_queue = pool.amap(pow, [1,2,3,4], [5,6,7,8])
    result = result_queue.get()
    assert result == _result
Example #35
    def processcompute(self, xs):
        pool = ProcessPool(4)
        results = pool.map(self.compute, xs)
        return results
Example #36
# Copyright (c) 2016-2017 The Uncertainty Quantification Foundation.
# License: 3-clause BSD.  The full license text is available at:
#  - http://trac.mystic.cacr.caltech.edu/project/pathos/browser/pathos/LICENSE

def host(id):
    import socket
    return "Rank: %d -- %s" % (id, socket.gethostname())


if __name__ == '__main__':
    from pathos.helpers import freeze_support
    freeze_support()

    from pathos.pools import ProcessPool as Pool
    pool = Pool()

    print("Evaluate 5 items on 2 proc:")
    pool.ncpus = 2
    res3 = pool.map(host, range(5))
    print(pool)
    print('\n'.join(res3))
    print('')

    print("Evaluate 5 items on 10 proc:")
    pool.ncpus = 10
    res5 = pool.map(host, range(5)) 
    print(pool)
    print('\n'.join(res5))

# end of file
                                                                                     fast_traps=['Phosphorus'],
                                                                                     rho_rel_err=Poisson_rel_err,
                                                                                     df_threshold=1e-2, debug=True)

    return [T, t_points, bonding_interfaces_f_t, dopants_f_t]


args = np.empty((len(T_range), 5), dtype=object)
args[:, 0] = silicon
args[:, 1] = electrode
args[:, 2] = T_range
args[:, 3] = V_p
args[:, 4] = V_rb

pool = Pool()
results = np.array(pool.map(calculate, args))
print results

sample_data_dir = join(dirname(__file__), '03-kinetics')

t1 = np.array([1e-6, 1e-5, 2e-5, 5e-5, 1e-5])
dlts_array = np.zeros((len(T_range), len(t1)), dtype=np.float)
#_, ax = plt.subplots()
for result in results:
    T, t_points, bonding_interfaces_f_t, dopants_f_t = result
    csv_name = '03_Au_nSi_BW_transient' + '_%02.2fVp_%02.2fVrb_%03.2fK' % (V_p, abs(V_rb), T) + '.csv'
    csv_name = join(sample_data_dir, csv_name)
    df = pd.DataFrame(bonding_interfaces_f_t)
    df['time'] = t_points
    df.set_index('time', inplace=True)
    df.to_csv(csv_name)
Example #38

if __name__ == '__main__':
    from pathos.helpers import freeze_support
    freeze_support()

    from pathos.pools import ProcessPool as Pool
    from pathos.pools import ThreadPool as TPool
    pool = Pool()
    tpool = TPool()

    # test 'dilled' multiprocessing for inner
    print("Evaluate 10 items on 2 proc:")
    pool.ncpus = 2
    print(pool)
    print(pool.map(add_me, range(10)))
    print('')

    # test 'dilled' multiprocessing for lambda
    print("Evaluate 10 items on 4 proc:")
    pool.ncpus = 4
    print(pool)
    print(pool.map(squ, range(10)))
    print('')

    # test for lambda, but with threads
    print("Evaluate 10 items on 4 threads:")
    tpool.nthreads = 4
    print(tpool)
    print(tpool.map(squ, range(10)))
    print('')
Example #39
    def f_to_equilibrium_fermi_level(self, temperature, semiconductor, f, electron_volts=False,
                                     use_mpmath=False, parallel=False, debug=False):
        """
        Calculates equilibrium Fermi level position for given occupation of the Trap F
        :param temperature: Temperature, K
        :param semiconductor: Semiconductor object
        :param f: Trap occupation from 0.0 to 1.0
        :param electron_volts: if True, assume all energy values to be in eV
        :param use_mpmath: if True, integration is done using mpmath.quad instead of numpy.trapz (default)
        :param parallel: if True, solve for each occupation value using a parallel pathos ProcessPool
        :param debug: if True, prints out some debug information
        :return: Fermi level, measured as the distance from the conduction band to the Fermi level

        In the calculation we use eV, since the solver is much more stable with numbers of smaller magnitude.
        """
        energy_unit = 'eV'
        energy_coefficient = to_numeric(q)
        trap_energy_level = self.energy_level(temperature, semiconductor, charge_state_idx=0,
                                              electron_volts=True)
        f_grid, = np.meshgrid(f)
        #print f_grid
        def equation(fermi_level_from_conduction_band, f):
            test_f = self.equilibrium_f(temperature, semiconductor, fermi_level_from_conduction_band,
                                        electron_volts=True, use_mpmath=use_mpmath, debug=debug)
            residual = f - test_f
            if debug:
                print 'Fermi level type:', type(fermi_level_from_conduction_band)
                print 'Test F =', test_f
                print 'F residual =', residual
            return residual

        def solver(args):
            equation, lower_boundary, upper_boundary, initial_guess, f, use_mpmath = args
            if not use_mpmath:
                warnings.filterwarnings('ignore')
                solution = root(equation, initial_guess, args=f, method='hybr')
                solution = solution.x[0]
                warnings.resetwarnings()
            else:
                equation_wrap = partial(equation, f=f)
                try:
                    solution = mp.findroot(equation_wrap, (lower_boundary, upper_boundary),
                                           maxsteps=1000, solver='anderson', tol=5e-16)
                except ValueError as err:
                    print err
                    print 'Lowering tolerance to 5e-6'
                    solution = mp.findroot(equation_wrap, (lower_boundary, upper_boundary),
                                           maxsteps=1000, solver='anderson', tol=5e-6)
                solution = np.float(solution)
            return solution

        fermi_level_lower_boundary = abs(to_numeric(trap_energy_level - 2 * self.energy_spread / energy_coefficient))
        fermi_level_upper_boundary = abs(to_numeric(trap_energy_level + 2 * self.energy_spread / energy_coefficient))
        if debug:
            print 'Fermi level lower boundary = %2.2g ' % fermi_level_lower_boundary + energy_unit
            print 'Fermi level upper boundary = %2.2g ' % fermi_level_upper_boundary + energy_unit
        args = np.empty((len(f_grid), 6), dtype=object)
        args[:, 0] = equation
        args[:, 1] = fermi_level_lower_boundary
        args[:, 2] = fermi_level_upper_boundary
        args[:, 3] = (fermi_level_lower_boundary + fermi_level_upper_boundary) / 2
        args[:, 4] = f_grid
        args[:, 5] = use_mpmath
        if parallel:
            try:
                from pathos.pools import ProcessPool as Pool
                pool = Pool()
                solutions = np.array(pool.map(solver, args))
            except ImportError:
                print 'Parallel calculation needs pathos! Using standard map() instead.'
                solutions = np.array(map(solver, args))
        else:
            solutions = np.array(map(solver, args))
        if not electron_volts:
            solutions *= energy_coefficient
        return solutions
Example #40
#!/usr/bin/env python
#
# Author: Mike McKerns (mmckerns @caltech and @uqfoundation)
# Copyright (c) 1997-2015 California Institute of Technology.
# License: 3-clause BSD.  The full license text is available at:
#  - http://trac.mystic.cacr.caltech.edu/project/pathos/browser/pathos/LICENSE

from pathos.helpers import freeze_support
freeze_support()

# instantiate and configure the worker pool
from pathos.pools import ProcessPool
pool = ProcessPool(nodes=4)

_result = list(map(pow, [1,2,3,4], [5,6,7,8]))

# do a blocking map on the chosen function
result = pool.map(pow, [1,2,3,4], [5,6,7,8])
assert result == _result

# do a non-blocking map, then extract the result from the iterator
result_iter = pool.imap(pow, [1,2,3,4], [5,6,7,8])
result = list(result_iter)
assert result == _result

# do an asynchronous map, then get the results
result_queue = pool.amap(pow, [1,2,3,4], [5,6,7,8])
result = result_queue.get()
assert result == _result
    db_name = '03_Au_nSi_BW_transient' + '_%02.2fVp_%02.2fVrb_%03.2fK' % (V_p, abs(V_rb), T) + '.db'
    db_name = join(sample_data_dir, db_name)
    MyProject = Project(db_name=db_name, backend='sqlite', hostname='', overwrite=False)
    MyDiode = SchottkyDiode(MyProject, 'Au-Si_BW', Electrode, Si, DeadLayer=1.5e-7, L=5e-6)
    MyDiode.set_T(T)
    MyDiode.set_Va(V_p)
    print T

    type_sign = -1 if MyDiode.Semiconductor.dop_type == 'n' else 1
    Psi = Psi_approx(MyDiode.L, -(MyDiode.V_bi(eV=True) + type_sign * V_p), 0.0)

    Psi, E, z_nodes, rho_err_points, Vd, Vd_err, J, J_err, \
        BI_F, dopants_F, ic_id = Poisson.Reccurent_Poisson_solver(MyDiode, Psi, Vd_error=1e-6,
                                                                  equilibrium_filling=True, t=mp.inf,
                                                                  initial_condition_id=-1,
                                                                  rho_rel_err=Poisson_rel_err, max_iter=100,
                                                                  debug=False)
    return db_name


args = np.empty((len(T_range), 4), dtype=object)
args[:, 0] = silicon
args[:, 1] = electrode
args[:, 2] = T_range
args[:, 3] = V_p

pool = Pool()
db_names = np.array(pool.map(calculate, args))
print db_names

Example #42
                          'diode_voltage': diode_voltage, 'diode_voltage_error': diode_voltage_error,
                          'current_density': current_density, 'current_density_error': current_density_error}}
    for trap_key in bonding_interface_f.keys():
        data[temperature][trap_key] = bonding_interface_f[trap_key]
    for dopant_key in dopants_f_sum.keys():
        data[temperature][dopant_key + '_sum'] = dopants_f_sum[dopant_key]
    return potential, field, data, z, dopants_f

args = np.empty((len(temperature_range), 4), dtype=object)
args[:, 0] = silicon
args[:, 1] = electrode
args[:, 2] = temperature_range
args[:, 3] = [voltage_range for _ in range(len(temperature_range))]

pool = Pool()
results = np.array(pool.map(calculate_for_temperature, args))
potential = results[:, 0]
field = results[:, 1]
data = results[:, 2]
dopants_f = results[:, 4]

_, ax_iv = plt.subplots()
ax_iv.set_title('I-V temperature dependence')
ax_iv.set_xlabel('Voltage drop on diode, V')
ax_iv.set_ylabel('Current density, A / m^2')
for record in data:
    temperature = record.keys()[0]
    print 'T = %03.2f K' % temperature
    df = pd.DataFrame(record[temperature])
    df.plot(x='diode_voltage', y='current_density', ax=ax_iv, style='-o', label='%2.2f K' % temperature)