Example #1
def main():
    logger.info("Starting on %s with %d CPU's.", socket.gethostname(),
                multiprocessing.cpu_count())
    args = parse_args()
    if args.link_run is not None:
        run_json = link_json(args.link_run, args.data_path)
        run_json.has_runinfo = True
    else:
        json_path = os.path.join(args.data_path, 'input', 'AppSession.json')
        try:
            with open(json_path, 'r') as json_file:
                run_json = parse_json(json_file)
        except:
            if os.path.exists(json_path):
                # copy the input file to the output dir for postmortem analysis
                logger.error("Error occurred while parsing '%s'" % json_path)
                with open(json_path, 'r') as json_file:
                    file_cont = json_file.read()
                out_path = os.path.join(args.data_path, 'logs',
                                        'AppSession.json')
                with open(out_path, 'w') as json_file:
                    json_file.write(file_cont)
            else:
                logger.error("Error: no such file as '%s'" % json_path)
            raise
        # Do we have run_ids for all sample_ids ?
        if run_json.run_id is None:
            run_json.has_runinfo = False
        else:
            bs = BSrequest()
            sample_id_set = bs.check_run_sample_ids(
                [run_json.run_id], [s["Id"] for s in run_json.samples])
            run_json.has_runinfo = (len(sample_id_set) == len(
                run_json.samples))
        logger.info("setting json.has_run_info to %s" % run_json.has_runinfo)
    pssm = Pssm()

    scratch_path = os.path.join(args.data_path, 'scratch')
    makedirs(scratch_path)
    for filename in os.listdir(scratch_path):
        filepath = os.path.join(scratch_path, filename)
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)
    args.g2p_path = args.qc_path = create_app_result(args.data_path,
                                                     run_json,
                                                     suffix='results')
    if run_json.run_id is None:
        run_summary = None
    else:
        logger.info('Summarizing run.')
        run_summary = summarize_run(args, run_json)

    pool = Pool()
    pool.map(
        functools.partial(try_sample, run_info=run_json, args=args, pssm=pssm),
        range(len(run_json.samples)))

    pool.close()
    pool.join()
    collate_samples(args, run_json)
    if run_json.run_id is not None:
        summarize_samples(args, run_json, run_summary)
    logger.info('Done.')
Example #2
def quality_over_theta():
    number_dataset = 1
    data, target, enable_i = datasets[number_dataset]

    pool = Pool(processes=5)

    # if we want to average
    nb_launched = 5

    theta = 0.1

    data_final = {'WRAcc': [], 'theta': [], 'Algorithm': []}

    for i in range(10):
        print('Iteration: {}'.format(i))
        for _ in range(nb_launched):
            results_misere = pool.apply_async(misere, (data, target), {
                'time_budget': TIME_BUDGET_XP,
                'theta': theta
            })
            results_beam = pool.apply_async(
                beam_search, (data, target), {
                    'enable_i': enable_i,
                    'time_budget': TIME_BUDGET_XP,
                    'theta': theta
                })

            result_ucb_opti = pool.apply_async(
                seq_scout, (data, target), {
                    'enable_i': enable_i,
                    'time_budget': TIME_BUDGET_XP,
                    'theta': theta
                })

            results_misere = results_misere.get()
            results_beam = results_beam.get()
            result_ucb_opti = result_ucb_opti.get()

            if len(results_beam) < TOP_K:
                print("Too few beam: {}".format(len(results_beam)))
            if len(result_ucb_opti) < TOP_K:
                print("Too few seqscout: {}".format(len(result_ucb_opti)))
            if len(results_misere) < TOP_K:
                print("Too few misere: {}".format(len(results_misere)))

            data_add_generic(data_final,
                             WRAcc=max(0, average_results(results_misere)),
                             theta=theta,
                             Algorithm='misere')
            data_add_generic(data_final,
                             WRAcc=max(0, average_results(results_beam)),
                             theta=theta,
                             Algorithm='beam')
            data_add_generic(data_final,
                             WRAcc=max(0, average_results(result_ucb_opti)),
                             theta=theta,
                             Algorithm='seqscout')

        theta += 0.1

    df = pd.DataFrame(data=data_final)

    sns.set(rc={'figure.figsize': (8, 6.5)})

    plt.clf()
    ax = sns.lineplot(data=df, x='theta', y='WRAcc', hue='Algorithm')

    plt.savefig('./theta/over_theta.png')

    df.to_pickle('./theta/result')

    if SHOW:
        plt.show()
Example #3
def boxplot_dataset_iterations():
    pool = Pool(processes=5)
    xp_repeat = 5

    data_final = {'WRAcc': [], 'dataset': [], 'Algorithm': []}

    for i, (data, target, enable_i) in enumerate(datasets):
        print("Dataset {}".format(datasets_names[i]))

        for j in range(xp_repeat):
            results_misere = pool.apply_async(misere, (data, target),
                                              {'time_budget': TIME_BUDGET_XP})
            results_beam = pool.apply_async(beam_search, (data, target), {
                'enable_i': enable_i,
                'time_budget': TIME_BUDGET_XP
            })
            result_ucb_opti = pool.apply_async(seq_scout, (data, target), {
                'enable_i': enable_i,
                'time_budget': TIME_BUDGET_XP
            })

            results_misere = results_misere.get()
            results_beam = results_beam.get()
            result_ucb_opti = result_ucb_opti.get()

            if len(results_misere) < TOP_K:
                print("Too few examples on misere on dataset {}: {} results".
                      format(datasets_names[i], len(results_misere)))
            if len(results_beam) < TOP_K:
                print(
                    "Too few examples on beam_search on dataset {}: {} results".
                    format(datasets_names[i], len(results_beam)))
            if len(result_ucb_opti) < TOP_K:
                print("Too few examples on seqscout on dataset {}: {} results".
                      format(datasets_names[i], len(result_ucb_opti)))

            data_add_generic(data_final,
                             WRAcc=max(0, average_results(results_misere)),
                             dataset=datasets_names[i],
                             Algorithm='misere')
            data_add_generic(data_final,
                             WRAcc=max(0, average_results(results_beam)),
                             dataset=datasets_names[i],
                             Algorithm='beam')
            data_add_generic(data_final,
                             WRAcc=max(0, average_results(result_ucb_opti)),
                             dataset=datasets_names[i],
                             Algorithm='seqscout')

    df = pd.DataFrame(data=data_final)

    sns.set(rc={'figure.figsize': (8, 6.5)})

    plt.clf()
    ax = sns.barplot(x='dataset', y='WRAcc', hue='Algorithm', data=df)

    plt.savefig('./wracc_datasets/iterations_boxplot.png')
    df.to_pickle('./wracc_datasets/result')

    if SHOW:
        plt.show()
Example #4
    def _read_obs(self, stns_ids=None):

        # Saw severely decreased performance due to garbage collection when
        # pandas ran checks for chained assignment. Turn off this check
        # temporarily.
        opt_val = pd.get_option('mode.chained_assignment')
        pd.set_option('mode.chained_assignment', None)

        try:

            if stns_ids is None:
                stns_obs = self.stns
            else:
                stns_obs = self.stns.loc[stns_ids]
            
            nstns = len(stns_obs.station_id)
            nprocs = self.nprocs if nstns >= self.nprocs else nstns
            
            if self.has_start_end_dates:
                start_end = (self.start_date, self.end_date)
            else:
                start_end = None
            
            if nprocs > 1:
                
                # http://stackoverflow.com/questions/24171725/
                # scikit-learn-multicore-attributeerror-stdin-instance-
                # has-no-attribute-close
                if not hasattr(sys.stdin, 'close'):
                    def dummy_close():
                        pass
                    sys.stdin.close = dummy_close
                
                iter_stns = [(None, a_id, self.elems, start_end)
                             for a_id in stns_obs.station_id]
                
                pool = Pool(processes=nprocs)                
                obs = pool.map(_parse_ghcnd_dly_star_remote, iter_stns)
                
                pool.close()
                pool.join()
            
            else:
            
                obs = []
    
                for a_id in stns_obs.station_id:
                    
                    abuf = open_remote_file('https://www1.ncdc.noaa.gov/'
                                            'pub/data/ghcn/daily/all/%s.dly' % a_id)
                                       
                    obs_stn = _parse_ghcnd_dly(abuf, a_id, self.elems, start_end)
                    obs.append(obs_stn)

            df_obs = pd.concat(obs, ignore_index=True)

        finally:

            pd.set_option('mode.chained_assignment', opt_val)

        df_obs = df_obs.set_index(['station_id', 'elem', 'time'])
        df_obs = df_obs.sort_index(level=0, sort_remaining=True)

        return df_obs
Example #5
def show_quality_over_iterations_ucb(number_dataset):
    data, target, enable_i = datasets[number_dataset]

    # if we want to average
    nb_launched = 5
    pool = Pool(processes=3)

    iterations_limit = 50
    iterations_step = 1000

    data_final = {'WRAcc': [], 'iterations': [], 'Algorithm': []}

    for i in range(12):
        print('Iteration: {}'.format(i))

        for _ in range(nb_launched):
            results_misere = pool.apply_async(
                misere, (data, target), {
                    'time_budget': TIME_BUDGET_XP,
                    'iterations_limit': iterations_limit
                })
            results_beam = pool.apply_async(
                beam_search, (data, target), {
                    'enable_i': enable_i,
                    'time_budget': TIME_BUDGET_XP,
                    'iterations_limit': iterations_limit
                })

            result_ucb_opti = pool.apply_async(
                seq_scout, (data, target), {
                    'enable_i': enable_i,
                    'time_budget': TIME_BUDGET_XP,
                    'iterations_limit': iterations_limit
                })

            data_add_generic(data_final,
                             WRAcc=max(0,
                                       average_results(results_misere.get())),
                             iterations=iterations_limit,
                             Algorithm='misere')
            data_add_generic(data_final,
                             WRAcc=max(0, average_results(results_beam.get())),
                             iterations=iterations_limit,
                             Algorithm='beam')
            data_add_generic(data_final,
                             WRAcc=max(0,
                                       average_results(result_ucb_opti.get())),
                             iterations=iterations_limit,
                             Algorithm='seqscout')

        iterations_limit += iterations_step

    df = pd.DataFrame(data=data_final)

    sns.set(rc={'figure.figsize': (8, 6.5)})

    plt.clf()
    ax = sns.lineplot(data=df,
                      x='iterations',
                      y='WRAcc',
                      hue='Algorithm',
                      markers=True)

    plt.savefig('./iterations_ucb/over_iterations{}.png'.format(
        datasets_names[number_dataset]))
    df.to_pickle('./iterations_ucb/result{}'.format(
        datasets_names[number_dataset]))

    if SHOW:
        plt.show()
Example #6
def get_data_for_linker(
    catalog: str,
    entity: str,
    qids: Set[str],
    url_pids: Set[str],
    ext_id_pids_to_urls: Dict,
    qids_and_tids: Dict,
    fileout: TextIO,
) -> None:
    """Collect relevant data for linking Wikidata to a given catalog.
    Dump the result to a given output stream.

    This function uses multithreaded parallel processing.

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param qids: a set of QIDs
    :param url_pids: a set of PIDs holding URL values.
      Returned by :py:func:`soweego.wikidata.sparql_queries.url_pids`
    :param ext_id_pids_to_urls: a
      ``{PID: {formatter_URL: (id_regex, url_regex,)} }`` dict.
      Returned by
      :py:func:`soweego.wikidata.sparql_queries.external_id_pids_and_urls`
    :param fileout: a file stream open for writing
    :param qids_and_tids: a ``{QID: {'tid': {catalog_ID_set}}}`` dict.
      Populated by
      :py:func:`soweego.commons.data_gathering.gather_target_ids`
    """
    qid_buckets, request_params = _prepare_request(
        qids, 'labels|aliases|descriptions|sitelinks|claims')

    # Catalog-specific data needs
    if catalog in constants.REQUIRE_OCCUPATION.keys():
        needs_occupation = entity in constants.REQUIRE_OCCUPATION[catalog]
    else:
        needs_occupation = False
    needs_genre = entity in constants.REQUIRE_GENRE
    needs_publication_date = entity in constants.REQUIRE_PUBLICATION_DATE

    # Initialize 7 counters to 0
    # Indices legend:
    # 0 = claims
    # 1 = labels
    # 2 = aliases
    # 3 = descriptions
    # 4 = sitelinks
    # 5 = third-party URLs
    # 6 = third-party IDs
    counters = [0] * 7

    # Create a partial function where all parameters
    # but the data bucket are passed to `_process_bucket`,
    # so that we only pass the data bucket
    # when we call `pool_function`.
    # In this way, it becomes trivial to use
    # `multiprocessing.Pool` map functions, like `imap_unordered`
    pool_function = partial(
        _process_bucket,
        request_params=request_params,
        url_pids=url_pids,
        ext_id_pids_to_urls=ext_id_pids_to_urls,
        qids_and_tids=qids_and_tids,
        needs=(needs_occupation, needs_genre, needs_publication_date),
        counters=counters,
    )

    # Create a pool of threads and map the list of buckets via `pool_function`
    with Pool() as pool:
        # `processed_bucket` will be a list of dicts, where each dict
        # is a processed entity from the bucket
        for processed_bucket in pool.imap_unordered(
                pool_function, tqdm(qid_buckets, total=len(qid_buckets))):
            # Join results into a string so that we can write them to
            # the dump file
            to_write = ''.join(
                json.dumps(result, ensure_ascii=False) + '\n'
                for result in processed_bucket)

            fileout.write(to_write)
            fileout.flush()

    LOGGER.info(
        'QIDs: got %d with no expected claims, %d with no labels, '
        '%d with no aliases, %d with no descriptions, %d with no sitelinks, '
        '%d with no third-party links, %d with no external ID links',
        *counters)
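
The comment block above spells out the pattern: bind every fixed argument with functools.partial so that only the data bucket varies, then stream buckets through Pool.imap_unordered and consume results as they finish. A self-contained sketch of that pattern with hypothetical names (the tqdm progress bar used above is omitted):

from functools import partial
from multiprocessing import Pool

def process_bucket(bucket, shared_config):
    # placeholder worker: returns one dict per item in the bucket
    return [{'item': item, 'config': shared_config} for item in bucket]

def run(buckets, shared_config):
    worker = partial(process_bucket, shared_config=shared_config)
    with Pool() as pool:
        # imap_unordered yields each bucket's result as soon as it is ready
        for processed in pool.imap_unordered(worker, buckets):
            for record in processed:
                print(record)

if __name__ == '__main__':
    run([[1, 2], [3, 4]], shared_config={'needs_genre': False})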
Example #7
def _build_tobs_hdfs(path_out, fpaths_yrly, elems, nprocs=1):
    
    fpaths_yrly = np.array(fpaths_yrly)
    nprocs = nprocs if fpaths_yrly.size >= nprocs else fpaths_yrly.size
        
    stn_nums = pd.DataFrame([(np.nan, np.nan)], columns=['station_id', 'station_num'])
    num_inc = 0 
    
    first_append = {elem:True for elem in elems}
    
    # assume ~1.5 million rows per year to estimate expected number of rows
    erows = 1500000 * len(fpaths_yrly)
    
    def write_data(df_tobs, num_inc, stn_nums):
    
        hdfs = {elem:pd.HDFStore(os.path.join(path_out, '%s.hdf' % elem), 'a')
                for elem in elems}
                
        df_tobs.set_index('station_id', inplace=True)
        df_tobs['obs_value'] = df_tobs.obs_value.astype(np.int16)
        
        uids = pd.DataFrame(df_tobs.index.unique(), columns=['station_id'])
        uids = uids.merge(stn_nums, how='left', on='station_id')
        mask_nonum = uids.station_num.isnull()
        
        if mask_nonum.any():
            
            nums = np.arange(num_inc, (num_inc + mask_nonum.sum()))
            uids.loc[mask_nonum, 'station_num'] = nums
            num_inc = nums[-1] + 1
            stn_nums = pd.concat([stn_nums, uids[mask_nonum]], ignore_index=True)
        
        uids.set_index('station_id', inplace=True)
        uids['station_num'] = uids.station_num.astype(int)
        
        df_tobs = df_tobs.join(uids, how='left').set_index('station_num')
        grped = df_tobs.groupby('elem')
        
        for elem in elems:
            
            try:
                grp = grped.get_group(elem)[['time', 'obs_value']].copy()
            except KeyError:
                # no observation for element
                continue
            
            if first_append[elem]:
                
                hdfs[elem].append('df_tobs', grp, data_columns=['time'],
                                  expectedrows=erows, index=False)
                first_append[elem] = False
            
            else:
            
                hdfs[elem].append('df_tobs', grp, data_columns=['time'], index=False)
        
        for store in hdfs.values():
            store.close()
        
        return num_inc, stn_nums
    
    # Initialize output hdfs
    hdfs = [pd.HDFStore(os.path.join(path_out, '%s.hdf' % elem), 'w')
            for elem in elems]
    
    for store in hdfs:
        store.close()
    
    if nprocs > 1:
        
        # http://stackoverflow.com/questions/24171725/
        # scikit-learn-multicore-attributeerror-stdin-instance-
        # has-no-attribute-close
        if not hasattr(sys.stdin, 'close'):
            def dummy_close():
                pass
            sys.stdin.close = dummy_close
        
        for i in np.arange(fpaths_yrly.size, step=nprocs):
            
            fpaths = fpaths_yrly[i:(i + nprocs)]
            gc.collect()
            pool = Pool(processes=nprocs)                
            iter_files = [(fpath, elems) for fpath in fpaths]
            ls_tobs = pool.map(_parse_ghcnd_yrly_star, iter_files, chunksize=1)
            pool.close()
            pool.join()
            
            for df_tobs in ls_tobs:
            
                num_inc, stn_nums = write_data(df_tobs, num_inc, stn_nums)
                
            del df_tobs
            del ls_tobs
            
                
    else:
        
        for fpath in fpaths_yrly:
            
            df_tobs = _parse_ghcnd_yrly(fpath, elems)
            num_inc, stn_nums = write_data(df_tobs, num_inc, stn_nums)
    
    stn_nums = stn_nums.dropna()
    store_stnnums = pd.HDFStore(os.path.join(path_out, 'stn_nums.hdf'), 'w')
    store_stnnums.put('df_stnnums', stn_nums)
    store_stnnums.close()
    
    # Create indexes
    for elem in elems:
        
        with pd.HDFStore(os.path.join(path_out, '%s.hdf' % elem)) as store:
            
            store.create_table_index('df_tobs', optlevel=9, kind='full')
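
The parallel branch above works through the yearly files in chunks of nprocs, building a fresh Pool per chunk and calling gc.collect() in between, which keeps worker memory bounded across a long run. A stripped-down sketch of that batch loop with illustrative names; passing maxtasksperchild to a single long-lived Pool is a related way to recycle workers:

from multiprocessing import Pool

def parse_one(args):
    fpath, elems = args
    # placeholder for the real per-file parser (_parse_ghcnd_yrly_star-style)
    return {'path': fpath, 'elems': elems}

def parse_in_batches(fpaths, elems, nprocs):
    results = []
    for i in range(0, len(fpaths), nprocs):
        batch = fpaths[i:i + nprocs]
        # a fresh pool per batch: worker memory is released between batches
        with Pool(processes=nprocs) as pool:
            results.extend(pool.map(parse_one, [(f, elems) for f in batch],
                                    chunksize=1))
    return results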
Example #8
        response = client.rpc_add(1, 2)
        return response.data.result


def long_time_task():
    for ii in range(1000):
        # response = client.rpc_add(1, 3)

        client = get_client2()
        res = main2(client)

        # client = get_client()
        # res = main(client)
        print(ii)


if __name__ == '__main__':

    t = time.time()

    from multiprocessing.pool import Pool
    p = Pool()
    for i in range(4):
        p.apply_async(long_time_task, args=())
    p.close()
    p.join()

    print(time.time() - t)
    time.sleep(2)
    tracer.close()
Example #9
    def predict(self, prediction_object):
        threshold = self.prediction_threshold
        predictions = list()
        if isinstance(prediction_object, Commit):
            # Predict
            open_issues = [
                i for i in self.repository_obj.issues if
                # (len(i.states) == 0 or i.states[-1].to_ == IssueStates.open)
                # or
                (min([
                    abs(entity.timestamp - prediction_object.timestamp
                        ) if hasattr(entity, 'timestamp') and entity.timestamp
                    else timedelta(days=self.net_size_in_days, seconds=1)
                    for entity in [i.original_post] + i.states + i.actions
                ]) <= timedelta(days=self.net_size_in_days))
            ]
            open_issues += [null_issue]
            prediction_data = list()

            if len(open_issues) > 128:
                with Pool(processes=os.cpu_count() - 1) as wp:
                    for point in wp.map(func=Issue_Closure(
                            prediction_object, self.feature_generator),
                                        iterable=open_issues,
                                        chunksize=128):
                        prediction_data.append(point)
            else:
                for issue in open_issues:
                    prediction_data.append(
                        self.feature_generator.generate_features_commit(
                            issue, prediction_object, False))

            for point in prediction_data:
                probabilities = self.clf.predict_proba(
                    np.array(
                        tuple([
                            v for k, v in point.items()
                            if k not in ['linked', 'issue', 'commit']
                        ])).reshape(1, -1))
                if point['issue'] == 'null_issue':
                    threshold = max(threshold, probabilities[0][1])
                else:
                    prediction = (point['issue'], float(probabilities[0][1]))
                    predictions.append(prediction)
            predictions = sorted([p for p in predictions if p[1] >= threshold],
                                 key=lambda p: (p[1], p[0]),
                                 reverse=True)
            response = prediction_object.c_hash, predictions
        elif isinstance(prediction_object, Issue):
            # Predict
            candidates = [
                c for c in self.repository_obj.commits if (min([
                    abs(entity.timestamp - c.timestamp
                        ) if hasattr(entity, 'timestamp') and entity.timestamp
                    else timedelta(days=self.net_size_in_days, seconds=1)
                    for entity in [prediction_object.original_post] +
                    prediction_object.states + prediction_object.actions
                ]) <= timedelta(days=self.net_size_in_days))
            ]
            candidates += [null_commit]
            prediction_data = list()

            if len(candidates) > 128:
                with Pool(processes=os.cpu_count() - 1) as wp:
                    for point in wp.map(func=Commit_Closure(
                            prediction_object, self.feature_generator),
                                        iterable=candidates,
                                        chunksize=128):
                        prediction_data.append(point)
            else:
                for commit in candidates:
                    prediction_data.append(
                        self.feature_generator.generate_features_commit(
                            prediction_object, commit, False))

            for point in prediction_data:
                probabilities = self.clf.predict_proba(
                    np.array(
                        tuple([
                            v for k, v in point.items()
                            if k not in ['linked', 'issue', 'commit']
                        ])).reshape(1, -1))
                if point['commit'] == 'null_commit':
                    threshold = max(threshold, probabilities[0][1])
                else:
                    prediction = (point['commit'], float(probabilities[0][1]))
                    predictions.append(prediction)
            predictions = sorted([p for p in predictions if p[1] >= threshold],
                                 key=lambda p: (p[1], p[0]),
                                 reverse=True)
            response = prediction_object.id_, predictions
        if self.use_sim_cs or self.use_sim_j or self.use_sim_d or self.use_file:
            if self.predictions_from_last_tf_idf_update < self.predictions_between_updates:
                self.predictions_from_last_tf_idf_update += 1
            else:
                self.predictions_from_last_tf_idf_update = 0
                temporal_config = None
                self.model, self.dictionary, new_cache = generate_tfidf_commit(
                    self.repository_obj,
                    self.stopwords,
                    self.min_tok_len,
                    cache=self.feature_generator.text_cache)
                similarity_config = {
                    'dict': self.dictionary,
                    'model': self.model,
                    'min_len': self.min_tok_len,
                    'stopwords': self.stopwords,
                }
                if self.use_temporal:
                    self.fingerprint = None
                    temporal_config = {
                        'fingerprint': self.fingerprint,
                        'net_size_in_days': self.net_size_in_days,
                    }
                self.feature_generator = FeatureGenerator(
                    use_file=self.use_file,
                    use_sim_cs=self.use_sim_cs,
                    use_sim_j=self.use_sim_j,
                    use_sim_d=self.use_sim_d,
                    use_social=self.use_social,
                    use_temporal=self.use_temporal,
                    use_pr_only=self.use_pr_only,
                    use_issue_only=self.use_issue_only,
                    similarity_config=similarity_config,
                    temporal_config=temporal_config,
                    text_cache=new_cache,
                    selected=self.features,
                )
        return response
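
Both the commit and the issue branches above apply the same guard: only start a Pool when there are enough candidates (more than 128 here) to amortize worker start-up, and otherwise fall back to a plain loop. A stripped-down sketch of that decision, with hypothetical names:

import os
from multiprocessing.pool import Pool

PARALLEL_THRESHOLD = 128  # below this, spawning workers costs more than it saves

def featurize(item):
    # placeholder for the per-candidate feature computation
    return {'item': item}

def featurize_all(items):
    if len(items) > PARALLEL_THRESHOLD:
        with Pool(processes=max(1, (os.cpu_count() or 2) - 1)) as wp:
            return wp.map(featurize, items, chunksize=PARALLEL_THRESHOLD)
    return [featurize(item) for item in items]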
Example #10
    def execute(self):
        yesterday = (datetime.datetime.now() -
                     datetime.timedelta(days=1)).strftime("%Y-%m-%d 00:00:00")
        today = datetime.datetime.now().strftime("%Y-%m-%d 00:00:00")
        tomorrow = (datetime.datetime.now() +
                    datetime.timedelta(days=1)).strftime("%Y-%m-%d 00:00:00")

        # self.etl()

        sql = (
            "select * from {} where SBMT_TMSTMP >= to_timestamp('{}', 'yyyy-mm-dd hh24:mi:ss') "
            "and SBMT_TMSTMP < to_timestamp('{}', 'yyyy-mm-dd hh24:mi:ss') ")

        # KC21 empi
        batch_rows = self.batch_rows(sql.format(KC21, yesterday, today))
        while True:
            try:
                batch = next(batch_rows)
                m = Manager()
                share_ls = m.list()
                share_ls.extend(batch)
                del batch
                task = Pool(self.__process_count)
                for row in share_ls:
                    task.apply_async(self.empi, (row, "KC21"))
                task.close()
                task.join()
            except StopIteration:
                break

        batch_rows = self.batch_rows(sql.format(N041, yesterday, today))
        while True:
            try:
                batch = next(batch_rows)
                m = Manager()
                share_ls = m.list()
                share_ls.extend(batch)
                del batch
                task = Pool(self.__process_count)
                for row in share_ls:
                    task.apply_async(self.process, (row, "N041"))
                task.close()
                task.join()
            except StopIteration:
                break

        # pairing check
        # N041
        batch_rows = self.batch_rows(sql.format(N041, yesterday, today))

        while True:
            try:
                batch = next(batch_rows)
                m = Manager()
                share_ls = m.list()
                share_ls.extend(batch)
                del batch
                task = Pool(self.__process_count)
                for row in share_ls:
                    task.apply_async(self.pairing_check, (row, "N041"))
                task.close()
                task.join()
            except StopIteration:
                break

        # KC21
        batch_rows = self.batch_rows(sql.format(KC21, yesterday, today))

        while True:
            try:
                batch = next(batch_rows)
                m = Manager()
                share_ls = m.list()
                share_ls.extend(batch)
                del batch
                task = Pool(self.__process_count)
                for row in share_ls:
                    task.apply_async(self.pairing_check, (row, "KC21"))
                task.close()
                task.join()
            except StopIteration:
                break

        # KC24
        batch_rows = self.batch_rows(sql.format(KC24, yesterday, today))

        while True:
            try:
                batch = next(batch_rows)
                m = Manager()
                share_ls = m.list()
                share_ls.extend(batch)
                del batch
                task = Pool(self.__process_count)
                for row in share_ls:
                    task.apply_async(self.pairing_check, (row, "KC24"))
                task.close()
                task.join()
            except StopIteration:
                break

        sql = (
            "select * from {} where etl_date >= to_date('{}', 'yyyy-mm-dd hh24:mi:ss') "
            "and etl_date < to_date('{}', 'yyyy-mm-dd hh24:mi:ss')").format(
                ETL, today, tomorrow)

        row = self.row(sql)
        row["RUN"] = "True"
        self.insert_or_update(ETL, row, key="RUN")
Example #11
    def getting_start(self):
        sql = ("select * from {}")

        # KC21 EMPI
        print("Start processing...")

        batch_rows = self.batch_rows(sql.format(KC21))
        while True:
            try:
                batch = next(batch_rows)
                m = Manager()
                share_ls = m.list()
                share_ls.extend(batch)
                del batch
                task = Pool(self.__process_count)
                for row in share_ls:
                    task.apply_async(self.empi, (row, "KC21"))
                task.close()
                task.join()
            except StopIteration:
                break

        batch_rows = self.batch_rows(sql.format(N041))

        while True:
            try:
                batch = next(batch_rows)
                m = Manager()
                share_ls = m.list()
                share_ls.extend(batch)
                del batch
                task = Pool(self.__process_count)
                for row in share_ls:
                    task.apply_async(self.process, (row, "N041"))
                task.close()
                task.join()
            except StopIteration:
                break

        # pairing check
        # N041
        batch_rows = self.batch_rows(sql.format(N041))

        while True:
            try:
                batch = next(batch_rows)
                m = Manager()
                share_ls = m.list()
                share_ls.extend(batch)
                del batch
                task = Pool(self.__process_count)
                for row in share_ls:
                    task.apply_async(self.pairing_check, (row, "N041"))
                task.close()
                task.join()
            except StopIteration:
                break

        # KC21
        batch_rows = self.batch_rows(sql.format(KC21))

        while True:
            try:
                batch = next(batch_rows)
                m = Manager()
                share_ls = m.list()
                share_ls.extend(batch)
                del batch
                task = Pool(self.__process_count)
                for row in share_ls:
                    task.apply_async(self.pairing_check, (row, "KC21"))
                task.close()
                task.join()
            except StopIteration:
                break

        # KC24
        batch_rows = self.batch_rows(sql.format(KC24))

        while True:
            try:
                batch = next(batch_rows)
                m = Manager()
                share_ls = m.list()
                share_ls.extend(batch)
                del batch
                task = Pool(self.__process_count)
                for row in share_ls:
                    task.apply_async(self.pairing_check, (row, "KC24"))
                task.close()
                task.join()
            except StopIteration:
                break
Example #12
    meanexp = getMean(prediction_test)
    meanofdiff = getMeanofDiffs(ground_test, prediction_test)
    pvarfe = getPvar(ground_test, meanfe)
    pvarexp = getPvar(prediction_test, meanexp)
    ccc_test = getCCC(pvarfe, pvarexp, meanofdiff, meanfe, meanexp)
    print(f"CV = {i}, Test >>> gamma = {best_gamma}, C = {best_c}, RMSE. ={rmse_test}, Spearman = {spearman_test}, CCC = {ccc_test}")
    logger.info(
        f"CV = {i}, Test >>> gamma = {best_gamma}, C = {best_c}, RMSE. ={rmse_test}, Spearman = {spearman_test}, CCC = {ccc_test}")

    # Save
    df = pd.DataFrame(data={"vggish_prediction_D": prediction_test, "vggish_groundtruth_D": test_y.values.tolist()})
    df.to_csv(f"./Prediction_202106_Ratio631/CV{i}_vggish_Dominance_0621.csv")
    print("save success!")
    print(f">>>>>>> CV = {i}/10, Over Training >>>>>>>\n")
    logger.info(f">>>>>>> CV = {i}/10,Over Training >>>>>>>")
    return [rmse_test,spearman_values_test,ccc_test]

if __name__ == '__main__':
    pool = Pool(int(os.getenv('N_PROC', os.cpu_count())))
    futures = [pool.apply_async(func=svr, args=[i]) for i in range(1, 11)]
    pool.close()  # close the pool so it no longer accepts new tasks from the main process
    average_rmse_test, average_pearson_test, average_ccc_test = [], [], []
    for item in futures:
        result = item.get()
        average_rmse_test.append(result[0])
        average_pearson_test.append(result[1])
        average_ccc_test.append(result[2])
    print(f"Vggish Regression Average Results of Dominance: RMSE.= {mean(average_rmse_test)}, Spearman = {mean(average_pearson_test)}, CCC = {mean(average_ccc_test)}")
    logger.info(
        f"/n/n/n Vggish Regression Average Results of Dominance: RMSE.= {mean(average_rmse_test)}, Spearman = {mean(average_pearson_test)}, CCC = {mean(average_ccc_test)}")
    pool.join()
Example #13
    def validate(self,
                 do_mirroring: bool = True,
                 use_sliding_window: bool = True,
                 step_size: float = 0.5,
                 save_softmax: bool = True,
                 use_gaussian: bool = True,
                 overwrite: bool = True,
                 validation_folder_name: str = 'validation_raw',
                 debug: bool = False,
                 all_in_gpu: bool = False,
                 segmentation_export_kwargs: dict = None):

        current_mode = self.network.training
        self.network.eval()

        assert self.was_initialized, "must initialize, ideally with checkpoint (or train first)"
        if self.dataset_val is None:
            self.load_dataset()
            self.do_split()

        if segmentation_export_kwargs is None:
            if 'segmentation_export_params' in self.plans.keys():
                force_separate_z = self.plans['segmentation_export_params'][
                    'force_separate_z']
                interpolation_order = self.plans['segmentation_export_params'][
                    'interpolation_order']
                interpolation_order_z = self.plans[
                    'segmentation_export_params']['interpolation_order_z']
            else:
                force_separate_z = None
                interpolation_order = 1
                interpolation_order_z = 0
        else:
            force_separate_z = segmentation_export_kwargs['force_separate_z']
            interpolation_order = segmentation_export_kwargs[
                'interpolation_order']
            interpolation_order_z = segmentation_export_kwargs[
                'interpolation_order_z']

        output_folder = join(self.output_folder, validation_folder_name)
        maybe_mkdir_p(output_folder)

        if do_mirroring:
            mirror_axes = self.data_aug_params['mirror_axes']
        else:
            mirror_axes = ()

        pred_gt_tuples = []

        export_pool = Pool(2)
        results = []

        transpose_backward = self.plans.get('transpose_backward')

        for k in self.dataset_val.keys():
            properties = load_pickle(self.dataset[k]['properties_file'])
            data = np.load(self.dataset[k]['data_file'])['data']

            # concat segmentation of previous step
            seg_from_prev_stage = np.load(
                join(self.folder_with_segs_from_prev_stage,
                     k + "_segFromPrevStage.npz"))['data'][None]

            print(data.shape)
            data[-1][data[-1] == -1] = 0
            data_for_net = np.concatenate(
                (data[:-1],
                 to_one_hot(seg_from_prev_stage[0], range(1,
                                                          self.num_classes))))

            softmax_pred = self.predict_preprocessed_data_return_seg_and_softmax(
                data_for_net,
                do_mirroring=do_mirroring,
                mirror_axes=mirror_axes,
                use_sliding_window=use_sliding_window,
                step_size=step_size,
                use_gaussian=use_gaussian,
                all_in_gpu=all_in_gpu,
                mixed_precision=self.fp16)[1]

            if transpose_backward is not None:
                transpose_backward = self.plans.get('transpose_backward')
                softmax_pred = softmax_pred.transpose(
                    [0] + [i + 1 for i in transpose_backward])

            fname = properties['list_of_data_files'][0].split("/")[-1][:-12]

            if save_softmax:
                softmax_fname = join(output_folder, fname + ".npz")
            else:
                softmax_fname = None
            """There is a problem with python process communication that prevents us from communicating obejcts 
            larger than 2 GB between processes (basically when the length of the pickle string that will be sent is 
            communicated by the multiprocessing.Pipe object then the placeholder (\%i I think) does not allow for long 
            enough strings (lol). This could be fixed by changing i to l (for long) but that would require manually 
            patching system python code. We circumvent that problem here by saving softmax_pred to a npy file that will 
            then be read (and finally deleted) by the Process. save_segmentation_nifti_from_softmax can take either 
            filename or np.ndarray and will handle this automatically"""
            if np.prod(softmax_pred.shape) > (2e9 / 4 *
                                              0.85):  # *0.85 just to be safe
                np.save(fname + ".npy", softmax_pred)
                softmax_pred = fname + ".npy"

            results.append(
                export_pool.starmap_async(
                    save_segmentation_nifti_from_softmax,
                    ((softmax_pred, join(output_folder, fname + ".nii.gz"),
                      properties, interpolation_order,
                      self.regions_class_order, None, None, softmax_fname,
                      None, force_separate_z, interpolation_order_z), )))

            pred_gt_tuples.append([
                join(output_folder, fname + ".nii.gz"),
                join(self.gt_niftis_folder, fname + ".nii.gz")
            ])

        _ = [i.get() for i in results]

        task = self.dataset_directory.split("/")[-1]
        job_name = self.experiment_name
        _ = aggregate_scores(pred_gt_tuples,
                             labels=list(range(self.num_classes)),
                             json_output_file=join(output_folder,
                                                   "summary.json"),
                             json_name=job_name,
                             json_author="Fabian",
                             json_description="",
                             json_task=task)

        # in the old nnunet we would stop here. Now we add a postprocessing. This postprocessing can remove everything
        # except the largest connected component for each class. To see if this improves results, we do this for all
        # classes and then rerun the evaluation. Those classes for which this resulted in an improved dice score will
        # have this applied during inference as well
        self.print_to_log_file("determining postprocessing")
        determine_postprocessing(self.output_folder,
                                 self.gt_niftis_folder,
                                 validation_folder_name,
                                 final_subf_name=validation_folder_name +
                                 "_postprocessed",
                                 debug=debug)
        # after this the final predictions for the validation set can be found in validation_folder_name_base + "_postprocessed"
        # They are always in that folder, even if no postprocessing was applied!

        # determining postprocessing on a per-fold basis may be OK for this fold but what if another fold finds another
        # postprocessing to be better? In this case we need to consolidate. At the time the consolidation is going to be
        # done we won't know what self.gt_niftis_folder was, so now we copy all the niftis into a separate folder to
        # be used later
        gt_nifti_folder = join(self.output_folder_base, "gt_niftis")
        maybe_mkdir_p(gt_nifti_folder)
        for f in subfiles(self.gt_niftis_folder, suffix=".nii.gz"):
            success = False
            attempts = 0
            while not success and attempts < 10:
                try:
                    shutil.copy(f, gt_nifti_folder)
                    success = True
                except OSError:
                    attempts += 1
                    sleep(1)

        self.network.train(current_mode)
        export_pool.close()
        export_pool.join()
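
A compact sketch of the spill-to-disk workaround described in the docstring above; the helper name and the size constant are illustrative, and it relies on the documented fact that save_segmentation_nifti_from_softmax accepts either an array or a filename:

import numpy as np

PIPE_SAFE_BYTES = int(2e9 * 0.85)  # stay well below the ~2 GB pickle/pipe limit

def make_transferable(softmax_pred, npy_path):
    # Large arrays are written to disk and only the filename is handed to the
    # worker process; small arrays are passed (and pickled) directly.
    if softmax_pred.nbytes > PIPE_SAFE_BYTES:
        np.save(npy_path, softmax_pred)
        return npy_path
    return softmax_pred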
Example #14
    def run(self):
        # Count the number of files to process
        deb = time.time()
        self.fenetre.setpourcent(0)
        if len(os.listdir(self.data_entree)) == 0:
            self.fenetre.setInstruction("le dossier \"./results\" est vide.")
            return
        pas = 100. / len(os.listdir(self.data_entree))
        self.fenetre.setInstruction("Debut de la synthese des fichiers.")
        # for each file in ./results/
        MultiListe = []
        for file in sorted(os.listdir(self.data_entree)):
            if file.endswith('.csv'):
                MultiListe.append([file, self.frequency])
        pool = Pool(self.nbproc)
        Values = pool.map(description, MultiListe)

        for resultat in Values:
            self.summary.append({
                'name': (resultat[0])[:(resultat[0]).rfind('-')],
                'id': (resultat[0])[(resultat[0]).rfind('-') + 1:(resultat[0]).rfind('.')],
                'category': "\\(O_O)/",
                'measurement_tools': self.measurement_tools,
                'measurement_mode': self.measurement_mode,
                'measurement_system': self.measurement_system,
                'measurement_method': self.measurement_method,
                'measurement_protocol': self.measurement_protocol,
                'min_value': resultat[1],
                'number_of_samples': resultat[11],
                'max_value': resultat[2],
                'mean': resultat[3],
                'stdev': resultat[4],
                'median_value': resultat[5],
                'sum_square': resultat[6],
                'sum_square/frequency': resultat[7],
                'percentile_25': resultat[8],
                'percentile_50': resultat[9],
                'percentile_75': resultat[10]
            })

            self.fenetre.setpourcent(self.fenetre.getpourcent() + pas)

        self.summary.sort(key=lambda k: (k['name'], k['id']))
        summary = pd.DataFrame(
            self.summary,
            columns=[
                'name', 'id', 'category', 'measurement_tools',
                'measurement_system', 'measurement_mode', 'measurement_method',
                'measurement_protocol', 'number_of_samples', 'min_value',
                'max_value', 'mean', 'stdev', 'median_value', 'sum_square',
                'sum_square/frequency', 'percentile_25', 'percentile_50',
                'percentile_75'
            ])

        summary.to_csv('./synthese/{}_summary.csv'.format(self.data_sortie),
                       index=False,
                       sep=';')
        self.fenetre.setInstruction(
            "Work finished.\nTotal time taken for the summary: " +
            str(round((time.time() - deb), 2)) + " secs")
Example #15
def Pool(processes=None, initializer=None, initargs=()):
    '''
    Returns a process pool object
    '''
    from multiprocessing.pool import Pool
    return Pool(processes, initializer, initargs)
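
The function above is essentially the convenience wrapper that the multiprocessing package itself exposes. A hedged sketch of how its initializer/initargs parameters are typically used, with hypothetical names and the stdlib Pool imported directly:

from multiprocessing import Pool

_model = None  # populated once per worker process

def init_worker(model_path):
    # runs once in every worker; loads shared, read-only state
    global _model
    _model = {'path': model_path}  # placeholder for an expensive load

def score(sample):
    return (_model['path'], sample)

if __name__ == '__main__':
    with Pool(processes=4, initializer=init_worker, initargs=('model.bin',)) as pool:
        print(pool.map(score, range(8)))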
Example #16
def run(name):
    print("%s子进程开始,进程ID:%d" % (name, os.getpid()))
    start = time()
    # 可以通过这个方式,保证每个程序开始运行的时间有微小的差异,
    # 可应用于爬虫。
    sleep(random.choice([1, 2, 3, 4]))
    end = time()
    print("%s子进程结束,进程ID:%d。耗时%0.2f" % (name, os.getpid(), end - start))


if __name__ == "__main__":
    print("父进程开始")
    # 创建多个进程,表示可以同时执行的进程数量。默认大小是CPU的核心数
    # 通常获取逻辑cpu个数,然后设置进程数量。
    p = Pool(4)
    for i in range(10):
        # 创建进程,放入进程池统一管理
        # 通过异步运行的方式,每运行一个run,创建一个进程,这些进程由进程池管理。
        p.apply_async(run, args=(i, ))
        # 也可以用同步的方式启动,每个进程需要等待程序结束再继续加载。
        # p.apply(run, args=(i,))
    # 如果我们用的是进程池,在调用join()之前必须要先close(),
    # 并且在close()之后不能再继续往进程池添加新的进程
    p.close()
    # 进程池对象调用join,会等待进程池中所有的子进程结束完毕再去结束父进程
    p.join()
    print("父进程结束。")
    p.terminate()
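
The comments above stress that apply_async returns immediately and that close() must precede join(). If the return values (or exceptions raised in the workers) matter, keep the AsyncResult handles that apply_async returns and call get() on them; a short sketch:

from multiprocessing import Pool

def work(n):
    return n * n

if __name__ == '__main__':
    with Pool(4) as p:
        handles = [p.apply_async(work, args=(i,)) for i in range(10)]
        # get() blocks until the corresponding task finishes and
        # re-raises any exception the worker hit.
        results = [h.get() for h in handles]
    print(results)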

Example #17
    def train(self, dump_reader, parallel=True,
              pool_size=multiprocessing.cpu_count(), chunk_size=100):
        self._word_counter = multiprocessing.Value(c_uint64, 0)
        self._word_alpha = multiprocessing.RawValue(
            c_float, self._word_initial_alpha
        )
        self._entity_alpha = multiprocessing.RawValue(
            c_float, self._entity_initial_alpha
        )

        logger.info('Initializing weights...')
        syn0_shared = multiprocessing.RawArray(
            c_float, len(self.dictionary) * self._size
        )
        syn0 = np.frombuffer(syn0_shared, dtype=REAL)
        syn0 = syn0.reshape(len(self.dictionary), self._size)
        for w in self.dictionary:
            if isinstance(w, Word):
                np.random.seed(np.uint32(hash(w.text)))
            elif isinstance(w, Entity):
                np.random.seed(np.uint32(hash(w.title)))
            else:
                raise RuntimeError('Unknown type')

            syn0[w.index] = (np.random.rand(self._size) - 0.5) / self._size

        syn1_shared = multiprocessing.RawArray(
            c_float, len(self.dictionary) * self._size
        )
        syn1 = np.frombuffer(syn1_shared, dtype=REAL)
        syn1 = syn1.reshape(len(self.dictionary), self._size)
        syn1.fill(0)

        self._total_words = int(sum(
            w.count for w in self.dictionary.words()
        ))
        self._total_words *= self._iteration
        logger.info('Total number of words: %d', self._total_words)

        word_neg_table = self._build_word_neg_table()
        entity_neg_table = self._build_entity_neg_table()

        logger.info('Starting to train a model...')

        def iter_dump_reader():
            for n in range(self._iteration):
                logger.info('Iteration: %d', n)
                for page in dump_reader:
                    yield page

        init_args = (
            self, syn0_shared, syn1_shared, word_neg_table, entity_neg_table
        )

        if parallel:
            pool = Pool(pool_size, initializer=init_worker, initargs=init_args)
            imap_func = partial(pool.imap_unordered, chunksize=chunk_size)
        else:
            init_worker(*init_args)
            imap_func = imap

        for (n, _) in enumerate(imap_func(train_page, iter_dump_reader())):
            if n % 10000 == 0:
                prog = float(self._word_counter.value) / self._total_words
                logger.info(
                    'Processing page #%d progress: %.1f%% '
                    'word alpha: %.3f entity alpha: %.3f',
                    n, prog * 100, self._word_alpha.value,
                    self._entity_alpha.value
                )

        if parallel:
            pool.close()

        self.syn0 = syn0
        self.syn1 = syn1
        self._word_neg_table = word_neg_table
        self._entity_neg_table = entity_neg_table
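
The training code above shares syn0 and syn1 with the workers as multiprocessing.RawArray buffers handed to the pool initializer and re-wrapped with np.frombuffer, so the weight matrices are shared memory rather than pickled per task. A minimal sketch of that sharing pattern, with hypothetical names and float32 weights:

import multiprocessing
from ctypes import c_float
from multiprocessing.pool import Pool

import numpy as np

_shared = None  # set in each worker by init_worker

def init_worker(raw_array, shape):
    global _shared
    # wrap the shared buffer without copying; workers see the same memory
    _shared = np.frombuffer(raw_array, dtype=np.float32).reshape(shape)

def row_sum(i):
    return float(_shared[i].sum())

if __name__ == '__main__':
    shape = (1000, 8)
    raw = multiprocessing.RawArray(c_float, shape[0] * shape[1])
    weights = np.frombuffer(raw, dtype=np.float32).reshape(shape)
    weights[:] = 0.5
    with Pool(4, initializer=init_worker, initargs=(raw, shape)) as pool:
        total = sum(pool.imap_unordered(row_sum, range(shape[0])))
    print(total)  # 0.5 * 8 * 1000 = 4000.0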
Example #18
	initClosure(closurePath)
	initAnnotations(annotationsFilePath)
	
#	cafaFiles= ["/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outM1.txt","/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outM2.txt","/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outR.txt","/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outX.txt", "/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outY.txt", "/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outZ.txt", "/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outM2R.txt" ]
	cafaFiles= ["/mnt/home/hampt/workspace/doctorProject/src/CAFA/sprot_go.fasta.test.outX.txt"  ]
	

	
	methodToTargetToTermToScore = {}
	inputs = []

	for filePath in cafaFiles:
		inputs.append((filePath))
	
	print inputs
	pool = Pool(processes=10)
	resultMaps = pool.map(fillData, inputs, chunksize=1)
	for resultMap in resultMaps:
		for key, val in resultMap.iteritems():
			methodToTargetToTermToScore[key] = val
	
	targetToTermToMethodToScore = collections.defaultdict(dict)
	for method, methodDict in methodToTargetToTermToScore.iteritems():
		for target, targetDict in methodDict.iteritems():
			for term, score in targetDict.iteritems():
				targetToTermToMethodToScore[target].setdefault(term,{})[method] = score
	
#	outMeta = open("/mnt/home/hampt/workspace/doctorProject/src/CAFA/meta2.out",'w')
#	i=0
#	for target, targetDict in targetToTermToMethodToScore.iteritems():
#			for term, termDict in targetDict.iteritems():
Example #19
import subprocess
from multiprocessing.pool import Pool

# mocking your code
modules = ["mod1.onnx", "mod2.onnx"]

def run(args):
  # pool.map will call run() with a tuple, deconstruct it
  idx, mod = args

  with open("output.txt", "w") as fp:
    subprocess.run(["./SOME_PROGRAM", str(idx)], stdout=fp)
    print("module", mod, "has finished")

# with no argument, Pool() starts one worker process per CPU core
with Pool() as pool:
  pool.map(run, enumerate(modules))
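
As the comment notes, Pool.map hands each task a single argument, hence the enumerate plus tuple unpacking. Pool.starmap does the unpacking itself; a hedged variant of the snippet above using it, and writing one output file per index so concurrent tasks do not overwrite the shared output.txt:

import subprocess
from multiprocessing.pool import Pool

modules = ["mod1.onnx", "mod2.onnx"]

def run(idx, mod):
    # starmap unpacks each (idx, mod) tuple into two arguments
    with open("output_%d.txt" % idx, "w") as fp:
        subprocess.run(["./SOME_PROGRAM", str(idx)], stdout=fp)
        print("module", mod, "has finished")

if __name__ == '__main__':
    with Pool() as pool:
        pool.starmap(run, enumerate(modules))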
Example #20
            wer_inst = decoder.wer(transcript, reference)
            cer_inst = decoder.cer(transcript, reference)
            total_cer += cer_inst
            total_wer += wer_inst
            num_tokens += len(reference.split())
            num_chars += len(reference)

    wer = float(total_wer) / num_tokens
    cer = float(total_cer) / num_chars

    return [lm_alpha, lm_beta, wer * 100, cer * 100]


if __name__ == '__main__':
    p = Pool(args.num_workers, init,
             [args.beam_width,
              model.labels.index('_'), args.lm_path])

    cand_alphas = np.linspace(args.lm_alpha_from, args.lm_alpha_to,
                              args.lm_num_alphas)
    cand_betas = np.linspace(args.lm_beta_from, args.lm_beta_to,
                             args.lm_num_betas)
    params_grid = [(float(alpha), float(beta)) for alpha in cand_alphas
                   for beta in cand_betas]

    scores = []
    for params in tqdm(p.imap(decode_dataset, params_grid),
                       total=len(params_grid)):
        scores.append(list(params))
    print("Saving tuning results to: {}".format(args.output_path))
    with open(args.output_path, "w") as fh:
Example #21
    def _read_obs(self, stns_ids=None):

        # Saw severely decreased performance due to garbage collection when
        # pandas ran checks for chained assignment. Turn off this check
        # temporarily.
        opt_val = pd.get_option('mode.chained_assignment')
        pd.set_option('mode.chained_assignment', None)

        try:

            if stns_ids is None:
                stns_obs = self.stns
            else:
                stns_obs = self.stns.loc[stns_ids]
            
            nstns = len(stns_obs.station_id)
            nprocs = self.nprocs if nstns >= self.nprocs else nstns
            
            if self.has_start_end_dates:
                start_end = (self.start_date, self.end_date)
            else:
                start_end = None
            
            if nprocs > 1:
                
                # http://stackoverflow.com/questions/24171725/
                # scikit-learn-multicore-attributeerror-stdin-instance-
                # has-no-attribute-close
                if not hasattr(sys.stdin, 'close'):
                    def dummy_close():
                        pass
                    sys.stdin.close = dummy_close
                
                iter_stns = [(os.path.join(self.path_ghcnd_data, 'ghcnd_all',
                                           '%s.dly' % a_id), a_id, self._elems,
                              start_end) for a_id in stns_obs.station_id]
                
                pool = Pool(processes=nprocs)                
                obs = pool.map(_parse_ghcnd_dly_star, iter_stns)
                
                pool.close()
                pool.join()
            
            else:
            
                obs = []
    
                for a_id in stns_obs.station_id:
                    
                    fpath = os.path.join(self.path_ghcnd_data, 'ghcnd_all',
                                         '%s.dly' % a_id)
                                       
                    obs_stn = _parse_ghcnd_dly(fpath, a_id, self._elems, start_end)
                    obs.append(obs_stn)

            df_obs = pd.concat(obs, ignore_index=True)

            if self._has_tobs:
                
                stnnums = stns_obs.join(self._df_tobs_stnnums).dropna(subset=['station_num'])
                
                if not stnnums.empty:
                    
                    stnnums = stnnums.reset_index(drop=True).set_index('station_num')
                        
                    select_str = "index = a_num"
                                    
                    df_tobs = []
                    path_yrly = os.path.join(self.path_ghcnd_data, 'by_year')
                    
                    for elem in self._elems_tobs:
                        
                        store = pd.HDFStore(os.path.join(path_yrly, '%s.hdf' % elem))
                        
                        # Perform separate read for each station.
                        # Had this in a single call using "index in stnnums"
                        # but memory usage was too high
                    
                        for a_num in stnnums.index:

                            elem_tobs = store.select('df_tobs', select_str).reset_index()
                            elem_tobs['elem'] = elem
                            elem_tobs['station_id'] = stnnums.station_id.loc[a_num]
                                                        
                            df_tobs.append(elem_tobs[['time', 'elem', 'obs_value',
                                                      'station_id']])
                        store.close()
                        del store
                        gc.collect()
                    
                    df_tobs = pd.concat(df_tobs, ignore_index=True)
                    
                    if self.has_start_end_dates:
                        
                        df_tobs = df_tobs[(df_tobs.time >= self.start_date) & 
                                          (df_tobs.time <= self.end_date)]
                    
                    df_obs = pd.concat([df_obs, df_tobs], ignore_index=True)

        finally:

            pd.set_option('mode.chained_assignment', opt_val)

        df_obs = df_obs.set_index(['station_id', 'elem', 'time'])
        # sortlevel() has been removed from pandas; sort_index() is the equivalent call.
        df_obs = df_obs.sort_index(level=0, sort_remaining=True)

        return df_obs
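
The try/finally around the pandas option above can also be written with the library's own context manager. A minimal sketch of that alternative (not the original code, just the same save/suppress/restore idea):

import pandas as pd

# Suppress the chained-assignment warning only inside the block; the previous
# option value is restored automatically on exit.
with pd.option_context('mode.chained_assignment', None):
    df = pd.DataFrame({'a': [1, 2, 3]})
    subset = df[df.a > 1]
    subset['a'] = 0  # would ordinarily emit a SettingWithCopyWarning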
Beispiel #22
0
def run_cv(f, n_proc):
    p = Pool(n_proc)
    p.map(f, range(len(configs)))
    p.close()  # no more tasks
    p.join()  # wrap up current tasks
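
run_cv relies on a module-level configs list that is not shown here. A hypothetical usage sketch, with configs and the mapped function as illustrative assumptions:

configs = [{'fold': i, 'lr': 0.01} for i in range(5)]  # assumed shape of configs


def train_one_fold(idx):
    # The mapped function must be defined at module level so it can be pickled.
    cfg = configs[idx]
    print('training fold %d with lr %.3f' % (cfg['fold'], cfg['lr']))


if __name__ == '__main__':
    run_cv(train_one_fold, n_proc=2)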
Beispiel #23
0
def save_image(item):
    try:
        local_image_url = item.get('image')
        new_image_url = local_image_url.replace('list', 'large')
        response = requests.get('http:' + new_image_url)
        if response.status_code == 200:
            file_path = '{0}/{1}.{2}'.format(item.get('title'),
                                             md5(response.content).hexdigest(),
                                             'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print('Already Downloaded', file_path)
    except requests.ConnectionError:
        print('Failed to save image')


def main(offset):
    json = get_page(offset)
    for item in get_images(json):
        print(item)
        save_image(item)


if __name__ == '__main__':
    pool = Pool()
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    pool.map(main, groups)
    pool.close()
    pool.join()
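
save_image above writes into a directory named after item['title'] but, as shown here, never creates it, so the open() call can fail on a fresh run. A hedged variant of the save step that creates the folder first (illustrative only, with the same URL and title fields assumed):

import os
from hashlib import md5

import requests


def save_image_safe(item):
    local_image_url = item.get('image')
    if not local_image_url:
        return
    try:
        response = requests.get('http:' + local_image_url.replace('list', 'large'))
    except requests.ConnectionError:
        print('Failed to save image')
        return
    if response.status_code == 200:
        folder = item.get('title') or 'images'
        os.makedirs(folder, exist_ok=True)  # make sure the target directory exists
        file_path = '{0}/{1}.{2}'.format(folder, md5(response.content).hexdigest(), 'jpg')
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(response.content)
        else:
            print('Already Downloaded', file_path)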
Beispiel #24
0
def Pool(processes=None, initializer=None, initargs=(), maxtasksperchild=None):
    '''
    Returns a process pool object
    '''
    from multiprocessing.pool import Pool
    return Pool(processes, initializer, initargs, maxtasksperchild)
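
A quick usage sketch for the wrapper above: it simply forwards to multiprocessing.pool.Pool, so the returned object supports map/close/join and the context-manager protocol.

def square(x):
    return x * x


if __name__ == '__main__':
    with Pool(processes=2, maxtasksperchild=10) as p:
        print(p.map(square, range(8)))  # [0, 1, 4, 9, 16, 25, 36, 49]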
Beispiel #25
0
def other_measures():
    pool = Pool(processes=5)
    xp_repeat = 5
    nb_iterations = 1000

    for i, (data, target, enable_i) in enumerate(datasets):
        print("Dataset {}".format(datasets_names[i]))

        for measure in ['Informedness', 'F1']:
            mean_misere = 0
            mean_beam = 0
            mean_seqscout = 0

            for j in range(xp_repeat):
                results_misere = pool.apply_async(
                    misere, (data, target), {
                        'time_budget': TIME_BUDGET_XP,
                        'quality_measure': measure,
                        'iterations_limit': nb_iterations
                    })
                results_beam = pool.apply_async(
                    beam_search, (data, target), {
                        'enable_i': enable_i,
                        'time_budget': TIME_BUDGET_XP,
                        'quality_measure': measure,
                        'iterations_limit': nb_iterations
                    })

                result_ucb_opti = pool.apply_async(
                    seq_scout, (data, target), {
                        'enable_i': enable_i,
                        'time_budget': TIME_BUDGET_XP,
                        'quality_measure': measure,
                        'iterations_limit': nb_iterations
                    })

                results_misere = results_misere.get()
                results_beam = results_beam.get()
                result_ucb_opti = result_ucb_opti.get()

                if len(results_misere) < TOP_K:
                    print(
                        "Too few examples from misere on dataset {}: {} results".
                        format(datasets_names[i], len(results_misere)))
                if len(results_beam) < TOP_K:
                    print(
                        "Too few examples from beam_search on dataset {}: {} results"
                        .format(datasets_names[i], len(results_beam)))
                if len(result_ucb_opti) < TOP_K:
                    print(
                        "Too few examples from seqscout on dataset {}: {} results"
                        .format(datasets_names[i], len(result_ucb_opti)))

                mean_misere += average_results(results_misere)
                mean_beam += average_results(results_beam)
                mean_seqscout += average_results(result_ucb_opti)

            mean_misere = mean_misere / xp_repeat
            mean_beam = mean_beam / xp_repeat
            mean_seqscout = mean_seqscout / xp_repeat

            print(
                'For dataset {}, measure {}, algorithm misere, the mean score is: {}'
                .format(datasets_names[i], measure, mean_misere))
            print(
                'For dataset {}, measure {}, algorithm beam_search, the mean score is: {}'
                .format(datasets_names[i], measure, mean_beam))
            print(
                'For dataset {}, measure {}, algorithm seqscout, the mean score is: {}'
                .format(datasets_names[i], measure, mean_seqscout))
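
average_results is not part of this excerpt. A hedged sketch of what such a helper could look like, assuming each result is a (quality, pattern) pair and the reported score is the mean quality of the returned top-k patterns:

def average_results(results):
    # Illustrative stand-in, not the project's actual implementation.
    if not results:
        return 0
    return sum(quality for quality, _ in results) / len(results)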
Beispiel #26
0
import subprocess
import pickle
import sys
import ntpath
from threading import Lock
from multiprocessing.pool import ThreadPool as Pool
import xml.etree.ElementTree as ET
import urllib2

checksums_dict = {}
checksums_dict_lock = Lock()
file_path_dict = {}
file_path_dict_lock = Lock()

pool_size = 5  # your "parallelness"
pool = Pool(pool_size)


class FileAttributes:
    def __init__(self):
        self.file_path = None
        self.file_name = None
        self.checksum = None
        self.itunes_key = -1
        self.itunes_file_path = None


def path_leaf(path):
    ''' Taken from https://stackoverflow.com/questions/8384737/extract-file-name-from-path-no-matter-what-the-os-path-format'''
    head, tail = ntpath.split(path)
    return tail or ntpath.basename(head)
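
A quick check of path_leaf: it returns the final component even when the path ends with a separator, and ntpath also accepts forward slashes.

print(path_leaf(r'C:\music\album\track.mp3'))  # -> 'track.mp3'
print(path_leaf('/music/album/'))              # -> 'album'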
Beispiel #27
0
def quality_over_size():
    number_dataset = 6
    data_origin, target, enable_i = datasets[number_dataset]

    pool = Pool(processes=3)

    # if we want to average
    nb_launched = 5

    size = 15
    size_step = 4
    data_final = {'WRAcc': [], 'size': [], 'Algorithm': []}

    for i in range(10):
        print('Iteration: {}'.format(i))
        data = reduce_k_length(size, data_origin)
        for i in range(nb_launched):
            results_misere = pool.apply_async(misere, (data, target),
                                              {'time_budget': TIME_BUDGET_XP})

            results_beam = pool.apply_async(beam_search, (data, target), {
                'enable_i': enable_i,
                'time_budget': TIME_BUDGET_XP
            })

            result_ucb_opti = pool.apply_async(seq_scout, (data, target), {
                'enable_i': enable_i,
                'time_budget': TIME_BUDGET_XP
            })

            results_misere = results_misere.get()
            results_beam = results_beam.get()
            result_ucb_opti = result_ucb_opti.get()

            if len(results_beam) < TOP_K:
                print("Too few beam: {}".format(len(results_beam)))
            if len(result_ucb_opti) < TOP_K:
                print("Too few seqscout: {}".format(len(result_ucb_opti)))
            if len(results_misere) < TOP_K:
                print("Too few misere: {}".format(len(results_misere)))

            data_add_generic(data_final,
                             WRAcc=max(0, average_results(results_misere)),
                             size=size,
                             Algorithm='misere')
            data_add_generic(data_final,
                             WRAcc=max(0, average_results(results_beam)),
                             size=size,
                             Algorithm='beam')
            data_add_generic(data_final,
                             WRAcc=max(0, average_results(result_ucb_opti)),
                             size=size,
                             Algorithm='seqscout')

        size += size_step

    df = pd.DataFrame(data=data_final)

    sns.set(rc={'figure.figsize': (8, 6.5)})

    plt.clf()
    ax = sns.lineplot(data=df, x='size', y='WRAcc', hue='Algorithm')
    ax.set(xlabel='Length max', ylabel='WRAcc')

    # ax.set(xlabel='Time(s)', ylabel='Average WRAcc top-10 patterns')

    plt.savefig('./space_size/over_size.png')
    df.to_pickle('./space_size/result')

    if SHOW:
        plt.show()
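
data_add_generic is not defined in this excerpt. Judging from how it is called, a plausible minimal implementation (an assumption, not the project's actual helper) would append one row to the dict-of-lists that later becomes the DataFrame:

def data_add_generic(data_final, **kwargs):
    # Keys must match the columns prepared in data_final ('WRAcc', 'size', 'Algorithm').
    for key, value in kwargs.items():
        data_final[key].append(value)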
Beispiel #28
0
    def validate(self,
                 do_mirroring: bool = True,
                 use_train_mode: bool = False,
                 tiled: bool = True,
                 step: int = 2,
                 save_softmax: bool = True,
                 use_gaussian: bool = True,
                 overwrite: bool = True,
                 validation_folder_name: str = 'validation_raw',
                 debug: bool = False,
                 all_in_gpu: bool = False,
                 force_separate_z: bool = None,
                 interpolation_order: int = 3,
                 interpolation_order_z=0):
        assert self.was_initialized, "must initialize, ideally with checkpoint (or train first)"

        # save whether network is in deep supervision mode or not
        ds = self.network.do_ds
        # disable deep supervision
        self.network.do_ds = False

        if self.dataset_val is None:
            self.load_dataset()
            self.do_split()

        output_folder = join(self.output_folder, validation_folder_name)
        maybe_mkdir_p(output_folder)
        # this is for debug purposes
        my_input_args = {
            'do_mirroring': do_mirroring,
            'use_train_mode': use_train_mode,
            'tiled': tiled,
            'step': step,
            'save_softmax': save_softmax,
            'use_gaussian': use_gaussian,
            'overwrite': overwrite,
            'validation_folder_name': validation_folder_name,
            'debug': debug,
            'all_in_gpu': all_in_gpu,
            'force_separate_z': force_separate_z,
            'interpolation_order': interpolation_order,
            'interpolation_order_z': interpolation_order_z,
        }
        save_json(my_input_args, join(output_folder, "validation_args.json"))

        if do_mirroring:
            if not self.data_aug_params['do_mirror']:
                raise RuntimeError(
                    "We did not train with mirroring so you cannot do inference with mirroring enabled"
                )
            mirror_axes = self.data_aug_params['mirror_axes']
        else:
            mirror_axes = ()

        pred_gt_tuples = []

        export_pool = Pool(default_num_threads)
        results = []

        for k in self.dataset_val.keys():
            properties = self.dataset[k]['properties']
            fname = properties['list_of_data_files'][0].split("/")[-1][:-12]

            if overwrite or (not isfile(join(output_folder, fname + ".nii.gz"))) or \
                    (save_softmax and not isfile(join(output_folder, fname + ".npz"))):
                data = np.load(self.dataset[k]['data_file'])['data']

                # concat segmentation of previous step
                seg_from_prev_stage = np.load(
                    join(self.folder_with_segs_from_prev_stage,
                         k + "_segFromPrevStage.npz"))['data'][None]

                print(k, data.shape)
                data[-1][data[-1] == -1] = 0
                data_for_net = np.concatenate(
                    (data[:-1],
                     to_one_hot(seg_from_prev_stage[0],
                                range(1, self.num_classes))))
                softmax_pred = self.predict_preprocessed_data_return_softmax(
                    data_for_net,
                    do_mirroring,
                    1,
                    use_train_mode,
                    1,
                    mirror_axes,
                    tiled,
                    True,
                    step,
                    self.patch_size,
                    use_gaussian=use_gaussian,
                    all_in_gpu=all_in_gpu)

                softmax_pred = softmax_pred.transpose(
                    [0] + [i + 1 for i in self.transpose_backward])

                if save_softmax:
                    softmax_fname = join(output_folder, fname + ".npz")
                else:
                    softmax_fname = None
                """There is a problem with python process communication that prevents us from communicating obejcts 
                larger than 2 GB between processes (basically when the length of the pickle string that will be sent is 
                communicated by the multiprocessing.Pipe object then the placeholder (\%i I think) does not allow for long 
                enough strings (lol). This could be fixed by changing i to l (for long) but that would require manually 
                patching system python code. We circumvent that problem here by saving softmax_pred to a npy file that will 
                then be read (and finally deleted) by the Process. save_segmentation_nifti_from_softmax can take either 
                filename or np.ndarray and will handle this automatically"""
                if np.prod(softmax_pred.shape) > (
                        2e9 / 4 * 0.85):  # *0.85 just to be save
                    np.save(join(output_folder, fname + ".npy"), softmax_pred)
                    softmax_pred = join(output_folder, fname + ".npy")
                results.append(
                    export_pool.starmap_async(
                        save_segmentation_nifti_from_softmax,
                        ((softmax_pred, join(output_folder,
                                             fname + ".nii.gz"), properties,
                          interpolation_order, None, None, None, softmax_fname,
                          force_separate_z, interpolation_order_z), )))

            pred_gt_tuples.append([
                join(output_folder, fname + ".nii.gz"),
                join(self.gt_niftis_folder, fname + ".nii.gz")
            ])

        _ = [i.get() for i in results]
        self.print_to_log_file("finished prediction")

        # evaluate raw predictions
        self.print_to_log_file("evaluation of raw predictions")
        task = self.dataset_directory.split("/")[-1]
        job_name = self.experiment_name
        _ = aggregate_scores(
            pred_gt_tuples,
            labels=list(range(self.num_classes)),
            json_output_file=join(output_folder, "summary.json"),
            json_name=job_name + " val tiled %s" % (str(tiled)),
            json_author="Fabian",
            json_task=task,
            num_threads=default_num_threads)

        # in the old nnunet we would stop here. Now we add a postprocessing. This postprocessing can remove everything
        # except the largest connected component for each class. To see if this improves results, we do this for all
        # classes and then rerun the evaluation. Those classes for which this resulted in an improved dice score will
        # have this applied during inference as well
        self.print_to_log_file("determining postprocessing")
        determine_postprocessing(self.output_folder,
                                 self.gt_niftis_folder,
                                 validation_folder_name,
                                 final_subf_name=validation_folder_name +
                                 "_postprocessed",
                                 debug=debug)
        # after this the final predictions for the validation set can be found in validation_folder_name_base + "_postprocessed"
        # They are always in that folder, even if no postprocessing was applied!

        # determining postprocessing on a per-fold basis may be OK for this fold but what if another fold finds another
        # postprocessing to be better? In this case we need to consolidate. At the time the consolidation is going to be
        # done we won't know what self.gt_niftis_folder was, so now we copy all the niftis into a separate folder to
        # be used later
        gt_nifti_folder = join(self.output_folder_base, "gt_niftis")
        maybe_mkdir_p(gt_nifti_folder)
        for f in subfiles(self.gt_niftis_folder, suffix=".nii.gz"):
            success = False
            attempts = 0
            last_exception = None
            while not success and attempts < 10:
                try:
                    shutil.copy(f, gt_nifti_folder)
                    success = True
                except OSError as e:
                    # Python 3 unbinds 'e' at the end of the except block, so keep
                    # our own reference for re-raising after the retries.
                    last_exception = e
                    attempts += 1
                    sleep(1)
            if not success:
                print("Could not copy gt nifti file %s into folder %s" %
                      (f, gt_nifti_folder))
                if last_exception is not None:
                    raise last_exception

        # restore network deep supervision mode
        self.network.do_ds = ds
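
The docstring inside validate() describes working around the roughly 2 GB pickle limit of multiprocessing by passing a filename instead of a large array. A standalone sketch of that pattern, stripped of the nnU-Net specifics (file names and the worker function here are illustrative):

import os
import tempfile

import numpy as np
from multiprocessing import Pool


def consume(arr_or_path):
    # Accept either an in-memory array or a path to an .npy file on disk.
    arr = np.load(arr_or_path) if isinstance(arr_or_path, str) else arr_or_path
    return float(arr.sum())


if __name__ == '__main__':
    big = np.zeros((1000, 1000), dtype=np.float32)
    path = os.path.join(tempfile.gettempdir(), 'softmax_demo.npy')
    np.save(path, big)  # hand the worker a filename instead of pickling the array
    with Pool(2) as pool:
        print(pool.apply_async(consume, (path,)).get())
    os.remove(path)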
Beispiel #29
0
def mmap_(fn: Callable[[A], B], iter: Iterable[A]) -> List[B]:
    return Pool().map(fn, iter)
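
Note that mmap_ creates a fresh pool on every call and never closes it. A usage sketch (the mapped function must live at module level so it can be pickled, and the call should be guarded on platforms that spawn workers):

def double(x: int) -> int:
    return 2 * x


if __name__ == '__main__':
    print(mmap_(double, range(5)))  # [0, 2, 4, 6, 8]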
Beispiel #30
0
    def post(self):
        try:
            # set up post request parameters
            parser.add_argument('gre')
            parser.add_argument('toefl')
            parser.add_argument('grade')
            parser.add_argument('email')
            parser.add_argument('uid')
            parser.add_argument('work_ex')
            parser.add_argument('lor1')
            parser.add_argument('lor2')
            parser.add_argument('lor3')
            parser.add_argument('lor4')

            # Setup google storage for access
            storage_client = storage.Client.from_service_account_json(
                'key1.json')
            bucket = storage_client.bucket('mf-frontend.appspot.com')

            # Retrieve post request values
            args = parser.parse_args()

            gre = args['gre']
            toefl = args['toefl']
            grade = args['grade']
            email = args['email']
            uid = args['uid']
            work_ex = args['work_ex']
            lor1 = args['lor1']
            lor2 = args['lor2']
            lor3 = args['lor3']
            lor4 = args['lor4']

            # Retrieve resume and convert to image
            blob = bucket.blob('resume/' + uid + '/' + uid + '_resume.pdf')
            blob.download_to_filename('/tmp/resume.pdf')
            doc = fitz.open('/tmp/resume.pdf')
            mat = fitz.Matrix(fitz.Identity)
            resume = doc[0].getPixmap(alpha=False, matrix=mat)
            resume.writePNG("/tmp/resume.png")
            resume = np.array(Image.open('/tmp/resume.png'))
            logging.info("Loaded resume")

            # Retrieve sop and extract text
            blob = bucket.blob('sop/' + uid + '/' + uid + '_sop.pdf')
            blob.download_to_filename('/tmp/sop.pdf')
            doc = fitz.open('/tmp/sop.pdf')
            pages = len(doc)
            sop = ""
            for page in range(pages):
                sop += doc[page].getText()
            logging.info("Loaded sop")

            # Make calls to second backend for all universities
            data = {
                "gre": gre,
                "toefl": toefl,
                "grade": grade,
                "email": email,
                "uid": uid,
                "work_ex": work_ex,
                "lor1": lor1,
                "lor2": lor2,
                "lor3": lor3,
                "lor4": lor4
            }
            data["resume"] = resume.tolist()
            data["sop"] = sop
            universities = ["mit", "neu", "ncsu", "utd", "usc"]

            # Parallel requests
            pool = Pool(len(universities))
            async_result = [
                pool.apply_async(self.send_requests, (
                    data,
                    univ,
                )) for univ in universities
            ]
            pool.close()
            pool.join()
            return_val = sorted([ar.get() for ar in async_result],
                                key=itemgetter('score'),
                                reverse=True)
            print(return_val)
            resp = {}
            for d in return_val:
                univ = d['univ']
                nd = copy.deepcopy(data)
                nd['score'] = d['score']
                resp[univ] = nd

            # rets = [self.send_requests(data,x,) for x in universities]
            # resp = {}
            # for i,u in enumerate(universities):
            # resp[u] = rets[i]
            return resp
        except Exception as e:
            print(e)
            return {}
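
The post() handler above fans one request out to several backends with apply_async and then ranks the answers by score. A standalone sketch of that pattern, with score_univ standing in for the send_requests method that is not shown here:

from multiprocessing.pool import ThreadPool
from operator import itemgetter


def score_univ(data, univ):
    # Placeholder scoring; the real backend call is not part of this excerpt.
    return {'univ': univ, 'score': len(univ) + data.get('gre', 0)}


if __name__ == '__main__':
    data = {'gre': 320}
    universities = ['mit', 'neu', 'ncsu', 'utd', 'usc']
    pool = ThreadPool(len(universities))
    async_results = [pool.apply_async(score_univ, (data, u)) for u in universities]
    pool.close()
    pool.join()
    ranked = sorted((r.get() for r in async_results), key=itemgetter('score'), reverse=True)
    print(ranked)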