Example #1
0
File: utils.py Project: asuhag/HINT
def measure_local_accuracy(model, number_of_core_samples, step_size, name, output_path):
  """
  Computes the mixed derivative for each sample, using finite differences mathod
  
  :param model: The imported model module
  :param data: The sampled data in structured form
  :param step_size: The dx time step taken between each 
  :returns: hessian matrix, with the core sample index as rows and feature pair as column name
  """
  feature_vectors = pd.DataFrame(np.load('{}/feature_vectors_{}_{}_{}.npy'.format(output_path, number_of_core_samples, step_size, name)), index = np.arange(number_of_core_samples), columns=pd.MultiIndex.from_product([model.perturbation_status_columns, model.feature_names], names=['perturbation_status','features']))
  outputs = pd.DataFrame(np.load('{}/outputs_{}_{}_{}.npy'.format(output_path, number_of_core_samples, step_size, name)), index = np.arange(number_of_core_samples), columns=pd.MultiIndex.from_product([model.output_names, model.perturbation_status_columns_output], names=['outputs','perturbation_status']))
  hessian = calculate_hessian(model, outputs, step_size)
  (centers, magnitudes, dimensions) = model.get_local_ground_truth(output_path,number_of_core_samples, step_size, name)

  core_feature_vectors = feature_vectors.loc[:, 'core']
  
  output_name = model.output_names[0]
  interaction_maps = list(futures.map(functools.partial(create_interaction_map, model, hessian, core_feature_vectors, output_name, 'nearest'), model.feature_pairs))
  local_ranking = list(futures.map(functools.partial(rank_samples_in_pair, model, centers, magnitudes, dimensions), zip(interaction_maps, model.feature_pairs)))
  ranking = np.concatenate(np.array(local_ranking), axis=1)
  accuracies = average_precision_score(ranking[1,:], np.abs(ranking[0,:]))
  ROCs = np.array(precision_recall_curve(ranking[1,:], np.abs(ranking[0,:])))

  pickle.dump(obj = accuracies, file = open('{}/local_accuracies_{}_{}_{}.pickle'.format(output_path,number_of_core_samples, step_size, name),'wb'))
  pickle.dump(obj = ROCs, file = open('{}/local_ROCs_{}_{}_{}.pickle'.format(output_path,number_of_core_samples, step_size, name),'wb'))
  return accuracies
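calculate_hessian is not shown in this excerpt; assuming it applies the usual four-point central-difference stencil to the perturbed outputs loaded above, the mixed derivative the docstring refers to looks roughly like this sketch (the function and names here are illustrative, not the project's actual code):

# Illustrative sketch only: the standard central-difference estimate of a mixed
# partial derivative. `f` and `h` are assumptions, not objects from the HINT project.
def mixed_partial(f, x, y, h):
    """Approximate d^2 f / (dx dy) with a four-point central-difference stencil."""
    return (f(x + h, y + h) - f(x + h, y - h)
            - f(x - h, y + h) + f(x - h, y - h)) / (4.0 * h * h)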
def big_cluster_completeness(inf3, grab, inkey, cluster, cluster_map, tanimoto,
                             c_list, j):
    mx = pd.read_csv(
        inf3, sep=',', header=0, usecols=grab, engine='c'
    )  # loads in only the columns from the grab list, i.e. all cols for a unique cluster
    mx.index = inkey  # reindexes the df with the orf labels after importing specific columns with usecols
    # how many orfs in the full cluster
    j_orfs = len(cluster_map[cluster])
    args_list = [
        mx, j_orfs, cluster_map
    ]  # organizes all the arguments that the parallelized function needs into a list
    if __name__ == '__main__':
        if tanimoto:
            results = list(
                futures.map(partial(parallel_tanimoto, args_list=args_list),
                            c_list))
        else:
            results = list(
                futures.map(partial(parallel_minicluster, args_list=args_list),
                            c_list))
        bigmat = pd.concat(
            results, axis=0
        )  # stack all the results into a single column in a dataframe
        # print(bigmat.shape[0])
        bigmat.index = c_list  # now the index is just the clusters, not the orfs
    # DEBUG - will print the progress every 5 clusters (across columns--the slower dimension).
    if j % 5:
        pass
    elif j == 0:
        print('Processed first cluster... moving on!')
    else:
        print('Processed %d clusters' % j)
    del mx
    return bigmat
def main():
	parser = make_arg_parser()
	args = parser.parse_args()
	# Parse command line
	tanimoto = args.tanimoto
	with open(args.mpfa, 'r') as inf:
		# Generates dictionary with each unique 'refseq_cluster' as keys, ORFs as values
		cluster_map = build_cluster_map(inf, bread=args.bread)
	with open(args.input, 'r') as inf2:
		inkey = generate_index_list(inf2)
		print('\nOk, processing input file...\n')
	with open(args.input, 'r') as in_csv2:
		headers = generate_chunk_list(in_csv2)
	c_list = list(cluster_map.keys())
	grabbed_clusters = []
	data_to_pool = []
	# print(c_list)
	for cluster in c_list:
		grab = pick_a_cluster(headers, cluster)  # uses the name of the cluster to get a list of all orfs for a particular unique cluster
		# print(grab)
		if not grab:
			pass
		else:
			# print(grab)
			grabbed_clusters.extend([cluster])
			with open(args.input, 'r') as inf3:
				mx = pd.read_csv(inf3, sep=',', header=0, usecols=grab, engine='c')  # loads in only the columns from the grab list, i.e. all cols for a unique cluster
			mx.index = inkey  # reindexes the df with the orf labels after importing specific columns with usecols
			data_to_pool.append(mx)
	dlen = len(data_to_pool)
	print('Built the data list of %s clusters' % dlen)
	args_list = [cluster_map, c_list]  # organizes all the arguments that the parallelized function needs into a list
	print('\nSending data to Workers... work, Workers, work!\n')
	if args.tanimoto:
		if __name__ == '__main__':
			results = list(futures.map(partial(parallel_tanimoto, args_list=args_list), data_to_pool))
			outdf = pd.concat(results, axis=1)
	if not args.tanimoto:
		if __name__ == '__main__':
			results = list(futures.map(partial(parallel_minicluster, args_list=args_list), data_to_pool))
			outdf = pd.concat(results, axis=1)
		# bigmat = pd.concat(results, axis=0)  # stack all the results into a single column in a dataframe
		# print(bigmat.shape[0])
		# bigmat.index = c_list  # now the index is just the clusters, not the orfs
		# print(bigmat)
	print('File processing complete; writing output file...\n')
	del data_to_pool
	with open(args.output, 'w') if args.output != '-' else sys.stdout as outf:
		# outdf = pd.concat(results, axis=1)
		outdf.columns = grabbed_clusters  # names the columns (and index, next line) according to clusters in the order they were processed
		outdf.index = c_list
		outdf.sort_index(axis=0, inplace=True)
		outdf.sort_index(axis=1, inplace=True)
		outdf = outdf.round(decimals=3)
		outdf.to_csv(outf)
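The pattern used throughout this script is worth isolating: SCOOP needs the mapped callable to live at module level so workers can import it, and the futures.map call itself to run only from the main module, hence the if __name__ == '__main__': guards above. A minimal, self-contained sketch of that pattern, with a hypothetical worker and data:

# Minimal sketch of the futures.map + functools.partial pattern, assuming the
# script is launched with `python -m scoop`. `scale` is a hypothetical worker.
from functools import partial
from scoop import futures

def scale(value, factor):
    return value * factor

if __name__ == '__main__':
    worker = partial(scale, factor=2.0)          # fix the shared argument
    print(list(futures.map(worker, [1, 2, 3])))  # -> [2.0, 4.0, 6.0]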
Example #4
0
    def set_params(self, **params):
        K.clear_session()
        map(clear_session, [10] * 10)
        self.check_data_params()
        data_params = self.data_params
        data_params['lags'] = params['input_size']
        self.set_data_params(**data_params)
        params['num_inputs'] = len(self.data_params['vars'][0])
        params['num_outputs'] = len(self.data_params['vars'][1])
        # params['input_size'] = self.data_params['lags']

        super(ForecastRegressor, self).set_params(**params)
Example #5
0
def run_coarse_grained_ga(population_size, deme_size, chromosome_size,
                          number_of_generations, neighbourhood_size,
                          server_ip_addr, server_user, server_password,
                          num_of_migrants, fitness):
    ins = CoarseGrainedBase(population_size=population_size,
                            deme_size=deme_size,
                            chromosome_size=chromosome_size,
                            number_of_generations=number_of_generations,
                            neighbourhood_size=neighbourhood_size,
                            server_ip_addr=server_ip_addr,
                            server_user=server_user,
                            server_password=server_password,
                            num_of_migrants=num_of_migrants,
                            fitness=fitness)

    populations = ins.initialize_population(deme_size)
    print(str(populations))
    channels = ins.initialize_topology()
    results = list(futures.map(ins, populations, channels))
    dct = {}
    for data in results:
        best_chromosome = data.pop(0)
        fitness_val = best_chromosome.fit
        vector = best_chromosome.chromosome
        dct[fitness_val] = vector
    logger.info("END RESULT" + str(sorted(dct.items()).pop()))
Example #6
0
    def _send_individuals_reproduce(self):
        """
        Select individuals for reproduction with probability
        based on fitness value. Weak individuals are removed
        and replaced with newly generated ones.
        """

        # retrieve best fitness of population
        results = list(futures.map(self._fitness, self._population))
        neighbours = self._Individuals()
        for i in range(0, self._population_size):
            fit_val = results.pop(0)
            chromosome = self._population[i]
            neighbours.append_object(self._Individual(fit_val, chromosome))

        chosen_individuals = self._choose_individuals_based_on_fitness(
            neighbours)
        chromosomes_reproducing = chosen_individuals.sort_objects()
        best_individual = chosen_individuals.best_individual

        # it is sure that this is the right result
        # but the algorithm needs to continue because of other demes
        if best_individual is not None:
            while len(self._population) <= self._population_size:
                self._population.append(best_individual.chromosome)
            return

        best_individual = chromosomes_reproducing.pop(0)
        # remove old population
        del self._population[:]
        logger.info("Number of individuals chosen for reproduction is " +
                    str(len(chromosomes_reproducing))+ " while best individuals has fitness "+
                    str(best_individual.fit))
        # Reproducing requires two individuals.
        # If number of selected individuals is even
        # put the best individual to the new population.
        # Otherwise, put him to individuals dedicated
        # for reproduction
        if len(chromosomes_reproducing) % 2 == 0:
            self._population.append(best_individual.chromosome)
        else:
            # put the best individual to max index in order to not rewrite existing
            chromosomes_reproducing.append(best_individual)
        # randomly choose pairs for crossover
        # then mutate new individuals and put them to new population
        while len(chromosomes_reproducing) >= 2:
            father = chromosomes_reproducing.pop(random.randrange(len(
                chromosomes_reproducing))).chromosome
            mother = chromosomes_reproducing.pop(random.randrange(len(
                chromosomes_reproducing))).chromosome
            self._crossover(father, mother)
            # mutate
            self._mutation(father)
            self._mutation(mother)
            self._population.append(father)
            self._population.append(mother)

        # Generate new individuals in order to make new population the same size
        while len(self._population) != self._population_size:
            self._population.append(self._gen_individual())
def multiple_runs_mean(nb_runs):
    generations = None
    all_fit_mins, all_fit_avg, all_duration_mins, all_duration_maxs = [], [], [], []

    # The next two lines are for sequential runs, comment them out when using parallel runs
    # for i in range(1, nb_runs + 1):
    #     gen, fit_mins, fit_avg, duration_mins, duration_maxs = single_run(i)
    # The next two lines are for parallel runs, comment them out when using sequential runs
    runs_results = futures.map(single_run, range(1, nb_runs + 1))
    for gen, fit_mins, fit_avg, duration_mins, duration_maxs in runs_results:
        if generations is None:
            generations = gen
        all_fit_mins.append(fit_mins)
        all_fit_avg.append(fit_avg)
        all_duration_mins.append(duration_mins)
        all_duration_maxs.append(duration_maxs)

    def mean_values(all_values):
        return [sum(x) / nb_runs for x in zip(*all_values)]

    mean_fit_mins = mean_values(all_fit_mins)
    mean_fit_avg = mean_values(all_fit_avg)
    mean_duration_mins = mean_values(all_duration_mins)
    mean_duration_maxs = mean_values(all_duration_maxs)

    return nb_runs, generations, mean_fit_mins, mean_fit_avg, mean_duration_mins, mean_duration_maxs
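The commented-out block above switches between a serial loop and the SCOOP map by hand; one alternative (a sketch, not the author's code) is to pick the mapper once at import time and keep the loop body identical:

# Sketch of an import-time fallback: use SCOOP's map when available, otherwise
# the built-in serial map, so multiple_runs_mean needs no commented-out lines.
try:
    from scoop import futures
    run_map = futures.map
except ImportError:
    run_map = map

# runs_results = run_map(single_run, range(1, nb_runs + 1))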
Example #8
0
 def aimFunc(self, pop):  # objective function
     Vars = pop.Phen  # get the decision-variable matrix
     args = list(
         zip(list(range(pop.sizes)), [Vars] * pop.sizes,
             [self.data] * pop.sizes, [self.dataTarget] * pop.sizes))
     pop.ObjV = np.array(list(futures.map(
         subAimFunc, args)))  # use SCOOP's futures.map for distributed evaluation and build the objective-value matrix ObjV for every individual in the population
Example #9
0
def find_optimum(data_l, FEATS, method, n_neighbors=8):
    from functools import partial
    rmse_l = []
    r2_l = []
    res_l = []
    for _i in range(1, len(FEATS) + 1):
        sel_feat = FEATS[:_i]
        gexp = data_l[-1].loc[:, sel_feat]
        DATA_l = [data_l[0], data_l[1], gexp]
        sm = partial(select_model,
                     data_l=DATA_l,
                     FEATS=sel_feat,
                     method=method,
                     n_neighbors=n_neighbors)
        df_l = list(futures.map(sm, range(20)))
        Act = df_l[0].Actual
        meanPred = np.mean(np.vstack(
            [df.Predicted.loc[Act.index].values for df in df_l]),
                           axis=0)
        Pred = pa.Series(meanPred, df_l[0].Actual.index)
        rmse, r2, __, __ = do_ols(Act, Pred)
        r2_l.append(r2)
        rmse_l.append(rmse)
        res_l.append(pa.DataFrame({'Actual': Act, 'Predicted': Pred}))
    ii = np.argmax(r2_l)
    print method, ii, r2_l[ii]
    return r2_l, rmse_l, res_l
Example #10
0
 def evaluate_parallel(invalid_pops):
     """Evaluate model by SCOOP or map, and set fitness of individuals
      according to calibration step."""
     popnum = len(invalid_pops)
     labels = list()
     try:  # parallel on multi-processors or clusters using SCOOP
         from scoop import futures
         invalid_pops = list(futures.map(toolbox.evaluate, [cali_obj] * popnum, invalid_pops))
     except (ImportError, ImportWarning):  # fall back to Python's built-in map (serial)
         invalid_pops = list(toolbox.map(toolbox.evaluate, [cali_obj] * popnum, invalid_pops))
     for tmpind in invalid_pops:
         if step == 'Q':  # Step 1 Calibrating discharge
             tmpind.fitness.values, labels = tmpind.cali.efficiency_values('Q', object_names)
         elif step == 'SED':  # Step 2 Calibrating sediment
             sedobjvs, labels = tmpind.cali.efficiency_values('SED', object_names)
             qobjvs, qobjlabels = tmpind.cali.efficiency_values('Q', object_names)
             labels += [qobjlabels[0]]
             sedobjvs += [qobjvs[0]]
             tmpind.fitness.values = sedobjvs[:]
         elif step == 'NUTRIENT':  # Step 3 Calibrating NUTRIENT,TN,TP
             tnobjvs, tnobjlabels = tmpind.cali.efficiency_values('CH_TN', object_names)
             tpobjvs, tpobjlabels = tmpind.cali.efficiency_values('CH_TP', object_names)
             qobjvs, qobjlabels = tmpind.cali.efficiency_values('Q', object_names)
             sedobjvs, sedobjlabels = tmpind.cali.efficiency_values('SED', object_names)
             objvs = [tnobjvs[0], tpobjvs[0], qobjvs[0], sedobjvs[0]]
             labels = [tnobjlabels[0], tpobjlabels[0], qobjlabels[0], sedobjlabels[0]]
             tmpind.fitness.values = objvs[:]
     # NSE > 0 is the preliminary condition to be a valid solution!
     if filter_NSE:
         invalid_pops = [tmpind for tmpind in invalid_pops if tmpind.fitness.values[0] > 0]
         if len(invalid_pops) < 2:
             print('The initial population size should be greater than or equal to 2. '
                   'Please check the parameter ranges or change the sampling strategy!')
             exit(0)
     return invalid_pops, labels  # Currently, `invalid_pops` contains evaluated individuals
Example #11
0
def recursiveFunc(level):
    if level == 0:
        return 1
    else:
        args = [level-1] * 2
        s = sum(futures.map(recursiveFunc, args))
        return s
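Each call at level > 0 maps itself over two copies of level - 1, so the recursion fans out into 2**level leaf futures and the return value is exactly 2**level; a quick serial check of that identity, no SCOOP launcher needed:

# Serial equivalent of recursiveFunc above, used only to check the expected value.
def recursive_serial(level):
    return 1 if level == 0 else sum(map(recursive_serial, [level - 1] * 2))

assert recursive_serial(5) == 2 ** 5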
Example #12
0
    def evaluate_losses(self, num_runs):
        variables = self.variables
        pred_keys = ['train pred', 'val pred', 'test pred']
        fcast_keys = ['train fcast', 'val fcast', 'test fcast']
        sets = ['train', 'val', 'test']

        result = OrderedDict()

        for pred_key, fcast_key, set in zip(pred_keys, fcast_keys, sets):
            pred_fcast = list(map(self._evaluate_losses, [set] * num_runs))
            pred_res = [i[0] for i in pred_fcast]
            fcast_res = [i[1] for i in pred_fcast]

            # pred_res = list(map(self.evaluate_prediction, [set] * num_runs, [True] * num_runs))
            result[pred_key] = np.mean(np.squeeze(pred_res), axis=0)

            # fcast_res = list(map(self.evaluate_forecast, [set] * num_runs, [True] * num_runs))
            result[fcast_key] = np.mean(np.squeeze(fcast_res), axis=0)

            # K.clear_session()

        if self.is_multioutput:
            result = pd.DataFrame(result, index=['total'] + variables)
        else:
            result = pd.DataFrame(result, index=variables)

        return result
def best_feat(L, ii, name='', p=0.1):
    print "Finding best feature."
    eff_d = {}
    lstsq_d = {}
    dummy = 100
    SC_GROWTH = np.log(2) / 1.5
    DS_BINS = np.linspace(0.98, 1.02, 5) * SC_GROWTH
    for feat, df, bins, eff in L:
        if 'noise' in name:
            col = 'Predicted'
        elif name.startswith('slavov-holstege'):
            col = 'Downsampled'
        else:
            col = 'Noiseless'
        norm = partial(compare_columns, df=df, dnsamp_bins=DS_BINS, column=col)
        D_l = list(futures.map(norm, range(dummy)))
        Dmu = np.mean(D_l)
        #score = D+eff*p
        eff_d[feat] = eff
        lstsq_d[feat] = Dmu
    eff_ser = pa.Series(eff_d)
    eff_ser.to_pickle('eff_%d_%s.pkl' % (ii + 1, name))
    lstsq_ser = pa.Series(lstsq_d)
    lstsq_ser.to_pickle('lstsq_%d_%s.pkl' % (ii + 1, name))
    score_ser = lstsq_ser + p * eff_ser
    sel = score_ser.idxmin()
    print "The minimum feature at", ii + 1, "features is:", sel, lstsq_ser.loc[
        sel], lstsq_ser.idxmin(), eff_ser.loc[sel]
    ll = filter(lambda l: l[0] == sel, L)
    return ll[0]  #L[sel]
Example #14
0
def main(number):

    random.seed(4)
    N_ISLES = number
    FREQ = 5
    pob = int(500 / number)
    islands = [toolbox.population(n=pob) for i in range(N_ISLES)]

    toolbox.unregister("indices")
    toolbox.unregister("individual")
    toolbox.unregister("population")

    toolbox.register("alg_scoop",
                     algorithms.eaSimple,
                     toolbox=toolbox,
                     cxpb=0.8,
                     mutpb=0.2,
                     ngen=5,
                     verbose=False)

    start_time = time.time()
    for i in range(0, 400, FREQ):
        results = futures.map(toolbox.alg_scoop, islands)
        islands = [pop for pop, logbook in results]
        tools.migRing(islands, 15, tools.selBest)

    print("--- %s seconds ---" % (time.time() - start_time))
    return "finished"
Example #15
0
def maxTreeDepthDivide(rootValue, currentDepth=0, parallelLevel=2):
    """Finds a tree node that represents rootValue and computes the max depth
       of this tree branch.
       This function will emit new futures until currentDepth=parallelLevel"""
    thisRoot = shared.getConst('myTree').search(rootValue)
    if currentDepth >= parallelLevel:
        return thisRoot.maxDepth(currentDepth)
    else:
        # Base case
        if not any([thisRoot.left, thisRoot.right]):
            return currentDepth
        if not all([thisRoot.left, thisRoot.right]):
            return thisRoot.maxDepth(currentDepth)

        # Parallel recursion
        return max(
            futures.map(
                maxTreeDepthDivide,
                [
                    thisRoot.left.payload,
                    thisRoot.right.payload,
                ],
                cycle([currentDepth + 1]),
                cycle([parallelLevel]),
            )
        )
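shared.getConst('myTree') only returns something useful if the root process published the tree before mapping; a sketch of the setup this example presumes (the tree class and its constructor are not shown here, so build_tree is hypothetical):

# Sketch of the presumed driver code: publish the tree once with
# scoop.shared.setConst, then start the parallel depth computation.
from scoop import shared

if __name__ == '__main__':
    tree = build_tree()              # hypothetical constructor, not in this excerpt
    shared.setConst(myTree=tree)     # workers retrieve it via shared.getConst('myTree')
    print(maxTreeDepthDivide(tree.payload))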
Example #16
0
    def evaluate_parallel(invalid_pops):
        """Evaluate model by SCOOP or map, and get fitness of individuals."""
        popnum = len(invalid_pops)
        try:
            # parallel on multi-processors or clusters using SCOOP
            from scoop import futures
            invalid_pops = list(
                futures.map(toolbox.evaluate, [sceobj.cfg] * popnum,
                            invalid_pops))
        except (ImportError, ImportWarning):
            # serial
            invalid_pops = list(
                toolbox.map(toolbox.evaluate, [sceobj.cfg] * popnum,
                            invalid_pops))

        # Filter for a valid solution
        if filter_ind:
            invalid_pops = [
                tmpind for tmpind in invalid_pops
                if check_validation(tmpind.fitness.values)
            ]
            if len(invalid_pops) < 2:
                print(
                    'The initial population size should be greater than or equal to 2. '
                    'Please check the parameter ranges or change the sampling strategy!'
                )
                exit(2)
        return invalid_pops  # Currently, `invalid_pops` contains evaluated individuals
Example #17
0
def main_pso():
    pop = toolbox.population(n=1000)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", numpy.mean)
    stats.register("std", numpy.std)
    stats.register("min", numpy.min)
    stats.register("max", numpy.max)

    logbook = tools.Logbook()
    logbook.header = ["gen", "evals"] + stats.fields

    GEN = 10
    best = None

    for g in range(GEN):
        fitnesses = list(futures.map(toolbox.evaluate, pop))
        for ind, fit in zip(pop, fitnesses):
            ind.fitness.values = fit
        for part in pop:
            if not part.best or part.best.fitness < part.fitness:
                part.best = creator.Particle(part)
                part.best.fitness.values = part.fitness.values
            if not best or best.fitness < part.fitness:
                best = creator.Particle(part)
                best.fitness.values = part.fitness.values
        for part in pop:
            toolbox.update(part, best)
        #Gather all the fitnesses in one list and print the stats
        logbook.record(gen=g, evals=len(pop), **stats.compile(pop))
        print("generation: %i" % g)
        #print(logbook.stream)


#
    return pop, logbook, best
Example #18
0
File: utils.py Project: asuhag/HINT
def rank_global(model, number_of_core_samples, step_size, name, output_path, top_k_to_plot):
  """
  Computes the mixed derivative for each sample, using finite differences mathod
  
  :param model: The imported model module
  :param data: The sampled data in structured form
  :param step_size: The dx time step taken between each 
  :returns: hessian matrix, with the core sample index as rows and feature pair as column name
  """
  outputs = pd.DataFrame(np.load('{}/outputs_{}_{}_{}.npy'.format(output_path, number_of_core_samples, step_size, name)), index = np.arange(number_of_core_samples), columns=pd.MultiIndex.from_product([model.output_names, model.perturbation_status_columns], names=['outputs','perturbation_status']))
  outputs = normalize_outputs(model, outputs)
  hessian = calculate_hessian(model, outputs, step_size)
  hessian = denoise_hessian(hessian)
  ranked_hessian = hessian.abs().mean(axis=0)
  ranking = []
  for output_name in model.output_names:
    sorted_pairs = ranked_hessian.loc[output_name].loc[model.normalization_feature_pairs].sort_values()[::-1]
    ranking.append((output_name, list(sorted_pairs.index), sorted_pairs.values))
  if top_k_to_plot:
    feature_vectors = pd.DataFrame(np.load('{}/feature_vectors_{}_{}_{}.npy'.format(output_path, number_of_core_samples, step_size, name)), index = np.arange(number_of_core_samples), columns=pd.MultiIndex.from_product([model.perturbation_status_columns, model.feature_names], names=['perturbation_status','features']))
    core_feature_vectors = feature_vectors.loc[:, 'core'].copy()
    core_feature_vectors = normalize_inputs(model, core_feature_vectors)
    interaction_maps = list(futures.map(functools.partial(create_interaction_map, model, hessian, core_feature_vectors, output_name, 'linear'), model.feature_pairs))
    ranked_feature_pairs = np.array(ranking)[:, 1][0][:top_k_to_plot]
    for pair_name in ranked_feature_pairs:
      ind = model.feature_pairs.index(pair_name)
      first_variable, second_variable = model.feature_pairs[ind].split(' and ')
      most_nonlinear_sample = hessian[output_name][model.feature_pairs[ind]].abs().idxmax()
      y_coord = 100 * (feature_vectors.loc[most_nonlinear_sample, 'core'][first_variable] - model.feature_limits[first_variable][0]) / (model.feature_limits[first_variable][1] - model.feature_limits[first_variable][0])
      x_coord = 100 * (feature_vectors.loc[most_nonlinear_sample, 'core'][second_variable] - model.feature_limits[second_variable][0]) / (model.feature_limits[second_variable][1] - model.feature_limits[second_variable][0])
      plot_interaction_map(model, name, interaction_maps[ind], output_name, first_variable, second_variable, x_coord, y_coord, output_path)

  pickle.dump(obj = ranking, file = open('{}/global_ranking_{}_{}_{}.pickle'.format(output_path,number_of_core_samples, step_size, name),'wb'))
  return ranking
Example #19
0
def maxTreeDepthDivide(rootValue, currentDepth=0, parallelLevel=2):
    """Finds a tree node that represents rootValue and computes the max depth
       of this tree branch.
       This function will emit new futures until currentDepth=parallelLevel"""
    thisRoot = shared.getConst('myTree').search(rootValue)
    if currentDepth >= parallelLevel:
        return thisRoot.maxDepth(currentDepth)
    else:
        # Base case
        if not any([thisRoot.left, thisRoot.right]):
            return currentDepth
        if not all([thisRoot.left, thisRoot.right]):
            return thisRoot.maxDepth(currentDepth)

        # Parallel recursion
        return max(
            futures.map(
                maxTreeDepthDivide,
                [
                    thisRoot.left.payload,
                    thisRoot.right.payload,
                ],
                cycle([currentDepth + 1]),
                cycle([parallelLevel]),
            ))
Example #20
0
def run_fine_grained_ga(population_size,
                        chromosome_size,
                        number_of_generations,
                        neighbourhood_size,
                        server_ip_addr,
                        fitness,
                        server_user,
                        server_password,
                        mate_best_neighbouring_individual=True):
    ins = FineGrainedBase(
        population_size=population_size,
        chromosome_size=chromosome_size,
        number_of_generations=number_of_generations,
        neighbourhood_size=neighbourhood_size,
        server_ip_addr=server_ip_addr,
        fitness=fitness,
        mate_best_neighbouring_individual=mate_best_neighbouring_individual,
        server_user=server_user,
        server_password=server_password)
    populations = ins.initialize_population(None)
    channels = ins.initialize_topology()
    result = list(futures.map(ins, populations, channels))
    dct = {}

    while len(result):
        fitness_val, vector = result.pop(0)
        dct[fitness_val] = vector
    logger.info("END RESULT " + str(sorted(dct.items()).pop()))
Example #21
0
def funcLambdaSubfuncNotGlobal(n):
    """Tests a lambda function containing a call to a function that is not in
    the globals()."""
    my_mul = operator.mul
    lambda_func = lambda x : my_mul(x, x)
    result = list(futures.map(lambda_func, [i+1 for i in range(n)]))
    return sum(result)
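The lambda squares each of 1..n, so the function returns the sum of the first n squares; a serial equivalent makes the expected value explicit:

# Serial equivalent of funcLambdaSubfuncNotGlobal, for reference: the result is
# the sum of the first n squares, i.e. n*(n+1)*(2n+1)//6.
def sum_of_squares(n):
    return sum(i * i for i in range(1, n + 1))

assert sum_of_squares(4) == 30 == 4 * 5 * 9 // 6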
def sel_pairs_predictions(feat_l,
                          bins_l,
                          kind='feat',
                          recalc=False,
                          err=.1,
                          useNoise=True,
                          noiseFolds=4,
                          ext='',
                          filt_thr=1e-6,
                          testAll=True,
                          n_neighbors=7):
    l = evecs.columns.tolist()  #pa.read_pickle('sacCer_%s_l.pkl' % kind)
    L = list(set(l) - set(feat_l))
    if not testAll:
        L = filter(lambda x: float(x) > filt_thr, L)
    test_genes = sorted(L, key=lambda l: float(l), reverse=True)
    LOL = [
        feat_l + [g]
        for g in sorted(test_genes, key=lambda x: float(x), reverse=True)
    ]
    NN = True if kind.startswith('feat') else False
    p_cvp = partial(cv_mc,
                    corr=NN,
                    all_bins=bins_l,
                    recalc=recalc,
                    gn_err=err,
                    useNoise=True,
                    noiseFolds=4,
                    ext=ext,
                    n_neighbors=n_neighbors)
    L = list(futures.map(p_cvp, LOL))
    return L
Example #23
0
def write_patient_data(pdata):

    output_dir = os.path.join(
        utils.output_directory, 'p{0}{1}'.format(pdata['patient_id'] + 1,
                                                 pdata['dtype']))

    create_dir_if_not_exist(output_dir)
    raw_data = list(futures.map(create_spectrogram_images, pdata['mat_file']))

    raw_spectrograms = []
    std_spectrograms = []

    for el in raw_data:
        raw_spectrograms.append(el[:, 0, :, :])
        std_spectrograms.append(el[:, 1, :, :])

    print np.array(raw_spectrograms).shape
    mat_file_names = map(basename, pdata['mat_file'])
    input_data = {
        'raw_spectrograms': np.array(raw_spectrograms),
        'std_spectrograms': np.array(std_spectrograms),
        'file_name': np.array(mat_file_names),
        'segment': pdata['segment']
    }

    if pdata['dtype'] == 'train':
        input_data['target'] = pdata['target']

    np.save(os.path.join(output_dir, 'data.npy'), input_data)
Example #24
0
def funcLambdaSubfuncNotGlobal(n):
    """Tests a lambda function containing a call to a function that is not in
    the globals()."""
    my_mul = operator.mul
    lambda_func = lambda x : my_mul(x, x)
    result = list(futures.map(lambda_func, [i+1 for i in range(n)]))
    return sum(result)
Example #25
0
def average_results(lookahead, N):
    results = futures.map(do_individual_experiment,
                          [(lookahead, seed) for seed in range(N)])
    results = list(results)
    avg_steps = sum(steps for steps, _ in results) / N
    avg_elapsed = sum(elapsed for _, elapsed in results) / N
    return avg_steps, avg_elapsed
Example #26
0
    def do_noise_simulation(self,
                            variable_name,
                            range_,
                            steps,
                            sigma,
                            set_variables=list()):
        v = range_[0] + (np.arange(
            0, steps, 1)) / (steps - 1.0) * (range_[1] - range_[0])
        self.duration = 100.0
        self.doPreRun = False
        fqs = list()
        fl_phase_durs = list()
        ex_phase_durs = list()
        phases = list()
        gaits = list()
        paras = futures.map(
            functools.partial(self.do_noise_iteration,
                              variable_name=variable_name,
                              sigma=sigma,
                              set_variables=set_variables), v)
        for i, (fq, fl, ex, ph, g) in enumerate(paras):
            fqs.append(fq)
            fl_phase_durs.append(fl)
            ex_phase_durs.append(ex)
            phases.append(ph)
            gaits.append(g)

        return (v, fqs, fl_phase_durs, ex_phase_durs, phases, gaits)
Example #27
0
 def evalVars(self, Vars):  # objective function
     N = Vars.shape[0]
     args = list(
         zip(list(range(N)), [Vars] * N, [self.data] * N,
             [self.dataTarget] * N))
     ObjV = np.array(list(futures.map(
         subEvalVars, args)))  # use SCOOP's futures.map for distributed evaluation and build the objective-value matrix ObjV
     return ObjV
Example #28
0
def calcPi(workers, tries):
    bt = time()
    expr = futures.map(test, [tries] * workers)
    piValue = 4. * sum(expr) / float(workers * tries)
    totalTime = time() - bt
    print("pi = " + str(piValue))
    print("total time: " + str(totalTime))
    return piValue
Example #29
0
def main2(n):
  # This call results in a generator function
  result = futures.map(func4, [i+1 for i in range(n)])
  print result
  # The results are evaluated here when they are accessed.
  d = sum(result)
  print d
  return d
Example #30
0
def calcPi(workers, tries):
    bt = time()
    expr = futures.map(test, [tries] * workers)
    piValue = 4. * sum(expr) / float(workers * tries)
    totalTime = time() - bt
    print("pi = " + str(piValue))
    print("total time: " + str(totalTime))
    return (piValue, totalTime)
Example #31
0
File: utils.py Project: asuhag/HINT
def get_max_filters(matrix, num_filters = 100, threshold = 3):
  matrix_size = matrix.shape[0]
  filter_sizes = np.linspace(5, matrix_size, num_filters).astype(int)
  filter_results = list(futures.map(functools.partial(max_filter_activation, np.abs(matrix)), filter_sizes))
  if len(np.where(np.array(filter_results) >= threshold)[0]) == 0:
    return -1
  else:
    return np.where(np.array(filter_results) < threshold)[0][0] - 1
Example #32
0
def func1(n):
    try:
        # The map alone doesn't throw the exception. The exception is raised
        # in the sum which calls the map generator.
        result = sum(futures.map(func2, [i+1 for i in range(n)]))
    except Exception as err:
        # We could do some stuff here
        raise Exception("This exception is normal")
    return result
Example #33
0
def main():
    # Create object instances
    myInstances = [myClass() for _ in range(20)]
    # Modify them parallely
    myAnswers = list(futures.map(modifyClass, myInstances))

    # Each result is a new object with the modifications applied
    print(myAnswers)
    print([a.myVar for a in myAnswers])
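myClass and modifyClass are defined outside this snippet; the behaviour the comments describe, namely that each worker mutates its own copy and the modified object comes back through the map result, holds for any picklable class, e.g. this assumed-compatible sketch:

# Hedged sketch of definitions compatible with the example above; only the myVar
# attribute is implied by the print statements, everything else is an assumption.
class myClass(object):
    def __init__(self):
        self.myVar = 0

def modifyClass(obj):
    obj.myVar = 42   # mutates the worker-side copy of the instance
    return obj       # the modified copy is returned through futures.map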
Example #34
0
def updateParticle(part, best, phi1, phi2):
    u1 = (random.uniform(0, phi1) for _ in range(len(part)))
    u2 = (random.uniform(0, phi2) for _ in range(len(part)))
    v_u1 = futures.map(operator.mul, u1, map(operator.sub, part.best, part))
    v_u2 = futures.map(operator.mul, u2, map(operator.sub, best, part))
    part.speed = list(
        futures.map(operator.add, part.speed, map(operator.add, v_u1, v_u2)))
    for i, speed in enumerate(part.speed):
        if speed < part.smin:
            part.speed[i] = part.smin
        elif speed > part.smax:
            part.speed[i] = part.smax
    part[:] = list(map(operator.add, part, part.speed))
    for (ind, v) in enumerate(part[:]):
        if v < part.pmin:
            part[ind] = part.pmin
        if v > part.pmax:
            part[ind] = part.pmax
Example #35
0
def eval_fun(pop, *args):
    # call each particle in parallel
    bdat = args[1]
    pnames = args[2]
    likes = list(
        futures.map(eval_mod, [indiv for indiv in pop], [pnames] * len(pop),
                    [bdat] * len(pop)))

    return np.array(likes)
Example #36
0
def func3(n):
    result = []
    try:
        result = list(futures.map(func4, [i+1 for i in range(n)]))
    except Exception as e:
        # We return what we can
        return e.args[0] + sum(result)
    # No exception was generated
    return sum(result)
Example #37
0
def func3(n):
    result = []
    try:
        result = list(futures.map(func4, [i + 1 for i in range(n)]))
    except Exception as e:
        # We return what we can
        return e.args[0] + sum(result)
    # No exception was generated
    return sum(result)
Example #38
0
def func1(n):
    try:
        # The map alone doesn't throw the exception. The exception is raised
        # in the sum which calls the map generator.
        result = sum(futures.map(func2, [i + 1 for i in range(n)]))
    except Exception as err:
        # We could do some stuff here
        raise Exception("This exception is normal")
    return result
Example #39
0
    def _predict(self,xte,mode,svmList,labels):
        ypred2 = list( fu.map(self._predict2,
                              svmList,[xte]*len(svmList)) )

        ypred3 = [] # ypred merged from all classifiers
        for i in range(len(xte)):# for each member/sample of the vector xte
            ypred3i = [ypred2[j][i] for j in range(len(svmList))]# of each sample from all classifiers
            ypred3i = self._merge(ypred3i,mode,labels)
            ypred3.append(ypred3i)

        return ypred3
    def get_universe(self):
        # Download historical data for our universe
        key_value_pairs = list(futures.map(self.get_historical, self.symbols))
        #key_value_pairs = map(self.get_historical, self.symbols)

        #key_value_pairs.remove((None, None)) # remove any failed items
        if (None, None) in key_value_pairs:
            key_value_pairs.remove((None, None)) # remove any failed items

        self.hist       = dict(key_value_pairs)
        if None in self.hist:
            del self.hist[None]
Example #41
0
    def fit(self,ixtr,iytr):
        xyTrList = cutil.divideSamples(ixtr,iytr,self._maxTrainingSamplesPerBatch)
        if self._maxNumberOfTrainingBatches != 0:
            xyTrList = xyTrList[0:self._maxNumberOfTrainingBatches]

        self._svmList = list( fu.map(self._fit,
                                     [xytr[0] for xytr in xyTrList],
                                     [xytr[1] for xytr in xyTrList]) )
        assert len(self._svmList)!=0,'empty _svmList in fit()'

        self._labels = self._svmList[0][0].classes_.tolist()
        for svm in self._svmList: assert svm[0].classes_.tolist()==self._labels
Example #42
0
File: GA.py Project: lisabang/iqsar
    def evolveparallel(self):
        from scoop import futures
        toolbox.register("genind", self.mkeind,self.indsize)
        toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.genind)
        toolbox.register("population",tools.initRepeat, list, toolbox.individual, n=self.popsize)

        toolbox.register("evaluate", self.evalr2)
        toolbox.register("mate", tools.cxOnePoint) #Uniform, indpb=0.5)
        toolbox.register("mutate", self.mutaRan)#, indpb=self.mut)
        toolbox.register("select", tools.selBest)
        toolbox.register("map", pool.map)
        population=toolbox.population()
        
        #print population
        fits=toolbox.map(toolbox.evaluate, population)

        for fit, ind in zip(fits,population):
            ind.fitness.values=fit
            #print fit, ind
            #print fit
        #offspring=algorithms.varOr(population, toolbox, lambda_=100, cxpb=.5, mutpb=.05)    
        #print toolbox.map(toolbox.evaluate, offspring)
        
        avgfitnesses=[]
        for gen in range(self.ngen):
            
            offspring=algorithms.varOr(population, toolbox, lambda_=self.popsize, cxpb=self.cx, mutpb=self.mut)   
            #print "offspring",offspring
            fits=futures.map(toolbox.evaluate, offspring)
            for fit, ind in zip(fits, offspring):
                ind.fitness.values=fit
            
#            for ind in offspring:
#                ind.fitness.values=toolbox.evaluate(ind)


            
            population=toolbox.select([k for k,v in itert.groupby(sorted(offspring+population))], k=100)
            popfits = list(futures.map(toolbox.evaluate, population))
            avgfitnesses.append(np.mean(popfits))
Example #43
0
    def predict(self,ixte):
        assert len(self._svmList)!=0,'empty _svmList in predict()'
        xyTeList = cutil.divideSamples(ixte,None,self._maxTestingSamplesPerBatch)
        xTeList = [i[0] for i in xyTeList]; n = len(xTeList)
        ypredList = list( fu.map(self._predict,
                                 xTeList,[self._mode]*n,[self._svmList]*n,[self._labels]*n) )

        ypredMerged = []; yscoreMerged = [];
        for i in ypredList:
            ypredMerged += [j[0] for j in i]
            yscoreMerged += [j[1] for j in i]
        assert len(ypredMerged)==len(ixte),str(len(ypredMerged))+'!='+str(len(ixte))
        return (ypredMerged,yscoreMerged)
Example #44
0
def main(argv):
    if len(argv)!=3:
        print 'USAGE: python devel.py [dataMode] [valMode]'
        return

    dataMode = argv[1]
    valMode = argv[2]

    # load development dataset, containing com-pro connectivity
    connMat,comList,proList = yam.loadComProConnMat(dataMode)
    kernel = yam.loadKernel(dataMode)

    ##
    dataX = []
    dataY = []
    for i,ii in enumerate(comList):
        for j,jj in enumerate(proList):
            dataX.append( (ii,jj) )
            dataY.append( connMat[i][j] )
    nData = len(dataY)

    ##
    nFolds = None
    kfList = None
    if valMode=='loocv':
        nFolds = nData
        kfList = KFold(nData, n_folds=nFolds, shuffle=True)
    elif valMode=='kfcv':
        nFolds = 10
        kfList = StratifiedKFold(dataY, n_folds=nFolds, shuffle=True)
    else:
        assert(False)

    kronrls = KronRLS(connMat,comList,proList,kernel)

    ## prep for parallel
    xTestList = []
    yTestList = []
    for trIdxList, testIdxList in kfList:
        xTest = [dataX[i] for i in testIdxList]
        yTest = [dataY[i] for i in testIdxList]

        xTestList.append(xTest)
        yTestList.append(yTest)

    ##
    yPredList = fu.map(evalPerFold,xTestList,yTestList,[kronrls]*nFolds,
                       [connMat]*nFolds,[comList]*nFolds,[proList]*nFolds,[kernel]*nFolds)
def get_data(path):
    file_counter = 0
    tasks = []
    results = []
    data_collector = dict()
    for file_name in os.listdir(path):
        if file_name.startswith("secure"):
            file_counter += 1
            logfile = open("%s/%s" % (path, file_name))
            tasks.append(logfile.read())
            logfile.close()
    results.append(list(futures.map(parse_log, tasks)))
    for result in results:
        for data in result:
            data_collector = receive_result(data_collector,data)
    return data_collector
Example #46
0
def main(argv):
    assert len(argv)==3
    xprmtDir = cfg.xprmtDir+'/'+argv[1]; print xprmtDir; assert os.path.isdir(xprmtDir)

    nTop = int(argv[2])
    metrics = defaultdict(list)

    #
    X_train = np.genfromtxt(xprmtDir+'/data/X_train.csv', delimiter=',')
    X_test = np.genfromtxt(xprmtDir+'/data/X_test.csv', delimiter=',')
    y_train = np.genfromtxt(xprmtDir+'/data/y_train.csv', delimiter=',')
    y_test = np.genfromtxt(xprmtDir+'/data/y_test.csv', delimiter=',')

    #
    param = dict()
    with open(xprmtDir+'/log2.json') as f:
        param = yaml.load(f)

    hofFilepath = xprmtDir+'/gen-'+str(param['nGen']-1)+'/hofIndividual.csv'

    funcStrList = []
    with open(hofFilepath, 'r') as f:
        funcStrList = f.readlines()
    funcStrList = [f for f in funcStrList if len(f)!=0]

    if nTop > len(funcStrList):
        nTop = len(funcStrList)

    funcStrList = funcStrList[0:nTop] # take only the nTop best func/individual
    funcStrList.append( util.tanimotoStr() )

    funcStrList = [s.rstrip() for s in funcStrList]
    funcStrList = [util.expandFuncStr(s) for s in funcStrList]
    metrics['funcStr'] = funcStrList

    nIndividual = len(funcStrList)
    perfList = fu.map (tuneTrainTest,funcStrList, [X_train]*nIndividual, [y_train]*nIndividual, [X_test]*nIndividual, [y_test]*nIndividual)

    for p in perfList:
        metrics['accuracy'].append( p[0])
        metrics['precision'].append(p[1])
        metrics['recall'].append(p[2])
        metrics['fscore'].append(p[3])
        metrics['support'].append(p[4])

    with open(xprmtDir+"/data/perf_metrics.json", 'wb') as f:
        json.dump(metrics, f, indent=2, sort_keys=True)
Example #47
0
 def test_concurrent_scoop(self):
     # test several restarts
     old_iter = self.ITERATIONS
     for jrun in range(3):
         self.ITERATIONS = 11
         url = get_random_port_url()
         filename = make_temp_dir('locker_test/scoop.txt')
         self.create_file(filename)
         self.start_server(url)
         lock = LockerClient(url)
         lock.start()
         iterator = [(irun, lock, filename) for irun in range(self.ITERATIONS)]
         list(futures.map(the_job, iterator))
         lock.send_done()
         self.check_file(filename)
         self.lock_process.join()
         # errwrite(str(irun))
     self.ITERATIONS = old_iter
def selectSVD(atlas, minDiv=1e-2):
    atlas.sort(lambda x, y: cmp(x.shift, y.shift))
    U, s, V = evalSVD(atlas)
    
    for i, ind in enumerate(atlas):
        ind.label = i
    
    def selParetoRank(sValue, vValues):
        if sValue < 1e-5 :
            return []
        
        def calcFitness(uVal, ind):
            shift, robust = ind.shift, ind.robustness
            norm = np.abs(uVal)
            if norm < minDiv:
                ind.fitness.values = 0., np.inf, 0., len(ind)
            elif not np.isfinite(shift):
                ind.fitness.values = 0., np.inf, 0., len(ind)
            else:
                ind.fitness.values = shift*norm, robust/norm, norm, len(ind)
            ind.svd = norm
            ind.size = len(ind)
        map(calcFitness, vValues, atlas)            

        nRet = 5
        if len(atlas) < nRet:
            nRet = len(atlas)
        return tools.selSPEA2(atlas, nRet)
        #return returnParetoFront(atlas)
    
    selected = sum(futures.map(selParetoRank, s, V), [])
    
    #Discard Duplicates
    removed = []
    for ind in selected:
        addInd = True
        for ind2 in removed:
            if ind2.label == ind.label:
                addInd = False
        if addInd:
            removed.append(ind)
    
    return removed
def pi_calculus_with_Montecarlo_Method(workers, attempts):
    print("number of workers %i - number of attempts %i" %(workers,attempts)) 
    bt = time()
    #at this point we call the scoop.futures.map function;
    #the evaluate_points_in_circle function is executed
    #asynchronously and several calls to it can run concurrently
    evaluate_task = \
                  futures.map(evaluate_points_in_circle, \
                       [attempts] * workers)
    Taskresult = sum(evaluate_task)
    print ("%i points fallen in a unit disk after " \
           %(Taskresult/attempts))
    piValue = (4. * Taskresult / float(workers * attempts))
    
    computationalTime = time() - bt
    print("value of pi = " + str(piValue))
    print ("error percentage = " + \
           str((((abs(piValue - math.pi)) * 100) / math.pi)))
    print("total time: " + str(computationalTime))
Example #50
0
def mergesort(lst, current_depth=0, parallel_depth=0):
    if len(lst) <= 1:
        return lst
    middle = int(len(lst) / 2)
    if current_depth < parallel_depth:
        results = list(
            futures.map(
                mergesort,
                [
                    lst[:middle],
                    lst[middle:],
                ],
                repeat(current_depth+1),
                repeat(parallel_depth),
            )
        )
    else:
        results = []
        results.append(mergesort(lst[:middle]))
        results.append(mergesort(lst[middle:]))
    return merge(*results)
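merge() is referenced but not shown; a minimal sketch of the standard two-way merge it presumably performs on the two sorted halves:

# Minimal sketch of the two-way merge the example relies on (assumed, not shown
# in the original source): combine two sorted lists into one sorted list.
def merge(left, right):
    merged, i, j = [], 0, 0
    while i < len(left) and j < len(right):
        if left[i] <= right[j]:
            merged.append(left[i]); i += 1
        else:
            merged.append(right[j]); j += 1
    merged.extend(left[i:])
    merged.extend(right[j:])
    return merged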
Example #51
0
def execute(dbHost, dbPort, dbName, start, stop, cases, numPlayers,
            initialCash, numTurns, ops, fops):
    """
    Executes a general experiment.
    @param dbHost: the database host
    @type dbHost: L{str}
    @param dbPort: the database port
    @type dbPort: L{int}
    @param dbName: the database collection name
    @type dbName: L{str}
    @param start: the starting seed
    @type start: L{int}
    @param stop: the stopping seed
    @type stop: L{int}
    @param cases: the list of designs to execute
    @type cases: L{list}
    @param numPlayers: the number of players
    @type numPlayers: L{int}
    @param initialCash: the initial cash
    @type initialCash: L{int}
    @param numTurns: the number of turns
    @type numTurns: L{int}
    @param ops: the operations definition
    @type ops: L{str}
    @param fops: the federation operations definition
    @type fops: L{str}
    """
    executions = [(dbHost, dbPort, dbName,
                   [e for e in elements.split(' ') if e != ''],
                   numPlayers, initialCash, numTurns, seed, ops, fops)
        for (seed, elements) in itertools.product(range(start, stop), cases)]
    numComplete = 0.0
    logging.info('Executing {} cases with seeds from {} to {} for {} total executions.'
                 .format(len(cases), start, stop, len(executions)))
    for results in futures.map(queryCase, executions):
        print results
Example #52
0
def func1(n):
  # To force an immediate evaluation, you can wrap your map in a list such as:
  result = list(futures.map(func2, [i+1 for i in range(n)]))
  return sum(result)
Example #53
0
def func3(n):
  # This call results in a generator function
  result = futures.map(func4, [i+1 for i in range(n)])
  # The results are evaluated here when they are accessed.
  return sum(result)
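The two variants differ only in when the work is forced: wrapping the map in list() (Example #52) resolves every future before the function returns, while returning the bare generator (here) defers resolution until sum() iterates it, which is also where any exception raised by the mapped function surfaces, as the exception-handling examples #32 and #36 demonstrate.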
Example #54
0
def main():
    if len(sys.argv)!=4:
        print 'USAGE:'
        print 'python -m scoop devel.py [cloneID] [clusterDir] [outputDir]'
        print 'see devel_config.py'
        return

    cloneID = sys.argv[1]
    clusterDir = sys.argv[2]; assert clusterDir[-1]=='/',"should be ended with '/'"
    baseOutDir = sys.argv[3]; assert baseOutDir[-1]!='/',"should NOT be ended with '/'"

    clfParam = None
    method = cfg['method']
    if method=='esvm':
        from esvm_config import config as clfParam
    elif method=='psvm':
        from psvm_config import config as clfParam
    else:
        print 'FATAL: unknown method'
        return

    outDir = os.path.join(baseOutDir,'devel-'+os.path.basename(baseOutDir))
    if not(os.path.isdir(baseOutDir)): os.makedirs(baseOutDir)
    if not(os.path.isdir(outDir)): os.makedirs(outDir)

    ## Load data ###################################################################################
    dataLog = {}; dataLogFpath = os.path.join(outDir,'data_log_'+os.path.basename(baseOutDir)+'.json')
    dataset = clusterDir.split('/')[-2].split('-')[-1]; dataLog['dataset'] = dataset
    datasetParams = dataset.split('#')
    assert datasetParams[0]=='yamanishi'

    xyDevFpath = os.path.join(baseOutDir,'_'.join(['xdev','ydev','xrel','yrel']+datasetParams)+'.h5')
    if os.path.exists(xyDevFpath):
        print 'loading data from PREVIOUS...'

        with h5py.File(xyDevFpath,'r') as f:
            xdev = f['xdev'][:]
            ydev = f['ydev'][:]
            xrel = f['xrel'][:]
            yrel = f['yrel'][:]
            xrelraw = f['xrelraw'][:]

        with open(dataLogFpath,'r') as f:
            dataLog = yaml.load(f)

    else:
        print 'loading data FRESHLY...'

        print 'loading cluster result...'
        nUnlabels = []
        statFnames = [i for i in os.listdir(clusterDir) if 'labels_stat.json' in i]
        for i in statFnames:
            with open(os.path.join(clusterDir,i),'r') as f: stat = yaml.load(f)
            nUnlabels.append(stat['0'])

        # use the cluster with minimum numbers of unlabeled samples
        metric = '_'.join(statFnames[ nUnlabels.index(min(nUnlabels)) ].split('_')[0:2])
        dataLog['metric'] = metric

        connFpath = os.path.join(clusterDir,metric+'_labels.pkl')
        with open(connFpath,'r') as f:
            data = pickle.load(f)

        ##
        print 'getting devel and release data...'
        xraw = []; yraw = []
        for k,v in data.iteritems():
            for vv in v:
                xraw.append(vv)
                yraw.append(k)

        devIdx = [i for i in range(len(xraw)) if yraw[i]!=0]
        xdev = [xraw[i] for i in devIdx]
        ydev = [yraw[i] for i in devIdx]

        relIdx = [i for i in range(len(xraw)) if yraw[i]==0]
        xrel = [xraw[i] for i in relIdx]
        yrel = [yraw[i] for i in relIdx]

        dataLog['nDevel'] = len(devIdx); dataLog['nData'] = len(yraw)
        dataLog['rDevel:Data'] = dataLog['nDevel']/float(dataLog['nData'])
        dataLog['nDevel(+)'] = len( [i for i in ydev if i==1] ); assert dataLog['nDevel(+)']!=0
        dataLog['nDevel(-)'] = len( [i for i in ydev if i==-1] ); assert dataLog['nDevel(-)']!=0
        dataLog['rDevel(+):Devel'] = float(dataLog['nDevel(+)'])/dataLog['nDevel']
        dataLog['rDevel(-):Devel'] = float(dataLog['nDevel(-)'])/dataLog['nDevel']
        dataLog['rDevel(+):(-)'] = float(dataLog['nDevel(+)'])/float(dataLog['nDevel(-)'])
        dataLog['nRelease'] = len(relIdx);
        dataLog['rRelease:Data'] = dataLog['nRelease']/float(dataLog['nData'])

        ##
        print 'loading com, pro feature...'
        krFpath = os.path.join(cfg['datasetDir'],datasetParams[0],'feature',
                               'klekotaroth','klekotaroth-'+datasetParams[1]+'.h5')
        aacFpath = os.path.join(cfg['datasetDir'],datasetParams[0],'feature',
                                'amino-acid-composition','amino-acid-composition-'+datasetParams[1]+'.h5')

        krDict = {}; aacDict = {}
        with h5py.File(krFpath, 'r') as f:
            for com in [str(i) for i in f.keys()]:
                krDict[com] = f[com][:]
        with h5py.File(aacFpath, 'r') as f:
            for pro in [str(i) for i in f.keys()]:
                aacDict[pro] = f[pro][:]
                # aacDict[pro] = list( fu.map(lambda x: float('%.2f'%(x)),f[pro][:]) ) # rounding

        comFeaLenOri = len(krDict.values()[0])
        proFeaLenOri = len(aacDict.values()[0])

        ##
        msg = 'extract (com,pro) feature... dims: '+str(comFeaLenOri)+','+str(proFeaLenOri)
        msg += ' of '+str(len(ydev))+' and '+str(len(yrel))
        print msg

        sh.setConst(krDict=krDict)
        sh.setConst(aacDict=aacDict)
        xdevf = list( fu.map(cutil.extractComProFea,xdev) )
        xrelf = list( fu.map(cutil.extractComProFea,xrel) )

        ##
        xyDevList = cutil.divideSamples(xdevf,ydev,cfg['smoteBatchSize'])
        if cfg['maxNumberOfSmoteBatch'] != 0:
            xyDevList = xyDevList[0:cfg['maxNumberOfSmoteBatch']]

        smoteSeed = util.seed(); dataLog['smoteSeed'] = smoteSeed
        sh.setConst(smoteSeed=smoteSeed)

        print 'resampling via Smote FRESHLY... '+str(len(xyDevList))+' smote(s)'+' on '+str(len(ydev))
        smoteTic = time.time()

        xdevfr = []; ydevr = []
        xydevfrList = list( fu.map(ensembleSmote,xyDevList) )
        for xdevfri,ydevri in xydevfrList:
            for x in xdevfri: xdevfr.append(x.tolist())
            for y in ydevri: ydevr.append(y)
        assert len(xdevfr)==len(ydevr),'len(xdevfr)!=len(ydevr)'

        dataLog['nSmote'] = len(xyDevList)
        dataLog['nDevelResampled'] = len(ydevr)
        dataLog['rDevelResampled:Data'] = dataLog['nDevelResampled']/float(dataLog['nData'])
        dataLog['nDevelResampled(+)'] = len( [i for i in ydevr if i==1] )
        dataLog['nDevelResampled(-)'] = len( [i for i in ydevr if i==-1] )
        dataLog['rDevelResampled(+):DevelResampled'] = dataLog['nDevelResampled(+)']/float(dataLog['nDevelResampled'])
        dataLog['rDevelResampled(-):DevelResampled'] = dataLog['nDevelResampled(-)']/float(dataLog['nDevelResampled'])
        dataLog['rDevelResampled(+):(-)'] = dataLog['nDevelResampled(+)']/float(dataLog['nDevelResampled(-)'])
        dataLog['timeSMOTE'] =  str(time.time()-smoteTic)

        ##
        print 'update xdev,ydev,xrel... '+str(np.asarray(xdevfr).shape)
        xrelraw = xrel[:] # raw: feature is NOT extracted
        xrel = xrelf[:]
        xdev = xdevfr[:]
        ydev = ydevr[:]

        print 'writing updated xdev,ydev and xrel,yrel...'
        with h5py.File(xyDevFpath,'w') as f:
            f.create_dataset('xdev',data=xdev,dtype=np.float32)
            f.create_dataset('ydev',data=ydev,dtype=np.int8)
            f.create_dataset('xrel',data=xrel,dtype=np.float32)
            f.create_dataset('yrel',data=yrel,dtype=np.int8)
            f.create_dataset('xrelraw',data=xrelraw)

        print 'writing dataLog...'
        dataLog['nCom'] = len(krDict)
        dataLog['nPro'] = len(aacDict)
        with open(dataLogFpath,'w') as f:
            json.dump(dataLog,f,indent=2,sort_keys=True)

    ## TUNE+TRAIN+TEST #############################################################################
    devLog = {}
    devSeed = util.seed(); dataLog['devSeed'] = devSeed
    tag = '_'.join([method+'#'+cloneID,dataset,util.tag()])

    ## split devel dataset
    msg = ' '.join( ['devel',dataset,cloneID])
    xtr,xte,ytr,yte = tts(xdev,ydev,test_size=cfg['testSize'],
                          random_state=devSeed,stratify=ydev)

    if cfg['maxTestingSamples']>0:
        chosenIdx = np.random.randint(len(xte),size=cfg['maxTestingSamples'])
        xte = [xte[i] for i in chosenIdx]; yte = [yte[i] for i in chosenIdx]

    devLog['nTraining'] = len(xtr)
    devLog['nTraining(+)'] = len([i for i in ytr if i==1])
    devLog['nTraining(-)'] = len([i for i in ytr if i==-1])
    devLog['rTraining(+):(-)'] = devLog['nTraining(+)']/float(devLog['nTraining(-)'])
    devLog['rTraining:Devel'] = devLog['nTraining']/float(dataLog['nDevelResampled'])
    devLog['nTesting'] = len(xte)
    devLog['nTesting(+)'] = len([i for i in yte if i==1])
    devLog['nTesting(-)'] = len([i for i in yte if i==-1])
    devLog['rTesting(+):(-)'] = devLog['nTesting(+)']/float(devLog['nTesting(-)'])
    devLog['rTesting:Devel'] = devLog['nTesting']/float(dataLog['nDevelResampled'])

    ## tuning
    clf = None
    if method=='esvm':
        clf  = eSVM(simMat=None)
    elif method=='psvm':
        clf = svm.SVC(kernel=clfParam['kernel'],probability=True)

    ## training
    print msg+': fitting nTr= '+str(len(ytr))
    trTic = time.time()

    if method=='esvm':
        clf.fit(xtr,ytr)
        devLog['labels'] = clf.labels()
        devLog['nSVM'] = clf.nSVM()
        devLog['xtrDimAllBatches'] = clf.xtrDimAllBatches()
    elif method=='psvm':
        if cfg['method']['kernel']=='precomputed':
            assert False
            # simMatTr = cutil.makeComProKernelMatFromSimMat(xtr,xtr,simMat)
            # clf.fit(simMatTr,ytr)
        else:
            clf.fit(xtr,ytr)
        devLog['labels'] = clf.classes_.tolist()
    devLog['timeTraining'] = str(time.time()-trTic)

    ## testing
    print msg+': predicting nTe= '+str(len(yte))
    teTic = time.time()

    if method=='esvm':
        ypred,yscore = clf.predict(xte)
    elif method=='psvm':
        if cfg['method']['kernel']=='precomputed':
            assert False
            # simMatTe = cutil.makeComProKernelMatFromSimMat(xte,xtr,simMat)
            # ypred = clf.predict(simMatTe)
            # yscore = clf.predict_proba(simMatTe)
        else:
            ypred = clf.predict(xte)
            yscore = clf.predict_proba(xte)
            yscore = [max(i.tolist()) for i in yscore]
    devLog['timeTesting'] = str(time.time()-teTic)

    ## TEST RELEASE ################################################################################
    print msg+': predicting RELEASE n= '+str(len(yrel))
    relTic = time.time()

    if method=='esvm':
        yrel,yrelscore = clf.predict(xrel)
    elif method=='psvm':
        if cfg['method']['kernel']=='precomputed':
            assert False
            # simMatTe = cutil.makeComProKernelMatFromSimMat(xrel,xtr,simMat)
            # yrel = clf.predict(simMatTe)
            # yrelscore = clf.predict_proba(simMatTe)
        else:
            yrel = clf.predict(xrel)
            yrelscore = clf.predict_proba(xrel)
            yrelscore = [max(i.tolist()) for i in yrelscore]
    devLog['timeRelease'] = str(time.time()-relTic)

    ## WRITE RESULT ################################################################################
    result = {'yte':yte,'ypred':ypred,'yscore':yscore,
              'xrelraw':xrelraw,'yrel':yrel,'yrelscore':yrelscore}

    print 'writing prediction...'
    with h5py.File(os.path.join(outDir,'result_'+tag+'.h5'),'w') as f:
        for k,v in result.iteritems():
            if 'raw' in k:
                f.create_dataset(k,data=v)
            else:
                dt = np.int8
                if 'score' in k: dt = np.float32
                f.create_dataset(k,data=v,dtype=dt)

    ##
    print 'writing devLog...'
    devLog['clfParam'] = clfParam
    devLog['devParam'] = cfg
    with open(os.path.join(outDir,'devLog_'+tag+'.json'),'w') as f:
        json.dump(devLog,f,indent=2,sort_keys=True)
Example #55
0
from scoop import futures
import nlp
import glob
import sys

UPLOAD_PATH = './uploads/'
def analyze_emotion(filename):
    with open(filename, 'rU') as f:
        return {filename:dict(nlp.emotion_analysis(f.read()))}

if __name__ == "__main__":
    files = glob.glob(UPLOAD_PATH+'*.txt') + glob.glob(UPLOAD_PATH+'*.data')
    ret = futures.map(analyze_emotion, files)
    print([x for x in ret if isinstance(x,dict)])
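Like the other examples on this page, this module is intended to be launched through the SCOOP runner (python -m scoop script.py); the if __name__ == "__main__": guard keeps the worker processes, which import the module, from re-running the top-level map themselves.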
Example #56
0
 def my_map(*args, **kwargs):
     return list(futures.map(*args, **kwargs))
Example #57
0
def map_(*args, **kwargs):
    return map(WorkerWrapper(args[0]), *args[1:], **kwargs)
Example #58
0
def worker_run_simple(counter):
    """Execute the cmd
        to be called with
    """
    cmd_sanity = ["%s" % x for x in parse_worker_args()]  ## ready to join
    set_scoop_env('counter', counter)
    ec, out = run_simple(' '.join(cmd_sanity), disable_log=True)

    return  ec, out  ## return 1 item

if __name__ == '__main__':
    _log = make_worker_log(NAME, debug=_DEBUG)

    worker_func = worker_run_simple

    res = None
    start, stop, step = parse_worker_args(False)
    try:
        _log.debug("main_run: going to start map")
        res_generator = futures.map(worker_func, xrange(start, stop, step))
        _log.debug("main_run: finished map")
        res = [x for x in res_generator]
        _log.debug("main_run: finished res from generator")
    except:
        _log.exception("main_run: main failed with main_func %s with start %s stop %s" % (worker_func, start, stop))

    print res


  # Fragment of a timing comparison; the enclosing function and the definitions of
  # x, y, z and of the flop class (with its compute method) are not shown in this excerpt.
  start = time.time()
  for q in y:
    x*q
  print 'for loop took %f seconds' % (time.time()-start)

  start = time.time()
  [x*q for q in y]
  print 'list comp took %f seconds' % (time.time()-start)

  start = time.time()
  x*z
  print 'broadcasting took %f seconds' % (time.time()-start)

  start = time.time()
  map(lambda q: x*q, y)
  print 'serial map took %f seconds' % (time.time()-start)

  start = time.time()
  futures.map(lambda q: x*q, y)
  print 'parallel map took %f seconds' % (time.time()-start)

  flong = flop(x)
  start = time.time()
  futures.map(flong.compute, y)
  print 'parallel map with method took %f seconds' % (time.time()-start)