def cast_data(header, tablename, data):
    # Build one caster per column from the table's SQL type mapping.
    typedict = get_typedict(tablename)
    type_casters = []
    for i in range(len(header)):
        sql_type = typedict[header[i]]
        if sql_type == text_type:
            # Encode text columns as UTF-8 bytes (avoid shadowing the built-in str).
            type_casters.append(lambda value: value.encode('UTF-8'))
            #type_casters.append(lambda passer: passer)
        elif sql_type == int_type:
            type_casters.append(int)
        elif sql_type == date_type:
            type_casters.append(timestamp_parser.parse)
    log('casting data for ' + str(len(data)) + " rows")

    def cast_line(dataln):
        # Apply the matching caster to every column of one row.
        casted = []
        for col_id in range(len(dataln)):
            casted.append(type_casters[col_id](dataln[col_id]))
        return casted

    # Caveat: Pool.map pickles the callable it is given; a function defined inside
    # cast_data (and the lambdas above) cannot be pickled, so this call fails with
    # "Can't pickle local object" unless the worker is moved to module level.
    tpool = Pool(processes=6)
    ret = tpool.map(cast_line, data)
    tpool.close()
    return ret
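# Hedged sketch (not part of the original code): one way to keep the parallel casting
# while avoiding the local-function pickling problem is to move the per-row worker to
# module level and bind the caster list with functools.partial. The names cast_row,
# _encode_utf8 and cast_rows_parallel are illustrative assumptions, not the project's
# API, and the sketch assumes the casters themselves are picklable (module-level
# functions or builtins).
from functools import partial
from multiprocessing import Pool


def _encode_utf8(value):
    # Module-level replacement for the lambda, so it can be pickled.
    return value.encode('UTF-8')


def cast_row(row, type_casters):
    # Apply the per-column caster to each value of one row.
    return [type_casters[i](value) for i, value in enumerate(row)]


def cast_rows_parallel(type_casters, data, processes=6):
    with Pool(processes=processes) as pool:
        return pool.map(partial(cast_row, type_casters=type_casters), data)


if __name__ == "__main__":
    casters = [_encode_utf8, int]
    print(cast_rows_parallel(casters, [("a", "1"), ("b", "2")], processes=2))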
def runMultiProcessTrajectories(self, repeat):
    pool = Pool(processes=len(self.posIni))
    result = pool.map(partial(self.runNtrajectory, repeat=repeat),
                      [(x, y) for x, y in self.posIni])
    pool.close()
    pool.join()
    meanCost, meanTraj = 0., 0.
    for Cost, traj in result:
        meanCost += Cost
        meanTraj += traj
    size = len(result)
    return meanCost / size, meanTraj / size
def runMultiProcessTrajectories(self, repeat):
    pool = Pool(processes=len(self.posIni))
    result = pool.map(partial(self.nTraj, repeat=repeat),
                      [(x, y) for x, y in self.posIni])
    pool.close()
    pool.join()
    meanCost, meanTraj = 0, 0
    for CostCMAES, traj in result:
        meanCost += CostCMAES
        meanTraj += traj
    size = len(result)
    return meanCost / size, meanTraj / size
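# Hedged, self-contained illustration of the fan-out pattern used in the two functions
# above: each (x, y) tuple becomes the single positional argument of the partially
# applied worker, and `repeat` is fixed by functools.partial. run_point is a stand-in
# for self.runNtrajectory / self.nTraj, not part of the original classes.
from functools import partial
from multiprocessing import Pool


def run_point(point, repeat):
    x, y = point
    # Placeholder "cost" and "trajectory time" so the example runs on its own.
    return x + y, repeat * 0.1


if __name__ == "__main__":
    positions = [(0.1, 0.2), (0.3, 0.4), (0.5, 0.6)]
    with Pool(processes=len(positions)) as pool:
        results = pool.map(partial(run_point, repeat=5), positions)
    mean_cost = sum(c for c, _ in results) / len(results)
    mean_traj = sum(t for _, t in results) / len(results)
    print(mean_cost, mean_traj)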
def filter(dirty_data):
    log("starting filter")
    tpool = Pool(processes=cpus)
    ret = []
    log("filtering deleted and not english")
    # Each result pairs the comment with a keep flag; keep only flagged comments.
    for line in tpool.map(Filter.__is_not_deleted_or_not_non_english, dirty_data):
        if line[1]:
            ret.append(line[0])

    def clean_links_and_punctuation(comment):
        # Strip links from every word, then rebuild the comment string.
        words = comment.split(" ")
        words = list(map(Filter.__filter_links, words))
        return " ".join(words)

    log("filtering links and punctuation")
    # Note: callables handed to Pool.map must be picklable; this nested function
    # needs to live at module or class level for the call below to work.
    ret = tpool.map(clean_links_and_punctuation, ret)
    tpool.close()
    log("filter done")
    return ret
def run(self):
    nproc = PipeEnum.PARALLEL_N_PROCS.value
    nchunks = PipeEnum.PARALLEL_N_CHUNKS.value
    if nproc in self.kwargs:
        # Parallel path: fan the reader's records out across a worker pool.
        n_processes = self.kwargs[nproc]
        chunks = self.kwargs.get(nchunks, 1)
        # Pool.map already returns a list, and the context manager shuts the
        # workers down once the results are in.
        with Pool(n_processes) as pool:
            self.output = pool.map(self.map_function,
                                   self.reader.data,
                                   chunksize=chunks)
    else:
        # Serial path: hand the whole dataset to map_function in one call.
        self.output = self.map_function(self.reader.data)
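# Hedged note on the chunksize used above: Pool.map splits the input iterable into
# batches of `chunksize` items per task, which trades scheduling overhead against load
# balance; the results are identical either way. Minimal standalone illustration
# (square() and the inputs are assumptions for the example only):
from multiprocessing import Pool


def square(n):
    return n * n


if __name__ == "__main__":
    with Pool(4) as pool:
        # Same output; a larger chunksize just sends fewer, bigger tasks to workers.
        print(pool.map(square, range(16), chunksize=1))
        print(pool.map(square, range(16), chunksize=4))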
def clean_data():
    rows_per_loop = 100000
    log("")
    log("starting")
    dirty_db_path = ROOTDIR + dir_sep + "stage_2_clean.db"
    clean_db_path = ROOTDIR + dir_sep + "stage_3_cleaner.db"
    dirty_db_cursor = create_connection(dirty_db_path).cursor()
    clean_db = create_connection(clean_db_path)
    clean_db_cursor = clean_db.cursor()
    # Empty the destination table and reset its autoincrement counter.
    clean_db_cursor.execute("DELETE FROM bodies")
    clean_db_cursor.execute("delete from sqlite_sequence where name='bodies'")
    dirty_db_cursor.execute("select bodies from bodies")
    data = dirty_db_cursor.fetchmany(rows_per_loop)
    tpool = Pool(processes=4)
    loop_n = 1
    log("detected " + str(cpus) + " as cpu count")
    inserted = 0
    more_data = True
    while more_data:
        log("cleaning data")
        # Clean the batch in parallel, then wrap each row in a 1-tuple for executemany.
        data = tpool.map(clean_line, data)
        data = list(map(lambda line: (line,), data))
        log("inserting " + str(len(data)) + " rows")
        query = "insert into bodies (bodies) values (?)"
        clean_db_cursor.executemany(query, data)
        clean_db.commit()
        log("done loop, getting more data.")
        inserted += len(data)
        data = dirty_db_cursor.fetchmany(rows_per_loop)
        if len(data) < 1:
            more_data = False
            log("end of data")
        log("done " + str(loop_n) + " loops")
        loop_n += 1
    log("done")
    log("inserted " + str(inserted) + " rows")
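# Hedged, self-contained sketch of the streaming pattern above (fetchmany batches from
# one SQLite database, transform, executemany into another), using in-memory databases
# and a trivial transform so it runs on its own. Table and column names here are
# examples, not the project's schema.
import sqlite3


def copy_in_batches(src, dst, batch_size=2):
    src_cur = src.execute("select body from src_bodies")
    rows = src_cur.fetchmany(batch_size)
    while rows:
        # Transform each batch, then bulk-insert it as a list of 1-tuples.
        cleaned = [(body.strip().lower(),) for (body,) in rows]
        dst.executemany("insert into dst_bodies (body) values (?)", cleaned)
        dst.commit()
        rows = src_cur.fetchmany(batch_size)


if __name__ == "__main__":
    src = sqlite3.connect(":memory:")
    dst = sqlite3.connect(":memory:")
    src.execute("create table src_bodies (body text)")
    dst.execute("create table dst_bodies (body text)")
    src.executemany("insert into src_bodies values (?)",
                    [(" Hello ",), ("WORLD",), ("Foo ",)])
    copy_in_batches(src, dst)
    print(dst.execute("select body from dst_bodies").fetchall())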
def runNtrajectory(self, point, repeat):
    # Python 3 dropped tuple parameters; unpack the (x, y) start point explicitly.
    x, y = point
    costAll, trajTimeAll = np.zeros(repeat), np.zeros(repeat)
    for i in range(repeat):
        costAll[i], trajTimeAll[i] = self.runOneTrajectoryOpti(x, y)
    meanCost = np.mean(costAll)
    meanTrajTime = np.mean(trajTimeAll)
    self.costStore.append([x, y, meanCost])
    self.trajTimeStore.append([x, y, meanTrajTime])
    return meanCost, meanTrajTime

def mapableTrajecrtoryFunction(self, x, y, useless):
    # Adapter so Pool.map can call runOneTrajectory while ignoring the repeat index.
    return self.runOneTrajectory(x, y)

def runNtrajectoryMulti(self, point, repeat):
    x, y = point
    pool = Pool(processes=4)
    result = pool.map(partial(self.mapableTrajecrtoryFunction, x, y), range(repeat))
    pool.close()
    pool.join()
    meanCost, meanTraj = 0., 0.
    for Cost, traj in result:
        meanCost += Cost
        meanTraj += traj
    size = len(result)
    return meanCost / size, meanTraj / size

def runOneTrajectoryOpti(self, x, y):
    #self.tm.saveTraj = True
    cost, trajTime, lastX = self.tm.runTrajectoryOpti(x, y)
    #print "Exp local x y cost : ", x, y, cost
    # The original snippet breaks off here; callers above unpack (cost, trajTime).
    return cost, trajTime
def pre_stat(paras, df_microsatellites):
    # reference = paras["reference"]
    path_pre_stat = paras["output"].rstrip("/") + "/" + get_value("case") + ".stat"
    path_pre_stat_tmp = paras["output_tmp"].rstrip("/") + "/" + get_value("case") + ".stat"
    file_all_stat = open(path_pre_stat, "w")
    file_all_stat.write("\t".join([
        "repeat_unit_length", "repeat_times", "num_forward", "num_reversed",
        "this_repeat_mean_mean", "this_repeat_mean_std",
        "this_repeat_std_mean", "this_repeat_std_std",
        "forward_prefix", "forward_ms", "forward_suffix",
        "reversed_prefix", "reversed_ms", "reversed_suffix"
    ]) + "\n")
    df_microsatellites_download_sample = microsatellites_sampling(
        df_microsatellites, paras)
    for repeat_unit, unit_info in df_microsatellites_download_sample.items():
        for repeat_times, ms_infos in unit_info.items():
            logger.info("Processing repeat unit: " + str(repeat_unit) +
                        " repeat times: " + str(repeat_times))
            # Attach the shared parameters to every sampled microsatellite record.
            infos = []
            for _, ms_info in ms_infos.iterrows():
                ms_info["reference"] = paras["reference"]
                ms_info["prefix_len"] = paras["prefix_len"]
                ms_info["suffix_len"] = paras["suffix_len"]
                infos.append(ms_info)
            # Process this (repeat_unit, repeat_times) group in parallel.
            pool = Pool(processes=paras["threads"])
            res_infos = pool.map(process_one_ms, infos)
            pool.close()
            pool.join()
            suffix_str = "." + str(repeat_unit) + "." + str(repeat_times)
            file = open(path_pre_stat_tmp + suffix_str + ".repeat", "w")
            this_repeat_means = []
            this_repeat_stds = []
            num_forward = 0
            num_reversed = 0
            prefix_forward = []
            suffix_forward = []
            ms_forward = []
            prefix_reversed = []
            suffix_reversed = []
            ms_reversed = []
            # Aggregate per-microsatellite results, skipping entries with missing values.
            for res in res_infos:
                if None not in res:
                    file.write("\t".join(map(str, res[:-2])) + "\n")
                    this_repeat_means.append(res[3])
                    this_repeat_stds.append(res[4])
                    prefix_forward.extend(res[-1]["prefix_forward"])
                    suffix_forward.extend(res[-1]["suffix_forward"])
                    ms_forward.extend(res[-1]["ms_forward"])
                    prefix_reversed.extend(res[-1]["prefix_reversed"])
                    suffix_reversed.extend(res[-1]["suffix_reversed"])
                    ms_reversed.extend(res[-1]["ms_reversed"])
                    num_forward += res[-1]["num_forward"]
                    num_reversed += res[-1]["num_reversed"]
            file.close()
            if num_forward + num_reversed < 2:
                continue
            this_repeat_mean_mean = np.mean(this_repeat_means)
            this_repeat_mean_std = np.std(this_repeat_means)
            this_repeat_std_mean = np.mean(this_repeat_stds)
            this_repeat_std_std = np.std(this_repeat_stds)
            # Per-position means over the prefix, repeat, and suffix arrays, written
            # to the *.forward.qual and *.reversed.qual files.
            pd.concat(
                [
                    pd.DataFrame([np.nanmean(np.array(prefix_forward), axis=0)]),
                    pd.DataFrame([np.nanmean(np.array(ms_forward), axis=0)]),
                    pd.DataFrame([np.nanmean(np.array(suffix_forward), axis=0)])
                ],
                axis=1,
            ).to_csv(path_pre_stat_tmp + suffix_str + ".forward.qual")
            pd.concat(
                [
                    pd.DataFrame([np.nanmean(np.array(prefix_reversed), axis=0)]),
                    pd.DataFrame([np.nanmean(np.array(ms_reversed), axis=0)]),
                    pd.DataFrame([np.nanmean(np.array(suffix_reversed), axis=0)])
                ],
                axis=1,
            ).to_csv(path_pre_stat_tmp + suffix_str + ".reversed.qual")
            forward_prefix = np.nanmean(prefix_forward)
            forward_ms = np.nanmean(ms_forward)
            forward_suffix = np.nanmean(suffix_forward)
            reversed_prefix = np.nanmean(prefix_reversed)
            reversed_ms = np.nanmean(ms_reversed)
            reversed_suffix = np.nanmean(suffix_reversed)
            this_info_list = list(map(str, [
                repeat_unit, repeat_times, num_forward, num_reversed,
                this_repeat_mean_mean, this_repeat_mean_std,
                this_repeat_std_mean, this_repeat_std_std,
                forward_prefix, forward_ms, forward_suffix,
                reversed_prefix, reversed_ms, reversed_suffix
            ]))
            file_all_stat.write("\t".join(this_info_list) + "\n")
    file_all_stat.close()
    return
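# Hedged design note: pre_stat above creates and tears down a fresh Pool for every
# (repeat_unit, repeat_times) group. If worker startup cost matters, a single pool can
# be reused across groups. Minimal sketch of that shape; process_group_reusing_pool
# and the demo grouping are stand-ins matching the structure, not the project's API.
from multiprocessing import Pool


def process_group_reusing_pool(groups, worker, threads):
    # groups: dict mapping a group key to its list of work items;
    # worker: a picklable per-item function (module-level or builtin).
    results = {}
    with Pool(processes=threads) as pool:
        for key, items in groups.items():
            results[key] = pool.map(worker, items)
    return results


if __name__ == "__main__":
    demo_groups = {"a": [-1, 2, -3], "b": [4, -5]}
    print(process_group_reusing_pool(demo_groups, abs, threads=2))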