def main(self, name, opts):
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    output_names = dat.get_output_names(opts.data_files[0],
                                        regex=opts.output_names)
    stats = OrderedDict()
    for name in output_names:
        output = hdf.read(opts.data_files, 'outputs/%s' % name,
                          nb_sample=opts.nb_sample)
        output = list(output.values())[0]
        stats[name] = get_output_stats(output)

    tmp = []
    for key, value in six.iteritems(stats):
        tmp.append(pd.DataFrame(value, index=[key]))
    stats = pd.concat(tmp)
    stats.index.name = 'output'
    stats.reset_index(inplace=True)

    print(stats.to_string())
    if opts.out_tsv:
        stats.to_csv(opts.out_tsv, sep='\t', index=False)
    if opts.out_fig:
        plot_stats(stats).savefig(opts.out_fig)

    return 0
class FractionTaxaBarStack(Graph):
    """Comparing all fractions across all pools in a barstack"""
    short_name = 'fraction_taxa_barstack'

    def plot(self):
        self.frame = OrderedDict((('%s - %s' % (p, f), getattr(p.fractions, f).rdp.phyla)
                                  for f in ('low', 'med', 'big')
                                  for p in self.parent.pools))
        self.frame = pandas.DataFrame(self.frame)
        self.frame = self.frame.fillna(0)
        self.frame = self.frame.transpose()
        self.frame = self.frame.apply(lambda x: 100 * x / x.sum(), axis=1)
        # Sort the table by sum (sort_values/reindex replace the removed
        # Series.sort() and DataFrame.reindex_axis() calls) #
        sums = self.frame.sum()
        sums = sums.sort_values(ascending=False)
        self.frame = self.frame.reindex(sums.keys(), axis=1)
        # Plot #
        fig = pyplot.figure()
        axes = self.frame.plot(kind='bar', stacked=True, color=cool_colors)
        fig = pyplot.gcf()
        # Other #
        axes.set_title('Species relative abundances per fraction per pool')
        axes.set_ylabel('Relative abundances in percent')
        axes.xaxis.grid(False)
        axes.yaxis.grid(False)
        axes.set_ylim([0, 100])
        # Put a legend below current axis
        axes.legend(loc='upper center', bbox_to_anchor=(0.5, -0.20),
                    fancybox=True, shadow=True, ncol=5)
        # Save it #
        self.save_plot(fig, axes, width=24.0, height=14.0, bottom=0.30,
                       top=0.97, left=0.04, right=0.98)
        self.frame.to_csv(self.csv_path)
        pyplot.close(fig)
def save_results(self):
    results_dict = OrderedDict()
    results_dict['Date'] = [datetime.datetime.now().strftime('%a %d %b')]
    results_dict['Correct'] = [len(self.correct)]
    results_dict['Answered'] = [len(self.correct) + len(self.wrong)]
    results_dict['Wrong'] = [sorted(self.wrong)]
    results_dict['Topic'] = [self.topic]
    results_df = pd.DataFrame(data=results_dict)
    results_path = f'results/{self.course}.csv'
    try:
        # Append the new row to the existing results file
        # (pd.concat replaces the removed DataFrame.append)
        previous = pd.read_csv(results_path, sep='\t')
        combined = pd.concat([previous, results_df], ignore_index=True)
        combined.to_csv(results_path, sep='\t', index=False)
    except FileNotFoundError:
        # First run for this course: create the file
        results_df.to_csv(results_path, sep='\t', index=False)
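# A standalone, runnable version of the append-or-create pattern used in
# save_results above, assuming only pandas; the file name, separator and row
# contents here are illustrative and not taken from the original project.
import os
import pandas as pd

def append_row(path, row):
    """Append one row to a TSV, creating the file if it does not exist."""
    new = pd.DataFrame([row])
    if os.path.exists(path):
        previous = pd.read_csv(path, sep='\t')
        new = pd.concat([previous, new], ignore_index=True)
    new.to_csv(path, sep='\t', index=False)

append_row('demo_results.tsv', {'Date': 'Mon 01 Jan', 'Correct': 7, 'Answered': 10})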
class CollectionGFF(Parser): def __init__(self, in_file=None, records=None, format="gff", parsing_mode="only_coordinates", black_list=(), white_list=(), featuretype_separation=False, scaffold_syn_dict=None): """ IMPORTANT: coordinates are converted to 0-based :param in_file: :param records: :param format: :param parsing_mode: :param black_list: :param white_list: :param featuretype_separation: :param scaffold_syn_dict: """ self.formats = ["gff", "gtf", "bed"] self.GFF_COLS = AnnotationFormats.GFF_COLS self.BED_COLS = AnnotationFormats.BED_COLS self.parsing_parameters = { "gff": { "only_coordinates": { "col_names": ["scaffold", "start", "end"], "cols": [0, 3, 4], "index_cols": "scaffold", "converters": { "scaffold": str, "start": lambda x: np.int32(x) - 1, "end": np.int32, }, "col_name_indexes": { "scaffold": 0, "start": 1, "end": 2 }, }, "coordinates_and_type": { "col_names": ["scaffold", "featuretype", "start", "end"], "cols": [0, 2, 3, 4], "index_cols": ["scaffold"], "converters": { "scaffold": str, "featuretype": str, "start": lambda x: np.int32(x) - 1, "end": np.int32, }, "col_name_indexes": { "scaffold": 0, "featuretype": 1, "start": 2, "end": 3 }, }, "coord_and_attr": { "col_names": [ "scaffold", "featuretype", "start", "end", "strand", "attributes" ], "cols": [0, 2, 3, 4, 6, 8], "index_cols": ["scaffold", "featuretype"], "converters": { "scaffold": str, "featuretype": str, "start": lambda x: np.int32(x) - 1, "end": np.int32, "strand": str, "attributes": str, }, "col_name_indexes": { "scaffold": 0, "featuretype": 1, "start": 2, "end": 3, "strand": 4, "attributes": 5, }, }, "all": { "col_names": [ "scaffold", "source", "featuretype", "start", "end", "score", "strand", "phase", "attributes" ], "cols": [0, 1, 2, 3, 4, 5, 6, 7, 8], "index_cols": ["scaffold"], "converters": { "scaffold": str, "source": str, "featuretype": str, "start": lambda x: np.int32(x) - 1, "end": np.int32, "score": str, "strand": str, "phase": str, "attributes": str, }, "col_name_indexes": { "scaffold": 0, "source": 1, "featuretype": 2, "start": 3, "end": 4, "score": 5, "strand": 6, "phase": 7, "attributes": 8 }, }, "complete": { "col_names": [ "scaffold", "source", "featuretype", "start", "end", "score", "strand", "phase", "attributes" ], "cols": [0, 1, 2, 3, 4, 5, 6, 7, 8], "index_cols": ["scaffold"], "converters": { "scaffold": str, "source": str, "featuretype": str, "start": lambda x: np.int32(x) - 1, "end": np.int32, "score": str, "strand": str, "phase": str, "attributes": str, }, "col_name_indexes": { "scaffold": 0, "source": 1, "featuretype": 2, "start": 3, "end": 4, "score": 5, "strand": 6, "phase": 7, "attributes": 8 }, }, }, "bed": { "only_coordinates": { "col_names": ["scaffold", "start", "end"], "cols": [0, 1, 2], "index_cols": "scaffold", "converters": { "scaffold": str, "start": np.int32, "end": np.int32, }, "col_name_indexes": { "scaffold": 0, "start": 1, "end": 2 }, }, } } self.parsing_mode = parsing_mode self.featuretype_separation = featuretype_separation self.featuretype_parsing_modes = [ "coordinates_and_type", "all", "coord_and_attr", "complete" ] self.attributes_parsing_modes = ["complete", "coord_and_attr"] self.format = format self.black_list = black_list self.white_list = white_list self.featuretype_list = [] self.scaffold_syn_dict = scaffold_syn_dict # attributes type conversion parameters self.parameter_separator_dict = OrderedDict() self.default_replace_dict = OrderedDict({".": None}) self.converters = OrderedDict() self.pandas_int_type_correspondence = OrderedDict({ "Int8": np.float16, 
"Int16": np.float16, "Int32": np.float32, "Int64": np.float64, }) # init aliases self.record_id_col = self.parsing_parameters[self.format][ self.parsing_mode]["col_name_indexes"]["scaffold"] self.record_start_col = self.parsing_parameters[self.format][ self.parsing_mode]["col_name_indexes"]["start"] self.record_end_col = self.parsing_parameters[self.format][ self.parsing_mode]["col_name_indexes"]["end"] self.col_names = self.parsing_parameters[self.format][ self.parsing_mode]["col_names"] self.index_cols = self.parsing_parameters[self.format][ self.parsing_mode]["index_cols"] # load records self.featuretype_list = None if in_file: self.read(in_file, format=format, parsing_mode=parsing_mode, black_list=black_list, white_list=white_list, featuretype_separation=featuretype_separation) else: self.records = records if featuretype_separation and (self.parsing_mode in self.featuretype_parsing_modes): self.scaffold_dict = OrderedDict([ (featuretype, self.records[featuretype].index.get_level_values( 'scaffold').unique().to_list()) for featuretype in self.featuretype_list ]) self.scaffold_list = list(self.scaffold_dict.values()) else: self.scaffold_list = self.records.index.get_level_values( 'scaffold').unique().to_list() self.scaffold_dict = None def read(self, in_file, format="gff", parsing_mode="only_coordinates", featuretype_separation=False, sort=False, black_list=(), white_list=()): if format not in self.parsing_parameters: raise ValueError( "ERROR!!! This format(%s) was not implemented yet for parsing!" % parsing_mode) elif parsing_mode not in self.parsing_parameters[format]: raise ValueError( "ERROR!!! This format(%s) was not implemented yet for parsing in this mode(%s)!" % (format, parsing_mode)) print("%s\tReading input..." % str(datetime.datetime.now())) self.records = pd.read_csv( in_file, sep='\t', header=None, na_values=".", comment="#", usecols=self.parsing_parameters[format][parsing_mode]["cols"], converters=self.parsing_parameters[format][parsing_mode] ["converters"], names=self.parsing_parameters[format][parsing_mode]["col_names"], index_col=self.parsing_parameters[format][parsing_mode] ["index_cols"]) if white_list or black_list: scaffolds_to_keep = self.get_filtered_entry_list( self.records.index, entry_black_list=black_list, entry_white_list=white_list) self.records = self.records[self.records.index.get_level_values( 'scaffold').isin(scaffolds_to_keep)] if self.scaffold_syn_dict: self.records["scaffold"].replace(self.scaffold_syn_dict, inplace=True) self.records.index = pd.MultiIndex.from_arrays( [self.records.index, np.arange(0, len(self.records))], names=("scaffold", "row")) print("%s\tReading input finished..." 
% str(datetime.datetime.now())) if parsing_mode in self.featuretype_parsing_modes: self.featuretype_list = list(self.records[["featuretype" ]].iloc[:, 0].unique()) if featuretype_separation: self.records = OrderedDict([ (featuretype, self.records[self.records["featuretype"] == featuretype]) for featuretype in self.featuretype_list ]) if parsing_mode in self.attributes_parsing_modes: retained_columns = deepcopy(self.parsing_parameters[self.format][ self.parsing_mode]["col_names"]) for entry in "attributes", "scaffold": retained_columns.remove(entry) if featuretype_separation and (parsing_mode in self.featuretype_parsing_modes): attributes_dict = self.parse_attributes() for featuretype in self.featuretype_list: #self.records[featuretype].columns = pd.MultiIndex.from_arrays([ # self.records[featuretype].columns, # self.records[featuretype].columns, # ]) self.records[featuretype] = pd.concat([ self.records[featuretype][retained_columns], attributes_dict[featuretype] ], axis=1) else: attributes = self.parse_attributes() #self.records.columns = pd.MultiIndex.from_arrays([ # self.records.columns, # self.records.columns, # ]) self.records = pd.concat( [self.records[retained_columns], attributes], axis=1) if sort: self.records.sort_values(by=["scaffold", "start", "end"]) def parse_column(self, column, param): #col.replace(self.default_replace_dict, inplace=True) if param not in self.converters: return column elif self.converters[param] == str: return column if self.converters[param] in self.pandas_int_type_correspondence: col = column.apply(self.pandas_int_type_correspondence[ self.converters[param]]).astype(self.converters[param]) else: col = column.apply(self.converters[param]) return col def parse_attributes(self): print("%s\tParsing attribute field..." % str(datetime.datetime.now())) if isinstance(self.records, (OrderedDict, dict)): tmp_attr_dict = OrderedDict() for entry in self.records: tmp_attr = map( lambda s: OrderedDict( map(lambda b: b.split("="), s.split(";"))), list(self.records[entry]["attributes"])) tmp_attr = pd.DataFrame(tmp_attr) shape = np.shape(tmp_attr) column_number = 1 if len(shape) == 1 else shape[1] #tmp_attr.columns = pd.MultiIndex.from_arrays([ # ["attributes"] * column_number, # tmp_attr.columns # ]) tmp_attr.index = self.records[entry].index tmp_attr_dict[entry] = tmp_attr print("%s\tParsing attribute field finished..." % str(datetime.datetime.now())) return tmp_attr_dict elif isinstance(self.records, (pd.DataFrame, )): tmp_attr = map( lambda s: OrderedDict(map(lambda b: b.split("="), s.split(";")) ), list(self.records["attributes"])) tmp_attr = pd.DataFrame(tmp_attr) shape = np.shape(tmp_attr) column_number = 1 if len(shape) == 1 else shape[1] #tmp_attr.columns = pd.MultiIndex.from_arrays([ # ["attributes"] * column_number, # tmp_attr.columns # ]) tmp_attr.index = self.records.index print("%s\tParsing attribute field finished..." % str(datetime.datetime.now())) return tmp_attr else: raise ValueError("ERROR!!! 
Unknown format of the records!") def get_attribute_names(self): if self.featuretype_separation: attributes_dict = OrderedDict() for feature in self.records: attributes_dict[feature] = list(self.records[feature][ AnnotationFormats.GFF_COLS["attributes"] - 1:]) # -1 is necessary as scaffold column is part of index return attributes_dict return list(self.records[AnnotationFormats.GFF_COLS["attributes"] - 1:] ) # -1 is necessary as scaffold column is part of index def total_length(self): return np.sum(self.records['end'] - self.records['start']) #def get_feature_length(self, output=None, featuretype_list=None): # # feature_records = self.records[self.records["featuretype"].isin(featuretype_list)] if featuretype_list else self.records def collapse_records(self, sort=True, verbose=True): """ strand-independent collapse :param sort: :param verbose: :return: """ if self.featuretype_separation: raise ValueError( "ERROR!!! Record collapse for parsing with feature separation was not implemented yet!" ) else: records_before_collapse = len(self.records) if sort: self.records.sort_values(by=["scaffold", "start", "end"]) row_list = [] for scaffold in self.scaffold_list: #print scaffold # check if there is only one record per scaffold, necessary as pandas will return interger instead of Series if len(self.records.loc[[scaffold]]) == 1: for row in self.records.loc[[scaffold ]].itertuples(index=True): row_list.append(list(row)) continue #print self.records.loc[scaffold] # remove nested records end_diff = self.records.loc[[scaffold]]['end'].diff() #print len(end_diff) end_diff[0] = 1 no_nested_records_df = self.records.loc[[scaffold ]][end_diff > 0] #print len(no_nested_records_df) # collapse overlapping records row_iterator = no_nested_records_df.itertuples(index=True) prev_row = list(row_iterator.readline()) for row in row_iterator: row_l = list(row) if row_l[self.record_start_col] > prev_row[ self.record_end_col]: row_list.append(prev_row) prev_row = row_l else: prev_row[self.record_end_col] = row_l[ self.record_end_col] row_list.append(prev_row) self.records = pd.DataFrame.from_records(row_list, columns=self.col_names, index=self.index_cols) if verbose: print( "Records before collapsing: %i\nRecords after collapsing: %i" % (records_before_collapse, len(self.records))) def remove_small_records(self, min_record_length): if self.featuretype_separation: raise ValueError( "ERROR!!! 
Removal of small records for parsing with feature separation " "was not implemented yet!") else: records_before_collapse = len(self.records) self.records = self.records[( self.records['end'] - self.records['start']) >= min_record_length] print("Records before filtering: %i\nRecords afterfiltering: %i" % (records_before_collapse, len(self.records))) def __add__(self, other): new_gff_record = CollectionGFF(records=pd.concat( [self.records, other.records]), in_file=None, format=self.format, parsing_mode=self.parsing_mode, black_list=self.black_list, white_list=self.white_list) new_gff_record.records = new_gff_record.records.sort_values( by=["scaffold", "start", "end"]) return new_gff_record def __radd__(self, other): new_gff_record = CollectionGFF(records=pd.concat( [other.records, self.records]), in_file=None, format=other.format, parsing_mode=other.parsing_mode, black_list=other.black_list, white_list=other.white_list) new_gff_record.records = new_gff_record.records.sort_values( by=["scaffold", "start", "end"]) return new_gff_record def sequence_generator(self, records, sequence_collection, expression=None): for entry in records.itertuples(): if expression: if not expression(entry): continue yield entry[self.record_id_col], sequence_collection[entry[ self.record_id_col]][ entry[self.record_start_col]:entry[self.record_end_col]] def get_introns(self, exon_feature="CDS", parent_id_field="Parent", id_field="ID", intron_id_prefix="intron", intron_id_digit_number=8): # TODO: CORRECT ERRORS FOR INTRONS IN - STRAND IN CASE WHEN EXONS ARE REVERSE SORTED BY COORDINATE if self.featuretype_separation: intron_index = 1 intron_id_template = "%s%%0%ii" % (intron_id_prefix, intron_id_digit_number) self.records["intron"] = self.records[exon_feature].copy(deep=True) self.records["intron"]["start"], self.records["intron"]["end"] = self.records["intron"]["end"], \ self.records["intron"]["start"].shift(periods=-1, fill_value=0) self.records["intron"].index = self.records[ "intron"].index.droplevel(level=1) self.records["intron"]["row"] = range(0, len(self.records["intron"])) self.records["intron"].set_index("row", append=True, inplace=True) self.records["intron"].drop(self.records["intron"].groupby( parent_id_field, sort=False).agg( {parent_id_field: 'count'})[parent_id_field].cumsum() - 1, level=1, inplace=True) intron_number = len(self.records["intron"]) self.records["intron"]["phase"] = 0 self.records["intron"]["featuretype"] = "intron" self.records["intron"][id_field] = [ intron_id_template % i for i in range(intron_index, intron_index + intron_number) ] self.records["intron"].index = self.records[ "intron"].index.droplevel(level=1) self.records["intron"]["row"] = range(0, len(self.records["intron"])) self.records["intron"].set_index("row", append=True, inplace=True) def write_introns(self, output): if "intron" in self.records: with open(output, "w") as out_fd: for row_tuple in self.records["intron"].copy( deep=True).reset_index(level="scaffold").itertuples( index=False): out_fd.write( "%s\t%i\t%i\t%s\t%i\t%s\n" % ("\t".join(row_tuple[:3]), row_tuple[3] + 1, row_tuple[4], "\t".join( row_tuple[5:7]), row_tuple[7], ";".join([ "%s=%s" % (self.records["intron"].columns[i], str(row_tuple[i])) for i in range( 8, len(self.records["intron"].columns)) ]))) else: raise ValueError("ERROR!!! 
No introns were found!") def write(self, output, output_format, source="custom", feature_type="region"): if self.format == "bed": if self.parsing_mode == "only_coordinates": if output_format == "bed": self.records.to_csv(output, sep="\t", index=True, header=False) elif output_format == "gff": entry_template = "%s\t%s\t%s\t%i\t%i\t.\t.\t.\t.\n" with open(output, "w") as out_fd: for record_tuple in self.records.reset_index( level=0).itertuples(index=False): out_fd.write( entry_template % (record_tuple[0], source, feature_type, record_tuple[1] + 1, record_tuple[2])) def extract_sequences_by_type(self, sequence_collection, record_type_black_list=[], record_type_white_list=[], return_type="collection", records_parsing_type="parse"): if self.parsing_mode in self.featuretype_parsing_modes: if return_type == "collection": selected_records = self.records[ self.records.index.isin(record_type_white_list, level=1) & (~self.records.index.isin(record_type_black_list, level=1) )] from RouToolPa.Parsers.Sequence import CollectionSequence extracted_records = CollectionSequence() else: pass
        tx_state_code.append(seseds["StateCode"][i])
        tx_year.append(seseds["Year"][i])
        tx_data.append(seseds["Data"][i])

az_comp_data = OrderedDict()
ca_comp_data = OrderedDict()
nm_comp_data = OrderedDict()
tx_comp_data = OrderedDict()

item_dict = OrderedDict()
item_dict["MSN"] = az_msn
item_dict["StateCode"] = az_state_code
item_dict["Year"] = az_year
item_dict["Data"] = az_data
az_comp_data = pd.DataFrame(item_dict)
az_comp_data.to_csv(
    "C:\\Users\\THINKPAD\\PycharmProjects\\MCM-ICM-2018-Problem-C\\code\\PCR\\az_data.csv",
    index=False, index_label=False, sep=',')

item_dict["MSN"] = ca_msn
item_dict["StateCode"] = ca_state_code
item_dict["Year"] = ca_year
item_dict["Data"] = ca_data
ca_comp_data = pd.DataFrame(item_dict)
ca_comp_data.to_csv(
    "C:\\Users\\THINKPAD\\PycharmProjects\\MCM-ICM-2018-Problem-C\\code\\PCR\\ca_data.csv",
    index=False, index_label=False, sep=',')

item_dict["MSN"] = nm_msn
item_dict["StateCode"] = nm_state_code
item_dict["Year"] = nm_year
tblOut['ext_arm_len'].append(abs(ea.end - ea.start) + 1)
tblOut['lig_arm_len'].append(abs(la.end - la.start) + 1)

tblOut['ext_nmerfreq'].append(ea.arm_mean_kmer_freq)
tblOut['lig_nmerfreq'].append(la.arm_mean_kmer_freq)

tblOut['ext_tm'].append(ea.arm_tm)
tblOut['lig_tm'].append(la.arm_tm)

tblOut['ext_mrfAndBwa_exact'].append(eah.num_exact_hits)
tblOut['ext_mrfAndBwa_close'].append(eah.num_close_hits)
tblOut['ext_mrfAndBwa_all'].append(eah.num_all_hits)

tblOut['lig_mrfAndBwa_exact'].append(lah.num_exact_hits)
tblOut['lig_mrfAndBwa_close'].append(lah.num_close_hits)
tblOut['lig_mrfAndBwa_all'].append(lah.num_all_hits)

tblOut['ext_gc'].append(ea.arm_gc)
tblOut['lig_gc'].append(la.arm_gc)

tblOut = pd.DataFrame(tblOut)
tblOut.to_csv(o.mipTableOut, sep='\t', index=False)
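# A minimal, runnable sketch of the collect-then-write pattern used above:
# fill per-column lists row by row, build the DataFrame once, then write a TSV.
# Column names and values are illustrative only, not taken from the MIP pipeline.
from collections import OrderedDict
import pandas as pd

tbl = OrderedDict((col, []) for col in ('ext_arm_len', 'lig_arm_len', 'ext_gc'))
for ext_len, lig_len, gc in [(25, 22, 0.48), (27, 21, 0.52)]:
    tbl['ext_arm_len'].append(ext_len)
    tbl['lig_arm_len'].append(lig_len)
    tbl['ext_gc'].append(gc)
pd.DataFrame(tbl).to_csv('demo_mip_table.tsv', sep='\t', index=False)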
def analyse_all_positions(): session = db.Session() gameFrames = {} # replayid:[(start,end),..] replays = session.query(db.Replay).all() for replay in replays: frames = get_game_frames(replay) gameFrames[replay.id] = frames # print(gameFrames) # count = session.query(db.Player.rplayer_id, sq.sql.func.count('*').label('replay_count')).group_by(db.Player.rplayer_id).subquery() # count_positions = session.query(db.Position, db.Player.rplayer_id, sq.sql.func.count('*').label('positions')).join(db.Player).group_by(db.Player.rplayer_id).subquery() # count_game_positions = session.query(db.Position, db.Player.rplayer_id, sq.sql.func.count('*').label('game_positions')).join(db.Player).filter(sq.or_(sq.and_(sq.or_(db.Position.frameNo.between(frames[0],frames[1]) for frames in gameFrames[replayid]),db.Position.replay_id==replayid) for replayid in gameFrames)).group_by(db.Player.rplayer_id).subquery() # count_half_attacking = session.query(db.Position, db.Player.rplayer_id, sq.sql.func.count('*').label('attacking_half')).join(db.Player).filter(db.Position.y>0).group_by(db.Player.rplayer_id).subquery() # count_half_defending = session.query(db.Position, db.Player.rplayer_id, sq.sql.func.count('*').label('defending_half')).join(db.Player).filter(db.Position.y<0).group_by(db.Player.rplayer_id).subquery() print('hi') a = time.time() rplayers = session.query(db.RPlayer).all() # i=0 for rplayer in tqdm.tqdm(rplayers, leave=True): # i += 1 # if i>2: break rpid = rplayer.id # print('\n',rid) pids, rids = zip(*session.query(db.Player.id, db.Player.replay_id).filter(db.Player.rplayer_id == rpid).all()) print(pids, rids) _gameFrames = {} for rid in rids: _gameFrames[rid] = gameFrames[rid] # limit to 5 games _temp_gameFrames = {} i = 0 for rid in rids: if i > 5: break _temp_gameFrames[rid] = gameFrames[rid] i += 1 print(_temp_gameFrames) p = session.query( db.Position, db.BallFrame.x, db.BallFrame.y, db.BallFrame.z, db.Team.colour, # db.Velocity.speed )\ .filter(db.Position.player_id.in_(pids))\ .filter(sq.or_(sq.and_(sq.or_(db.Position.frameNo.between(frames[0], frames[1]) for frames in _temp_gameFrames[replayid]), db.Position.replay_id == replayid) for replayid in _temp_gameFrames))\ .join(db.Player).join(db.Team)\ .join(db.BallFrame, db.BallFrame.frame_id == db.Position.frame_id)\ # .join(db.Velocity, sq.and_(db.Position.frame_id==db.Velocity.frame_id,db.Position.player_id==db.Velocity.player_id))\ # velocity join takes a long time. 
# TESTING ABOVE # p = session.query( # db.Position, # db.BallFrame.x, # db.BallFrame.y, # db.BallFrame.z, # db.Team.colour, # db.Velocity.speed # )\ # .filter(db.Position.player_id.in_(pids))\ # .filter(sq.or_(sq.and_(sq.or_(db.Position.frameNo.between(frames[0],frames[1]) for frames in _gameFrames[replayid]),db.Position.replay_id==replayid) for replayid in _gameFrames))\ # .join(db.Velocity, sq.and_(db.Position.frame_id==db.Velocity.frame_id,db.Position.player_id==db.Velocity.player_id))\ # .join(db.Player).join(db.Team)\ # .join(db.BallFrame,db.BallFrame.frame_id==db.Position.frame_id)\ # END REAL PART # p = session.query(db.Position,db.BallFrame.x,db.BallFrame.y,db.BallFrame.z,db.Team.colour)\ # .filter(db.Position.player_id.in_(pids))\ # .filter(sq.or_(sq.and_(sq.or_(db.Position.frameNo.between(frames[0],frames[1]) for frames in _gameFrames[replayid]),db.Position.replay_id==replayid) for replayid in _gameFrames))\ # .join(db.Player).join(db.Team)\ # .join(db.BallFrame,db.BallFrame.frame_id==db.Position.frame_id)\ positions = pd.read_sql_query( p.selectable, db.engine ) # print(positions.columns.values) # print('\n\n\n') # print(positions[:5]) # print('\n\n\n') # print(positions.describe()) print(positions, "!!!!!!!") _positions1 = analyse_position(positions) _positions2 = analyse_position_velocity(positions) # _positions = {**(_positions1), **_positions2} _positions = OrderedDict() for key, value in _positions1.items(): _positions[key] = value for key, value in _positions2.items(): _positions[key] = value # print(_positions) try: for key, value in _positions.items(): positional_analysis[key].append(value) positional_analysis['name'].append(rplayer.name) positional_analysis['team'].append(rplayer.rteam.name) positional_analysis['games'].append(len(pids)) except UnboundLocalError: positional_analysis = OrderedDict() positional_analysis['name'] = [rplayer.name] positional_analysis['team'] = [rplayer.rteam.name] positional_analysis['games'] = [len(pids)] for key, value in _positions.items(): positional_analysis[key] = [value] # print(positional_analysis) positional_analysis = pd.DataFrame.from_dict(positional_analysis) # print(positional_analysis) # replay = pd.read_sql_query( # x.selectable, # db.engine # ) # replay.columns = replay.columns.str.replace('^anon_[0-9]+_','') print('duration:', int(time.time() - a)) with open("all_player_position_analysis1.txt", 'w') as f: positional_analysis.to_csv(f, index=False) print('done')
for i in variable1:
    if i not in variable:
        variable.append(i)

tx_data = OrderedDict()
for i in range(0, len(variable)):
    tx_data[variable[i]] = np.zeros(50)

for i in range(0, 50):
    for j in range(len(data)):
        if int(data["Year"][j]) - 1960 == i:
            if data["MSN"][j] in variable:
                tx_data[data["MSN"][j]][i] = data["Data"][j]

year = []
tx_comp_data = OrderedDict()
tx_comp_data = pd.DataFrame(tx_data)
tx_comp_data.to_csv("tx_data_by_year_original.csv",
                    index=False, index_label=False, sep=',')

# Standardise every series except TEGDS and Year (z-score per column)
for i in variable:
    if i != "TEGDS" and i != "Year":
        mean = np.mean(tx_data[i])
        std = np.std(tx_data[i])
        if std != 0:
            for j in range(len(tx_data[i])):
                tx_data[i][j] = (tx_data[i][j] - mean) / std

tx_comp_data = pd.DataFrame(tx_data)
tx_comp_data.to_csv("tx_data_by_year.csv",
                    index=False, index_label=False, sep=',')
def get_stats_from_coverage_file_stream_version(self, coverage_file, output_prefix, verbose=True, scaffold_column=0, coverage_column=1, separator="\t", buffering=None): stats = OrderedDict() summary_stats = OrderedDict() with self.metaopen(coverage_file, "r", buffering=buffering) as in_fd: line_list = in_fd.readline().strip().split(separator) scaffold, coverage = line_list[scaffold_column], int(line_list[coverage_column]) coverage_dict = OrderedDict([(coverage, 1)]) summary_coverage_dict = OrderedDict([(coverage, 1)]) current_scaffold = scaffold line_counter = 1 for line in in_fd: line_list = line.strip().split(separator) scaffold, coverage = line_list[scaffold_column], int(line_list[coverage_column]) if coverage in summary_coverage_dict: summary_coverage_dict[coverage] += 1 else: summary_coverage_dict[coverage] = 1 line_counter += 1 if line_counter % 1000000 == 0: print("%s\tProcessed %i lines" % (str(datetime.datetime.now()), line_counter)) if scaffold != current_scaffold: #print(scaffold) print("%s\tCalculating stats for %s" % (str(datetime.datetime.now()), current_scaffold)) stats[current_scaffold] = [sum(list(coverage_dict.values())), min(list(coverage_dict.keys())), max(list(coverage_dict.keys())), self.mean_from_dict(coverage_dict), self.median_from_dict(coverage_dict)] coverage_dict = OrderedDict([(coverage, 1)]) current_scaffold = scaffold else: if coverage in coverage_dict: coverage_dict[coverage] += 1 else: coverage_dict[coverage] = 1 else: #print("END") #print(scaffold) stats[current_scaffold] = [sum(list(coverage_dict.values())), min(list(coverage_dict.keys())), max(list(coverage_dict.keys())), self.mean_from_dict(coverage_dict), self.median_from_dict(coverage_dict)] summary_stats["all"] = [sum(list(summary_coverage_dict.values())), min(list(summary_coverage_dict.keys())), max(list(summary_coverage_dict.keys())), self.mean_from_dict(summary_coverage_dict), self.median_from_dict(summary_coverage_dict)] #print(stats) stats = pd.DataFrame.from_dict(stats, orient="index", columns=["length", "min", "max", "mean", "median"]) summary_stats = pd.DataFrame.from_dict(summary_stats, orient="index", columns=["length", "min", "max", "mean", "median"]) stats.to_csv("%s.per_scaffold.stat" % output_prefix, sep="\t", index_label="#scaffold") summary_stats.to_csv("%s.all.stat" % output_prefix, sep="\t", index_label="#scaffold") if verbose: print(stats)
y_final_pred = classifier.predict(x_testdata)

t = testing_data['PERID']
test = []
yp = []
for i in range(0, 11430):
    test.append(t[i])
    yp.append(y_final_pred[i])

from collections import OrderedDict
df = OrderedDict([('PERID', test), ('Criminal', yp)])
df = pd.DataFrame.from_dict(df)
df.to_csv('first.csv', index=False)

# n_estimators = 300, max_depth = 3, min_child_weight = 3
# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
parameters = [{'learning_rate': [0.1, 0.2, 0.3]}]
grid_search = GridSearchCV(estimator=classifier,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=10,
                           return_train_score=False)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
def get_coverage_stats_in_windows(self, coverage_file, window_size, output_prefix, window_step=None, buffering=None): win_step = window_size if window_step is None else window_step stats = [] per_scaffold_stats = OrderedDict() coverage_dict = OrderedDict() summary_stats = OrderedDict() total_length = 0 with self.metaopen(coverage_file, "r", buffering=buffering) as in_fd: prev_scaffold, start, end, coverage = in_fd.readline().strip( ).split() coverage_list = [int(coverage)] * (int(end) - int(start)) for line in in_fd: current_scaffold, start, end, coverage = line.strip().split() if current_scaffold == prev_scaffold: coverage_list += [int(coverage)] * (int(end) - int(start)) else: scaffold_length = len(coverage_list) if scaffold_length >= window_size: number_of_windows = int( (scaffold_length - window_size) / win_step) + 1 for i in range(0, number_of_windows): win_start = i * win_step window_coverage_list = coverage_list[ win_start:win_start + window_size] uncovered = window_coverage_list.count(0) stats.append([ prev_scaffold, scaffold_length, i, np.mean(window_coverage_list), np.median(window_coverage_list), np.min(window_coverage_list), np.max(window_coverage_list), uncovered, float(uncovered) / float(window_size) ], ) coverage_array, count_array = np.unique(coverage_list, return_counts=True) for i in range(0, len(coverage_array)): if coverage_array[i] in coverage_dict: coverage_dict[coverage_array[i]] += count_array[i] else: coverage_dict[coverage_array[i]] = count_array[i] per_scaffold_stats[prev_scaffold] = [ scaffold_length, min(coverage_list), max(coverage_list), np.mean(coverage_list), np.median(coverage_list) ] prev_scaffold = current_scaffold coverage_list = [int(coverage)] * (int(end) - int(start)) total_length += scaffold_length scaffold_length = len(coverage_list) total_length += scaffold_length if scaffold_length >= window_size: number_of_windows = int( (scaffold_length - window_size) / win_step) + 1 for i in range(0, number_of_windows): win_start = i * win_step window_coverage_list = coverage_list[win_start:win_start + window_size] uncovered = window_coverage_list.count(0) stats.append([ prev_scaffold, scaffold_length, i, np.mean(window_coverage_list), np.median(window_coverage_list), np.min(window_coverage_list), np.max(window_coverage_list), uncovered, float(uncovered) / float(window_size) ], ) per_scaffold_stats[prev_scaffold] = [ scaffold_length, min(coverage_list), max(coverage_list), np.mean(coverage_list), np.median(coverage_list) ] coverage_array, count_array = np.unique(coverage_list, return_counts=True) for i in range(0, len(coverage_array)): if coverage_array[i] in coverage_dict: coverage_dict[coverage_array[i]] += count_array[i] else: coverage_dict[coverage_array[i]] = count_array[i] stats = pd.DataFrame.from_records( stats, index=("scaffold", "window"), columns=("scaffold", "scaffold_length", "window", "mean", "median", "min", "max", "uncovered", "uncovered,fraction")) summary_stats["all"] = [ total_length, min(list(coverage_dict.keys())), max(list(coverage_dict.keys())), self.mean_from_dict(coverage_dict), self.median_from_dict(coverage_dict) ] summary_stats = pd.DataFrame.from_dict( summary_stats, orient="index", columns=["length", "min", "max", "mean", "median"]) per_scaffold_stats = pd.DataFrame.from_dict( per_scaffold_stats, orient="index", columns=["length", "min", "max", "mean", "median"]) stats.to_csv("{0}.win{1}.step{2}.stat".format(output_prefix, window_size, win_step), sep="\t", header=True, index=True) summary_stats.to_csv("%s.all.stat" % 
output_prefix, sep="\t", index_label="#scaffold") per_scaffold_stats.to_csv("%s.per_scaffold.stat" % output_prefix, sep="\t", index_label="#scaffold")
te_data = OrderedDict()
item_dict = OrderedDict()
item_dict["MSN"] = te_msn
item_dict["Description"] = te_description
item_dict["Unit"] = te_unit
te_data = pd.DataFrame(item_dict)

tn_data = OrderedDict()
item_dict = OrderedDict()
item_dict["MSN"] = tn_msn
item_dict["Description"] = tn_description
item_dict["Unit"] = tn_unit
tn_data = pd.DataFrame(item_dict)

# data_frame.to_csv("C:\\Users\\THINKPAD\\PycharmProjects\\MCM-ICM2018\\data\\test.csv",
#                   index=False, index_label=False, sep=',')
comp_data.to_csv("data/csv/total_sector.csv", index=False, index_label=False, sep=',')
tn_data.to_csv("data/csv/tn_sector.csv", index=False, index_label=False, sep=',')
te_data.to_csv("data/csv/te_sector.csv", index=False, index_label=False, sep=',')

print(comp_data)
print(tn_data)
print(te_data)
b_names = [
    'TRAIN_TT', 'SM_TT', 'CAR_TT', 'TRAIN_HE', 'SM_HE', 'SM_SEATS',
    'TRAIN_ONE', 'SM_ONE'
]
b_mean = OrderedDict(zip(b_names, b.mean(0)))
b_std = OrderedDict(zip(b_names, b.std(0)))
b_max = OrderedDict(zip(b_names, b.max(0)[0]))
b_min = OrderedDict(zip(b_names, b.min(0)[0]))

b_mean = bToDataFrame(b_mean)
b_std = bToDataFrame(b_std)
b_max = bToDataFrame(b_max)
b_min = bToDataFrame(b_min)

b_mean.to_csv(args.result_path + "/" + "b_mean.csv")
b_std.to_csv(args.result_path + "/" + "b_std.csv")
b_max.to_csv(args.result_path + "/" + "b_max.csv")
b_min.to_csv(args.result_path + "/" + "b_min.csv")

print "\nb_mean"
print b_mean
print "\nb_std"
print b_std
print "\nb_max"
print b_max
print "\nb_min"
print b_min
file_in = sys.argv[1]
file_out1 = sys.argv[2]
file_out2 = sys.argv[3]
file_out3 = sys.argv[4]

K = 5

df = pd.read_parquet(file_in)
rand_idps = np.random.randint(0, df.shape[1] - 1, size=K) + 1  # the offset is the indiv col
pheno_names = []

out = OrderedDict()
out['indiv'] = df.indiv
for i, k in enumerate(list(rand_idps)):
    name = '{}_{}'.format(i, df.columns[k])
    tmp = df.iloc[:, k].values
    tmp = tmp - tmp.mean()
    out[name] = tmp + np.random.normal(scale=4 * tmp.std(), size=df.shape[0]) \
        + np.random.randint(-5, 5, size=1)
    name = '{}_null'.format(i)
    out[name] = np.random.normal(scale=4 * tmp.std(), size=df.shape[0]) \
        + np.random.randint(-5, 5, size=1)

out = pd.DataFrame(out)
out.to_csv(file_out1, index=False)

with open(file_out2, 'w') as f:
    yaml.dump({k: 'linear_regression' for k in out.keys()[1:]}, f)
with open(file_out3, 'w') as f:
    yaml.dump({k: 'susie' for k in out.keys()[1:]}, f)
                             verbose=not args.Q)
read_stats['tag'] = tag
base_stats = read_stats['base_stats']
precision_stats = read_stats['read_stats']

base_stats_qc(base_stats, plotter)
modes = read_precision_qc(precision_stats, plotter)
plotter.close()

global_stats = OrderedDict([
    ('Accuracy', [read_stats['base_stats']['accuracy']]),
    ('AccuracyMode', modes['accuracy_mode']),
    ('Identity', [read_stats['base_stats']['identity']]),
    ('IdentityMode', modes['identity_mode']),
    ('Mapped', [read_stats['mapped']]),
    ('Unmapped', [read_stats['unmapped']]),
    ('Tag', [read_stats['tag']]),
])
global_stats = pd.DataFrame(global_stats)

if args.g is not None:
    global_stats.to_csv(args.g, sep="\t", index=False)

if args.l is not None:
    read_df = pd.DataFrame(precision_stats)
    read_df.to_csv(args.l, sep="\t", index=False)

if args.p is not None:
    misc.pickle_dump(read_stats, args.p)
] for o in onlyfileszn: if '.csv' not in o: i2 = onlyfileszn.index(o) path5 = path4 + '/biextracted/' + str(i2) zip2 = zipfile.ZipFile(path4 + '/' + o) zip2.extractall(path5) #inside_path5 = [iif for iif in listdir(path5) if isfile(join(path5, iif))] inside_path5 = zip2.namelist() T1 = [di for di in inside_path5 if 'T1' in di] ### get date and pod for T in T1: dt = datetime.datetime(y, int(T[2:4]), int(T[5:7])) pod = T[T.find('_') + 1:T.find('.csv')] s = zip2.read(T) # t1df = pd.read_csv(path5 + '/' + T1, sep = ';', dtype = object) todiz = [pod, dt, s[548:741]] # todiz.extend(Aggregator(t1df).tolist()) df[count] = todiz count += 1 #### http://stackoverflow.com/questions/303200/how-do-i-remove-delete-a-folder-that-is-not-empty-with-python zip2.close() shutil.rmtree(path5) zip_ref.close() shutil.rmtree(path4) df = pd.DataFrame.from_dict(df, orient='index') df.to_csv('Hdatabase_' + str(y), sep=';') del df #### copy all files into a new directory and then operate in the new directory #### https://docs.python.org/2/library/shutil.html
        tx_state_code.append(seseds["StateCode"][i])
        tx_year.append(seseds["Year"][i])
        tx_data.append(seseds["Data"][i])

az_comp_data = OrderedDict()
ca_comp_data = OrderedDict()
nm_comp_data = OrderedDict()
tx_comp_data = OrderedDict()

item_dict = OrderedDict()
item_dict["MSN"] = az_msn
item_dict["StateCode"] = az_state_code
item_dict["Year"] = az_year
item_dict["Data"] = az_data
az_comp_data = pd.DataFrame(item_dict)
az_comp_data.to_csv(
    "C:/Users/THINKPAD/PycharmProjects/MCM-ICM-2018-Problem-C/data/csv/az_data.csv",
    index=False, index_label=False, sep=',')

item_dict["MSN"] = ca_msn
item_dict["StateCode"] = ca_state_code
item_dict["Year"] = ca_year
item_dict["Data"] = ca_data
ca_comp_data = pd.DataFrame(item_dict)
ca_comp_data.to_csv(
    "C:/Users/THINKPAD/PycharmProjects/MCM-ICM-2018-Problem-C/data/csv/ca_data.csv",
    index=False, index_label=False, sep=',')

item_dict["MSN"] = nm_msn
item_dict["StateCode"] = nm_state_code
item_dict["Year"] = nm_year
dt = datetime.datetime(y, int(T[2:4]), int(T[5:7])) pod = T[T.find('_') + 1:T.find('.csv')] s = zip2.read(T) # t1df = pd.read_csv(path5 + '/' + T1, sep = ';', dtype = object) todiz = [pod, dt, s[548:741]] # todiz.extend(Aggregator(t1df).tolist()) df[count] = todiz count += 1 #### http://stackoverflow.com/questions/303200/how-do-i-remove-delete-a-folder-that-is-not-empty-with-python zip2.close() shutil.rmtree(path5) zip_ref.close() shutil.rmtree(path4) df = pd.DataFrame.from_dict(df, orient='index') df.to_csv('Hdatabase_' + str(y), sep=';') del df #### copy all files into a new directory and then operate in the new directory #### https://docs.python.org/2/library/shutil.html ############################################################################### extracter = 'C:/Users/d_floriello/Desktop/tbe2' extracter = 'H:/Energy Management/02. EDM/01. MISURE/3. DISTRIBUTORI/ENEL Distribuzione S.p.A/2017/2017-03/giornalieri/csv' onlyfiles = [f for f in listdir(extracter) if isfile(join(extracter, f))] for of in onlyfiles: if '.zip' in of: zip_ref = zipfile.ZipFile(extracter + '/' + of) zip_ref.extractall(extracter) zip_ref.close()
    max_val = (2 * d - 1) * np.max(
        [(2 * d - 1) * m for m, n in zip(metrics[col], metrics['Model'])
         if n != 'Train'])
    metrics[col] = [
        str(x) if x != max_val or n == 'Train' else bf_pattern.format(x)
        for x, n in zip(metrics[col], metrics['Model'])
    ]

for col in targets[::-1]:
    metrics[col] = [
        it_pattern.format(x) if n == 'Train' else x
        for x, n in zip(metrics[col], metrics['Model'])
    ]

metrics = metrics.round(config.precision)
if config.extension == 'csv':
    metrics.to_csv(config.output, index=None)
elif config.extension == 'html':
    html = metrics.to_html(index=None)
    # undo the escaping applied by to_html to the injected markup
    html = re.sub('&lt;', '<', html)
    html = re.sub('&gt;', '>', html)
    header, footer = html.split('</thead>')
    header += '</thead>'
    header = header.split('\n')
    values = [x.strip()[4:-5] for x in header[3:-2]]
    spans = ['rowspan' if '/' not in x else 'colspan' for x in values]
    first_header = [x.split('/')[0] for x in values]
    second_header = [x.split('/')[1] for x in values if '/' in x]
    new_header = header[:3]
    i = 0
    total = 0
    while i < len(first_header):
               repeat=REPEAT,
               )
    col[key] = 1000 * min(t)
table["NumPy"] = pd.Series(col)

final = table
func_list = list(funcs.keys())
table = pd.DataFrame(final)
table = table.reindex(table.mean(1).sort_values().index)
order = np.log(table).mean().sort_values().index
table = table.T
table = table.reindex(order, axis=0)
table = table.reindex(func_list, axis=1)
table = 1000000 * table / (SIZE * NUMBER)
table.index.name = "Bit Gen"
print(table.to_csv(float_format="%0.1f"))

try:
    from tabulate import tabulate

    perf = table.applymap(lambda v: "{0:0.1f}".format(v))
    print(tabulate(perf, headers="keys", tablefmt="rst"))
except ImportError:
    pass

table = table.T
rel = table.loc[:, ["NumPy"]].values @ np.ones((1, table.shape[1])) / table
rel.pop("NumPy")
rel = rel.T
rel["Overall"] = np.exp(np.log(rel).mean(1))
rel *= 100
opts.add_argument('--mipGenOut', default=None, dest='mipGenOut')
o = opts.parse_args()

pairStore = pd.HDFStore(o.inStorePairs, 'r')
tblPairs = pairStore[o.tablePairs]
(tblnameExtArm, tblnameLigArm) = pairStore.get_storer(
    o.tablePairs).attrs['mippipe__ext_and_lig_arm_tables']

armStore = pd.HDFStore(o.inStoreArms, 'r')
tblArmExt = armStore[tblnameExtArm]
tblArmLig = armStore[tblnameLigArm]

tblOut = OrderedDict([(col, []) for col in [
    'mip_key', 'chr', 'ext_probe_start', 'ext_probe_stop',
    'lig_probe_start', 'lig_probe_stop'
]])

for i, r in tblPairs.iterrows():
    tblOut['mip_key'].append(int(i))
    tblOut['chr'].append(r.chrom)
    # .loc replaces the removed DataFrame.ix indexer
    tblOut['ext_probe_start'].append(tblArmExt.loc[r.extarmidx, 'start'])
    tblOut['ext_probe_stop'].append(tblArmExt.loc[r.extarmidx, 'end'])
    tblOut['lig_probe_start'].append(tblArmLig.loc[r.ligarmidx, 'start'])
    tblOut['lig_probe_stop'].append(tblArmLig.loc[r.ligarmidx, 'end'])

tblOut = pd.DataFrame(tblOut)
tblOut.to_csv(o.mipGenOut, sep='\t', index=False)
###Extract specifications for all unique cars search_texts = ['Car', 'Power', 'Torque', 'Car type', 'Curb weight', 'Dimensions', 'Wheelbase', \ "Power / weight", 'Introduced', 'Origin country', 'Engine type', 'Displacement', \ "Power / liter", 'Transmission', 'Layout', 'Top speed', "1/4 mile", 'car URL'] #Make an ordered list of tuples to create a dictionary with ordered keys car_variables = [ ('Car',[]), ('Power', []), ('Torque', []), ('Type',[]), ('Weight', []), ('Dimensions', []), \ ('Wheelbase',[]), ('Power Per Weight', []), ('Year Model', []), ('Country', []), ('Engine Type', []), ('Displacement', []), \ ('Power Per Liter', []), ('Transmission', []), ('Layout', []), ('Top Speed', []), ('Quarter Mile Time', []), ('Car URL', [])] car_specs = OrderedDict(car_variables) for i in cars.index.values: #Go to each car specs HTML page car_url = "http://" + cars.iloc[i, 1] r = requests.get(car_url) soup = BeautifulSoup(r.text) #Extract each car specs car_specs['Car'].append(cars.iloc[i,0]) car_specs['Car URL'].append(car_url) for idx in range(1,3): #Extract power and torque info try: car_specs[car_specs.keys()[idx]].append(soup.find(text=search_texts[idx]).findNext('td').find('a').string) except: car_specs[car_specs.keys()[idx]].append(None) #Return null if blank for idx in range(3, len(car_variables)-1): #Extract info for everything else try: car_specs[car_specs.keys()[idx]].append(soup.find(text=search_texts[idx]).findNext('td').string) except: car_specs[car_specs.keys()[idx]].append(None) #Return null if blank #Transform the dictionary into a dataframe car_specs = pd.DataFrame(car_specs) #Save the dataframe into a CSV file car_specs.to_csv('cars_specifications.csv', index=False, encoding = 'UTF-8')
from numpy.random import RandomState
rg = RandomState()
"""

col = {}
for key in npfuncs:
    t = repeat(test.format(func=npfuncs[key]),
               setup.format(prng=prng().__class__.__name__),
               number=1, repeat=3)
    col[key] = 1000 * min(t)
table['RandomState'] = pd.Series(col)

table = pd.DataFrame(table)
table = table.reindex(table.mean(1).sort_values().index)
order = np.log(table).mean().sort_values().index
table = table.T
table = table.reindex(order)
table = table.T
table = table.reindex([k for k in funcs], axis=0)
print(table.to_csv(float_format='%0.1f'))

rel = table.loc[:, ['RandomState']].values @ np.ones(
    (1, table.shape[1])) / table
rel.pop('RandomState')
rel = rel.T
rel['Overall'] = np.exp(np.log(rel).mean(1))
rel *= 100
rel = np.round(rel)
rel = rel.T
print(rel.to_csv(float_format='%0d'))
    'xoroshiro128plus': 'xoroshiro128+',
    'xorshift1024': 'xorshift1024',
    'pcg64': 'PCG64',
    'mt19937': 'MT19937',
    'random': 'NumPy MT19937'
}
results.columns = [cols[c] for c in results]
results.index = [index[i] for i in results.index]
print(results)

from io import StringIO

sio = StringIO()
results.to_csv(sio)
sio.seek(0)
lines = sio.readlines()
for i, line in enumerate(lines):
    if i == 0:
        line = ' :header: ' + line
    else:
        line = ' ' + line
    lines[i] = line
lines.insert(1, ' \n')
lines.insert(1, ' :widths: 14,14,14,14,14,14,14,14\n')
lines.insert(0, '.. csv-table::\n')
print(''.join(lines))

std_results = (results.T / results.iloc[:, -3]).T
def reducer(key, values): # key : string of intermediary key # load return dict correspondning to mapper ouput. they need to be loaded. # DEBUG import mapreduce as GLOBAL output_permutations = GLOBAL.OUTPUT_PERMUTATIONS map_output = GLOBAL.MAP_OUTPUT output_path = GLOBAL.OUTPUT_PATH roi = GLOBAL.ROI BASE = os.path.join("/neurospin/brainomics/2014_deptms/results_enettv/", "MRI_" + roi, map_output) INPUT = BASE + "/%i/%s" OUTPUT = BASE + "/../" + output_path if not os.path.exists(OUTPUT): os.makedirs(OUTPUT) criteria = GLOBAL.CRITERIA keys = ['_'.join(str(e) for e in a) for a in criteria] OK = 0 # params = criteria = ['recall_mean', 'min_recall', 'max_pvalue_recall', # 'accuracy', 'pvalue_accuracy'] if not OK: for key in keys: print "key: ", key paths_CV_all = [INPUT % (perm, key) \ for perm in xrange(NFOLDS * NRNDPERMS)] idx_CV_blocks = range(0, (NFOLDS * NRNDPERMS) + NFOLDS, NFOLDS) recall_0_perms = np.zeros(NRNDPERMS) recall_1_perms = np.zeros(NRNDPERMS) recall_mean_perms = np.zeros(NRNDPERMS) accuracy_perms = np.zeros(NRNDPERMS) auc_perms = np.zeros(NRNDPERMS) crit = key[0:len(key):2] if not os.path.isfile(OUTPUT + \ "/perms_validation_" + crit + ".npz"): for perm in xrange(NRNDPERMS): print "perm: ", perm paths_CV_blocks = paths_CV_all[idx_CV_blocks[perm]:\ idx_CV_blocks[perm + 1]] values = [GLOBAL.OutputCollector(p) \ for p in paths_CV_blocks] values = [item.load() for item in values] y_true = [item["y_true"].ravel() for item in values] y_pred = [item["y_pred"].ravel() for item in values] prob_pred = [item["proba_pred"].ravel() for item in values] y_true = np.concatenate(y_true) y_pred = np.concatenate(y_pred) prob_pred = np.concatenate(prob_pred) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) auc = roc_auc_score(y_true, prob_pred) success = r * s success = success.astype('int') accuracy = (r[0] * s[0] + r[1] * s[1]) accuracy = accuracy.astype('int') recall_0_perms[perm] = r[0] recall_1_perms[perm] = r[1] recall_mean_perms[perm] = r.mean() accuracy_perms[perm] = accuracy / float(s[0] + s[1]) auc_perms[perm] = auc # END PERMS print "save", crit np.savez_compressed(OUTPUT + \ "/perms_validation_" + crit + ".npz", recall_0=recall_0_perms, recall_1=recall_1_perms, recall_mean=recall_mean_perms, accuracy=accuracy_perms, auc=auc_perms) OK = 1 #pvals if not os.path.isfile(os.path.join(OUTPUT, output_permutations)): print "Derive p-values" perms = dict() for i, key in enumerate(keys): print "crit: ", crit crit = key[0:len(key):2] perms[crit] = np.load(OUTPUT + \ "/perms_validation_" + crit + ".npz") print keys [recall_mean, min_recall, accuracy] = [keys[0][0:len(keys[0]):2], keys[1][0:len(keys[1]):2], keys[2][0:len(keys[2]):2]] print [recall_mean, min_recall, accuracy] # Read true scores true = pd.read_csv(os.path.join(BASE, "..", "results_dCV_validation.csv")) true_recall_mean = true[true.params == recall_mean].iloc[0] true_min_recall = true[true.params == min_recall].iloc[0] true_accuracy = true[true.params == accuracy].iloc[0] # pvals corrected for multiple comparisons nperms = float(len(perms[recall_mean]['recall_0'])) from collections import OrderedDict pvals = OrderedDict() #cond: criterion used to select the model pvals["cond"] = ['recall_mean'] * 5 + ['min_recall'] * 5 + \ ['accuracy'] * 5 #stat: statitics associated to the p-value pvals["stat"] = ['recall_0', 'recall_1', 'recall_mean', 'accuracy', 'auc'] * 3 pvals["pval"] = [ np.sum(perms[recall_mean]['recall_0'] > true_recall_mean["recall_0"]), np.sum(perms[recall_mean]['recall_1'] > 
true_recall_mean["recall_1"]), np.sum(perms[recall_mean]['recall_mean'] > true_recall_mean["recall_mean"]), np.sum(perms[recall_mean]['accuracy'] > true_recall_mean["accuracy"]), np.sum(perms[recall_mean]['auc'] > true_recall_mean["auc"]), np.sum(perms[min_recall]['recall_0'] > true_min_recall["recall_0"]), np.sum(perms[min_recall]['recall_1'] > true_min_recall["recall_1"]), np.sum(perms[min_recall]['recall_mean'] > true_min_recall["recall_mean"]), np.sum(perms[min_recall]['accuracy'] > true_min_recall["accuracy"]), np.sum(perms[min_recall]['auc'] > true_min_recall["auc"]), np.sum(perms[accuracy]['recall_0'] > true_accuracy["recall_0"]), np.sum(perms[accuracy]['recall_1'] > true_accuracy["recall_1"]), np.sum(perms[accuracy]['recall_mean'] > true_accuracy["recall_mean"]), np.sum(perms[accuracy]['accuracy'] > true_accuracy["accuracy"]), np.sum(perms[accuracy]['auc'] > true_accuracy["auc"])] pvals = pd.DataFrame(pvals) pvals["pval"] /= float(nperms) pvals.to_csv(os.path.join(OUTPUT, output_permutations), index=False) return {}
def reducer_(key, values):
    # key : string of intermediary key
    # load return dict corresponding to mapper output. they need to be loaded.
    # DEBUG
    import glob, mapreduce
    BASE = "/neurospin/brainomics/2013_adni/ADAS11-MCIc-CTL/rndperm"
    INPUT = BASE + "/%i/%s"
    OUTPUT = BASE + "/../results/rndperm"
    keys = ["0.001_0.3335_0.3335_0.333_-1", "0.001_0.5_0_0.5_-1",
            "0.001_0.5_0.5_0_-1", "0.001_1_0_0_-1"]
    for key in keys:
        #key = keys[0]
        paths_5cv_all = [INPUT % (perm, key)
                         for perm in xrange(NFOLDS * NRNDPERMS)]
        idx_5cv_blocks = range(0, (NFOLDS * NRNDPERMS) + NFOLDS, NFOLDS)
        cpt = 0
        qc = dict()
        r2_perms = np.zeros(NRNDPERMS)
        corr_perms = np.zeros(NRNDPERMS)
        r_bar_perms = np.zeros(NRNDPERMS)
        fleiss_kappa_stat_perms = np.zeros(NRNDPERMS)
        dice_bar_perms = np.zeros(NRNDPERMS)
        for perm_i in xrange(len(idx_5cv_blocks) - 1):
            paths_5cv = paths_5cv_all[idx_5cv_blocks[perm_i]:idx_5cv_blocks[perm_i + 1]]
            for p in paths_5cv:
                if os.path.exists(p) and not (p in qc):
                    if p in qc:
                        qc[p] += 1
                    else:
                        qc[p] = 1
                    cpt += 1
            #
            values = [mapreduce.OutputCollector(p) for p in paths_5cv]
            values = [item.load() for item in values]
            y_true = [item["y_true"].ravel() for item in values]
            y_pred = [item["y_pred"].ravel() for item in values]
            y_true = np.concatenate(y_true)
            y_pred = np.concatenate(y_pred)
            r2 = r2_score(y_true, y_pred)
            corr = np.corrcoef(y_true.ravel(), y_pred.ravel())[0, 1]
            betas = np.hstack([item["beta"] for item in values]).T
            #
            ## Compute beta similarity measures
            #
            # Correlation
            R = np.corrcoef(betas)
            R = R[np.triu_indices_from(R, 1)]
            # Fisher z-transformation / average
            z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
            # back-transform
            r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)
            #
            # threshold betas to compute fleiss_kappa and DICE
            try:
                betas_t = np.vstack([array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
                                     for i in xrange(betas.shape[0])])
                print "--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1))
                print np.allclose(np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1)),
                                  [0.99] * 5, rtol=0, atol=1e-02)
                #
                # Compute fleiss kappa statistics
                beta_signed = np.sign(betas_t)
                table = np.zeros((beta_signed.shape[1], 3))
                table[:, 0] = np.sum(beta_signed == 0, 0)
                table[:, 1] = np.sum(beta_signed == 1, 0)
                table[:, 2] = np.sum(beta_signed == -1, 0)
                fleiss_kappa_stat = fleiss_kappa(table)
                #
                # Pair-wise Dice coefficient
                beta_n0 = betas_t != 0
                ij = [[i, j] for i in xrange(5) for j in xrange(i + 1, 5)]
                #print [[idx[0], idx[1]] for idx in ij]
                dice_bar = np.mean([float(np.sum(beta_signed[idx[0], :] == beta_signed[idx[1], :])) /
                                    (np.sum(beta_n0[idx[0], :]) + np.sum(beta_n0[idx[1], :]))
                                    for idx in ij])
            except Exception:
                dice_bar = fleiss_kappa_stat = 0.
            #
            r2_perms[perm_i] = r2
            corr_perms[perm_i] = corr
            r_bar_perms[perm_i] = r_bar
            fleiss_kappa_stat_perms[perm_i] = fleiss_kappa_stat
            dice_bar_perms[perm_i] = dice_bar
        # END PERMS
        print "save", key
        np.savez_compressed(OUTPUT + "/perms_" + key + ".npz",
                            r2=r2_perms, corr=corr_perms, r_bar=r_bar_perms,
                            fleiss_kappa=fleiss_kappa_stat_perms, dice_bar=dice_bar_perms)

    # Plot the permutation (null) distributions for each key
    perms = dict()
    fig, axis = plt.subplots(len(keys), 4)  # , sharex='col')
    for i, key in enumerate(keys):
        perms[key] = np.load(OUTPUT + "/perms_" + key + ".npz")
        n, bins, patches = axis[i, 0].hist(perms[key]['r2'], 50, normed=1, histtype='stepfilled')
        axis[i, 0].set_title(key + "_r2")
        n, bins, patches = axis[i, 1].hist(perms[key]['r_bar'], 50, normed=1, histtype='stepfilled')
        axis[i, 1].set_title(key + "_r_bar")
        n, bins, patches = axis[i, 2].hist(perms[key]['fleiss_kappa'], 50, histtype='stepfilled')
        axis[i, 2].set_title(key + "_fleiss_kappa")
        n, bins, patches = axis[i, 3].hist(perms[key]['dice_bar'], 50)  # , 50, normed=1, histtype='stepfilled')
        axis[i, 3].set_title(key + "_dice_bar")
    plt.show()

    l1l2tv, l1tv, l1l2, l1 = ["0.001_0.3335_0.3335_0.333_-1", "0.001_0.5_0_0.5_-1",
                              "0.001_0.5_0.5_0_-1", "0.001_1_0_0_-1"]
    # Read true scores
    import pandas as pd
    true = pd.read_csv(os.path.join(BASE, "..", "ADAS11-MCIc-CTL.csv"))
    true = true[true.a == 0.001]
    true_l1l2tv = true[true.l1 == 0.3335].iloc[0]
    true_l1l2 = true[(true.l1 == 0.5) & (true.l2 == 0.5)].iloc[0]
    true_l1tv = true[(true.l1 == 0.5) & (true.tv == 0.5)].iloc[0]
    true_l1 = true[(true.l1 == 1.)].iloc[0]

    # pvals
    nperms = float(len(perms[l1]['r2']))
    from collections import OrderedDict
    pvals = OrderedDict()
    pvals["cond"] = ['l1', 'l1tv', 'l1l2', 'l1l2tv'] * 4 + \
        ['l1 vs l1tv'] * 4 + ['l1l2 vs l1l2tv'] * 4
    pvals["stat"] = ['r2'] * 4 + ['r_bar'] * 4 + ['fleiss_kappa'] * 4 + ['dice_bar'] * 4 + \
        ['r2', 'r_bar', 'fleiss_kappa', 'dice_bar'] * 2
    pvals["pval"] = [
        np.sum(perms[l1]['r2'] > true_l1["r2"]),
        np.sum(perms[l1tv]['r2'] > true_l1tv["r2"]),
        np.sum(perms[l1l2]['r2'] > true_l1l2["r2"]),
        np.sum(perms[l1l2tv]['r2'] > true_l1l2tv["r2"]),

        np.sum(perms[l1]['r_bar'] > true_l1["beta_r_bar"]),
        np.sum(perms[l1tv]['r_bar'] > true_l1tv["beta_r_bar"]),
        np.sum(perms[l1l2]['r_bar'] > true_l1l2["beta_r_bar"]),
        np.sum(perms[l1l2tv]['r_bar'] > true_l1l2tv["beta_r_bar"]),

        np.sum(perms[l1]['fleiss_kappa'] > true_l1["beta_fleiss_kappa"]),
        np.sum(perms[l1tv]['fleiss_kappa'] > true_l1tv["beta_fleiss_kappa"]),
        np.sum(perms[l1l2]['fleiss_kappa'] > true_l1l2["beta_fleiss_kappa"]),
        np.sum(perms[l1l2tv]['fleiss_kappa'] > true_l1l2tv["beta_fleiss_kappa"]),

        np.sum(perms[l1]['dice_bar'] > true_l1["beta_dice_bar"]),
        np.sum(perms[l1tv]['dice_bar'] > true_l1tv["beta_dice_bar"]),
        np.sum(perms[l1l2]['dice_bar'] > true_l1l2["beta_dice_bar"]),
        np.sum(perms[l1l2tv]['dice_bar'] > true_l1l2tv["beta_dice_bar"]),
        # l1 vs l1tv
        np.sum((perms[l1tv]['r2'] - perms[l1]['r2']) > (true_l1tv["r2"] - true_l1["r2"])),
        np.sum((perms[l1tv]['r_bar'] - perms[l1]['r_bar']) >
               (true_l1tv["beta_r_bar"] - true_l1["beta_r_bar"])),
        np.sum((perms[l1tv]['fleiss_kappa'] - perms[l1]['fleiss_kappa']) >
               (true_l1tv["beta_fleiss_kappa"] - true_l1["beta_fleiss_kappa"])),
        np.sum((perms[l1tv]['dice_bar'] - perms[l1]['dice_bar']) >
               (true_l1tv["beta_dice_bar"] - true_l1["beta_dice_bar"])),
        # l1l2 vs l1l2tv
        np.sum((perms[l1l2]['r2'] - perms[l1l2tv]['r2']) > (true_l1l2["r2"] - true_l1l2tv["r2"])),
        np.sum((perms[l1l2tv]['r_bar'] - perms[l1l2]['r_bar']) >
               (true_l1l2tv["beta_r_bar"] - true_l1l2["beta_r_bar"])),
        np.sum((perms[l1l2tv]['fleiss_kappa'] - perms[l1l2]['fleiss_kappa']) >
               (true_l1l2tv["beta_fleiss_kappa"] - true_l1l2["beta_fleiss_kappa"])),
        np.sum((perms[l1l2tv]['dice_bar'] - perms[l1l2]['dice_bar']) >
               (true_l1l2tv["beta_dice_bar"] - true_l1l2["beta_dice_bar"]))]
    pvals = pd.DataFrame(pvals)
    pvals["pval"] /= nperms
    pvals.to_csv(os.path.join(OUTPUT, "pvals_stats_permutations.csv"), index=False)
class FractionTaxaBarStack(Graph):
    """This is figure 3 of the paper"""
    short_name = 'fraction_taxa_barstack'
    bottom = 0.4
    top = 0.95
    left = 0.1
    right = 0.95
    formats = ('pdf', 'eps')

    def plot(self):
        # Make Frame #
        self.frame = OrderedDict((('%s - %s' % (p, f), getattr(p.fractions, f).rdp.phyla)
                                  for f in ('low', 'med', 'big') for p in self.parent.pools))
        self.frame = pandas.DataFrame(self.frame)
        self.frame = self.frame.fillna(0)
        # Rename #
        new_names = {
            u"run001-pool01 - low": "2-step PCR low",
            u"run001-pool02 - low": "2-step PCR low",
            u"run001-pool03 - low": "2-step PCR low",
            u"run001-pool04 - low": "1-step PCR low",
            u"run002-pool01 - low": "New chem low",
            u"run001-pool01 - med": "2-step PCR med",
            u"run001-pool02 - med": "2-step PCR med",
            u"run001-pool03 - med": "2-step PCR med",
            u"run001-pool04 - med": "1-step PCR med",
            u"run002-pool01 - med": "New chem med",
            u"run001-pool01 - big": "2-step PCR high",
            u"run001-pool02 - big": "2-step PCR high",
            u"run001-pool03 - big": "2-step PCR high",
            u"run001-pool04 - big": "1-step PCR high",
            u"run002-pool01 - big": "New chem high",
        }
        self.frame.rename(columns=new_names, inplace=True)
        self.frame = self.frame.transpose()
        # Group low abundant into 'others' #
        low_abundance = self.frame.sum() < 30000
        other_count = self.frame.loc[:, low_abundance].sum(axis=1)
        self.frame = self.frame.loc[:, ~low_abundance]
        self.frame['Others'] = other_count
        # Normalize #
        self.frame = self.frame.apply(lambda x: 100 * x / x.sum(), axis=1)
        # Sort the table by sum #
        sums = self.frame.sum()
        sums.sort(ascending=False)
        self.frame = self.frame.reindex_axis(sums.keys(), axis=1)
        # Plot #
        fig = pyplot.figure()
        axes = self.frame.plot(kind='bar', stacked=True, color=cool_colors)
        fig = pyplot.gcf()
        # Other #
        axes.set_ylabel('Relative abundances in percent')
        axes.xaxis.grid(False)
        axes.yaxis.grid(False)
        axes.set_ylim([0, 100])
        # Put a legend below current axis
        axes.legend(loc='upper center', bbox_to_anchor=(0.5, -0.40),
                    fancybox=True, shadow=True, ncol=5, prop={'size': 10})
        # Font size #
        axes.tick_params(axis='x', which='major', labelsize=11)
        # Save it #
        self.save_plot(fig, axes)
        self.frame.to_csv(self.csv_path)
        pyplot.close(fig)
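# The class above targets an older pandas API (Series.sort, DataFrame.reindex_axis, both
# removed in recent pandas). A minimal stand-alone sketch of the same step -- group rare
# taxa into 'Others', normalise rows to percent, order columns by total -- with current
# pandas, using made-up counts rather than the project's data:
import pandas as pd

counts = pd.DataFrame(
    {"Proteobacteria": [5000, 7000], "Firmicutes": [3000, 1000], "RareA": [10, 5]},
    index=["pool1 - low", "pool1 - med"])
low_abundance = counts.sum() < 100                              # taxa below a total-count threshold
others = counts.loc[:, low_abundance].sum(axis=1)
frame = counts.loc[:, ~low_abundance]
frame["Others"] = others
frame = frame.apply(lambda x: 100 * x / x.sum(), axis=1)        # each row now sums to 100
order = frame.sum().sort_values(ascending=False).index          # replaces Series.sort()
frame = frame.reindex(columns=order)                            # replaces reindex_axis()
print(frame)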
                      engine='c', low_memory=True)  # (fragment: closes a pd.read_csv call started earlier in the script)

for i in range(len(az_data["MSN"])):
    if az_data["Year"][i] > 2009 and az_data["MSN"][i] == "TETCB":
        az_new_msn.append(az_data["MSN"][i])
        az_new_year.append(az_data["Year"][i])
        az_new_data.append(az_data["Data"][i])
    else:
        pass
az_new["MSN"] = az_new_msn
az_new["Year"] = az_new_year
az_new["Data"] = az_new_data
az_new = pd.DataFrame(az_new)
az_new.to_csv("data/csv/state_data/az_new_data.csv", index=False, index_label=False, sep=',')

for i in range(len(ca_data["MSN"])):
    if ca_data["Year"][i] > 2009 and ca_data["MSN"][i] == "TETCB":
        ca_new_msn.append(ca_data["MSN"][i])
        ca_new_year.append(ca_data["Year"][i])
        ca_new_data.append(ca_data["Data"][i])
    else:
        pass
ca_new["MSN"] = ca_new_msn
ca_new["Year"] = ca_new_year
ca_new["Data"] = ca_new_data
ca_new = pd.DataFrame(ca_new)
# same output options as the az file above
ca_new.to_csv("data/csv/state_data/ca_new_data.csv", index=False, index_label=False, sep=',')
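# The per-row loops above can be replaced by a single boolean mask in pandas. A minimal
# sketch of the same filter (Year > 2009 and MSN == "TETCB"); the input path and helper
# name are hypothetical, not part of the original script:
import pandas as pd

def filter_tetcb(in_csv, out_csv):
    """Keep post-2009 TETCB rows and write the MSN/Year/Data columns to a new csv."""
    data = pd.read_csv(in_csv, engine='c', low_memory=True)
    mask = (data["Year"] > 2009) & (data["MSN"] == "TETCB")
    data.loc[mask, ["MSN", "Year", "Data"]].to_csv(out_csv, index=False, sep=',')

# filter_tetcb("data/csv/state_data/az_data.csv", "data/csv/state_data/az_new_data.csv")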
            # (fragment: the loop and condition this branch belongs to start earlier in the script)
            x_df.set_value(cur_index, 'mean disk space', sum(mean_disk_space))
        else:
            x_df.set_value(cur_index, 'max disk space', 0.0)
            x_df.set_value(cur_index, 'mean disk space', 0.0)
        mean_disk_space = [usage_df['mean disk space'][i]] if usage_df['mean disk space'][i] > 0 else []
        cur_start = int(usage_df['start time'][i] / interval)
        cur_index = cur_index + 1
    else:
        mean_disk_space = [usage_df['mean disk space'][i]] if usage_df['mean disk space'][i] > 0 else []
        cur_start = int(usage_df['start time'][i] / interval)
        cur_index = cur_index + 1

x_df.to_csv(out_file, index=False)

# Get X Labels (Features)
import pandas as pd

IDs = pd.read_csv('./machine_id.csv')
machine_IDs = IDs['machine ID'].tolist()
output_file = './machine_label_X-500.csv'
start_pos = [0 for i in range(500)]
columns = ['timestamp', 'machine ID', 'max CPU usage', 'mean CPU usage',
           'max disk I/O', 'mean disk I/O', 'max disk space', 'mean disk space',
           'max memory usage', 'mean memory usage', 'max page cache', 'mean page cache',
           'max MAI', 'mean MAI']
out_df = pd.DataFrame(columns=columns)
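# The interval bucketing above is done row by row with DataFrame.set_value, which recent
# pandas has removed. A minimal sketch of the same idea with groupby, assuming a usage_df
# with 'start time' and 'mean disk space' columns and a fixed interval length (the column
# names are taken from the snippet above, the aggregation choices mirror its max/sum):
import pandas as pd

def bucket_disk_space(usage_df, interval):
    """Aggregate positive 'mean disk space' samples into fixed time buckets."""
    df = usage_df[usage_df['mean disk space'] > 0].copy()
    df['bucket'] = (df['start time'] // interval).astype(int)
    agg = df.groupby('bucket')['mean disk space'].agg(['max', 'sum'])
    return agg.rename(columns={'max': 'max disk space', 'sum': 'mean disk space'})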
def main():
    opts = argparse.ArgumentParser()
    opts.add_argument('--excelSheetIn', dest='excelSheetIn')
    opts.add_argument('--barcodeFasta', default=None, dest='barcodeFasta')
    opts.add_argument('--fxnTransformBarcode_i7', default="lambda barcin:barcin", dest='fxnTransformBarcode_i7')
    opts.add_argument('--fxnTransformBarcode_i5', default="lambda barcin:barcin", dest='fxnTransformBarcode_i5')
    opts.add_argument('--coreTemplateIn', dest='coreTemplateIn')
    opts.add_argument('--coreSheetOut', default=None, dest='coreSheetOut')
    opts.add_argument('--coreBuffer', default=None, dest='coreBuffer')
    opts.add_argument('--coreConc', default=None, dest='coreConc')
    opts.add_argument('--coreVol', default=None, dest='coreVol')
    opts.add_argument('--coreFraglen', default=None, dest='coreFraglen')
    opts.add_argument('--coreShortcode', default=None, dest='coreShortcode')
    opts.add_argument('--coreSpecies', default=None, dest='coreSpecies')
    opts.add_argument('--coreNotes', default=None, dest='coreNotes')
    opts.add_argument('--libKeyOut', dest='libKeyOut')
    opts.add_argument('--extraCols', default=None, dest='extraCols')
    opts.add_argument('--fxnLibName',
                      default='lambda r:"%s_%s"%(r.source_plate,r.source_well)',
                      dest='fxnLibName')
    o = opts.parse_args()

    fxnLibName = eval(o.fxnLibName)
    fxnTransformBarcode_i5 = eval(o.fxnTransformBarcode_i5)
    fxnTransformBarcode_i7 = eval(o.fxnTransformBarcode_i7)

    # load the barcode sequences
    filInBarcs = open(o.barcodeFasta, 'r')
    l = filInBarcs.readline()
    # if l[0]=='>':
    assert l[0] == '>'
    mBcNameSeq_i5i7 = {}
    while len(l) > 0:
        bcname = l[1:].rstrip()
        bcseq = filInBarcs.readline().rstrip()
        l = filInBarcs.readline()
        assert bcname not in mBcNameSeq_i5i7, '%s present > once' % bcname
        mBcNameSeq_i5i7[bcname] = (fxnTransformBarcode_i5(bcseq), fxnTransformBarcode_i7(bcseq))

    wbin = openpyxl.load_workbook(filename=o.excelSheetIn)
    sheet_src_plate = wbin.get_sheet_by_name('SOURCE PLATE')
    sheet_src_well = wbin.get_sheet_by_name('SOURCE WELL')
    sheet_src_barcp5 = wbin.get_sheet_by_name('P5 BARCODE')
    sheet_src_barcp7 = wbin.get_sheet_by_name('P7 BARCODE')

    # source plate:
    # make sure 1...12 from A6 to right
    for i in range(1, 12):
        sheetloc = ofsFrom('A6', right=i)
        obs = str(int(sheet_src_plate[sheetloc].value))
        exp = str(i)
        assert obs == exp, 'SOURCE PLATE %s : expected %s but got %s' % (sheetloc, exp, obs)
    # make sure A..H going from A7 down
    for i in range(0, 8):
        sheetloc = ofsFrom('A7', down=i)
        obs = str(sheet_src_plate[sheetloc].value)
        exp = str(chr(ord('A') + i))
        assert obs == exp, 'SOURCE PLATE %s : expected %s but got %s' % (sheetloc, exp, obs)

    # well plate:
    # make sure 1...12 from A6 to right
    for i in range(1, 12):
        sheetloc = ofsFrom('A6', right=i)
        obs = str(int(sheet_src_well[sheetloc].value))
        exp = str(i)
        assert obs == exp, 'SOURCE WELL %s : expected %s but got %s' % (sheetloc, exp, obs)
    # make sure A..H going from A7 down
    for i in range(0, 8):
        sheetloc = ofsFrom('A7', down=i)
        obs = str(sheet_src_well[sheetloc].value)
        exp = str(chr(ord('A') + i))
        assert obs == exp, 'SOURCE WELL %s : expected %s but got %s' % (sheetloc, exp, obs)

    # p7 barc:
    # make sure 1...12 from A7 to right
    for i in range(1, 12):
        sheetloc = ofsFrom('A7', right=i)
        obs = str(int(sheet_src_barcp7[sheetloc].value))
        exp = str(i)
        assert obs == exp, 'P7 PRIMER %s : expected %s but got %s' % (sheetloc, exp, obs)
    # make sure A..H going from A8 down
    for i in range(0, 8):
        sheetloc = ofsFrom('A8', down=i)
        obs = str(sheet_src_barcp7[sheetloc].value)
        exp = str(chr(ord('A') + i))
        assert obs == exp, 'P7 PRIMER %s : expected %s but got %s' % (sheetloc, exp, obs)

    # p5 barc:
    # make sure 1...12 from A7 to right
    for i in range(1, 12):
        sheetloc = ofsFrom('A7', right=i)
        obs = str(int(sheet_src_barcp5[sheetloc].value))
        exp = str(i)
        assert obs == exp, 'p5 PRIMER %s : expected %s but got %s' % (sheetloc, exp, obs)
    # make sure A..H going from A8 down
    for i in range(0, 8):
        sheetloc = ofsFrom('A8', down=i)
        obs = str(sheet_src_barcp5[sheetloc].value)
        exp = str(chr(ord('A') + i))
        assert obs == exp, 'p5 PRIMER %s : expected %s but got %s' % (sheetloc, exp, obs)

    # gather into DF
    df = OrderedDict()
    for col in ['well', 'source_plate', 'source_well', 'p7_barc_and_well', 'p5_barc_and_well',
                'p7_barc', 'p5_barc', 'p7_barc_seq', 'p5_barc_seq']:
        df[col] = []

    for j in range(0, 12):
        for i in range(0, 8):
            sheetloc1 = ofsFrom('B7', down=i, right=j)
            sheetloc2 = ofsFrom('B8', down=i, right=j)
            # fun fact- excel does rows/cols opposite of PCR plates
            well = ofsFrom('A1', down=j, right=i)
            srcplate = sheet_src_plate[sheetloc1].value
            srcwell = sheet_src_well[sheetloc1].value
            curp5 = sheet_src_barcp5[sheetloc2].value
            curp7 = sheet_src_barcp7[sheetloc2].value
            srcplate = '' if srcplate is None else str(srcplate).strip()
            srcwell = '' if srcwell is None else str(srcwell).strip()
            curp5 = '' if curp5 is None else str(curp5).strip()
            curp7 = '' if curp7 is None else str(curp7).strip()
            # srcplate=srcplate.replace('-','').replace('_','')
            srcplate = re.subn(r'[\'"$:\W@\n]', '', srcplate)[0]
            srcwell = re.subn(r'[\'"$:\W@\n]', '', srcwell)[0]
            if any([len(srcplate) == 0, len(srcwell) == 0, len(curp5) == 0, len(curp7) == 0]):
                if not all([len(srcplate) == 0, len(srcwell) == 0, len(curp5) == 0, len(curp7) == 0]):
                    print('WARNING: well %s is not empty in all sheets: %s %s %s %s' %
                          (well, srcplate, srcwell, curp5, curp7))
            else:
                assert ':' in curp7, 'ERROR well %s p7 invalid barcode %s' % (well, curp7)
                assert ':' in curp5, 'ERROR well %s p5 invalid barcode %s' % (well, curp5)
                df['well'].append(well)
                df['source_plate'].append(srcplate)
                df['source_well'].append(srcwell)
                df['p7_barc_and_well'].append(curp7)
                df['p5_barc_and_well'].append(curp5)
                df['p7_barc'].append(curp7.split(':')[1])
                df['p5_barc'].append(curp5.split(':')[1])
                df['p7_barc_seq'].append(mBcNameSeq_i5i7[curp7.split(':')[1]][1])
                df['p5_barc_seq'].append(mBcNameSeq_i5i7[curp5.split(':')[1]][0])

    # gather extra cols if any
    mKvExtra = {}
    if o.extraCols is not None:
        for kv in o.extraCols.split(','):
            mKvExtra[kv.split(':')[0]] = kv.split(':')[1]

    df = pd.DataFrame(df)
    for k in mKvExtra:
        df[k] = mKvExtra[k]

    df['libname'] = ''
    for i in df.index:
        df.loc[i, 'libname'] = fxnLibName(df.loc[i])

    # save to our own key
    df.to_csv(o.libKeyOut, sep='\t', index=False)

    # save into core template
    if o.coreSheetOut is not None:
        shutil.copyfile(o.coreTemplateIn, o.coreSheetOut)
        ct = openpyxl.load_workbook(filename=o.coreSheetOut)
        wb = ct.active
        # wb['A13']='Sample Name*'
        rowofs = 0
        for _, r in df.iterrows():
            shloco = ofsFrom('A18', down=rowofs, right=0)
            wb[shloco] = r.libname
            # shloco=ofsFrom('A18',down=rowofs,right=1)
            # wb[ shloco ] = o.coreBuffer
            shloco = ofsFrom('A18', down=rowofs, right=2)
            wb[shloco] = float(o.coreConc)
            shloco = ofsFrom('A18', down=rowofs, right=3)
            wb[shloco] = float(o.coreVol)
            # shloco=ofsFrom('A18',down=rowofs,right=4)
            # wb[ shloco ] = float(o.coreFraglen)
            shloco = ofsFrom('A18', down=rowofs, right=5)
            bcseq5 = mBcNameSeq_i5i7[r.p5_barc][0]
            bcseq7 = mBcNameSeq_i5i7[r.p7_barc][1]
            wb[shloco] = bcseq7
            shloco = ofsFrom('A18', down=rowofs, right=4)
            wb[shloco] = bcseq5
            # shloco=ofsFrom('A18',down=rowofs,right=6)
            # wb[ shloco ] = o.coreShortcode
            shloco = ofsFrom('A18', down=rowofs, right=5)
            wb[shloco] = o.coreSpecies
            shloco = ofsFrom('A18', down=rowofs, right=6)
            wb[shloco] = "DNA"
            if rowofs == 0:
                shloco = ofsFrom('A18', down=rowofs, right=7)
                wb[shloco] = o.coreNotes
            rowofs += 1
        ct.save(filename=o.coreSheetOut)
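# ofsFrom is defined elsewhere in this project; a minimal sketch of what an offset helper
# like it could look like (an assumption for illustration, not the project's actual code),
# built on openpyxl's column utilities:
import re
from openpyxl.utils import get_column_letter, column_index_from_string

def ofs_from(anchor, down=0, right=0):
    """Return the cell coordinate `down` rows and `right` columns away from `anchor`."""
    col_letters, row_digits = re.match(r'([A-Z]+)([0-9]+)$', anchor).groups()
    col = column_index_from_string(col_letters) + right
    row = int(row_digits) + down
    return '%s%d' % (get_column_letter(col), row)

# ofs_from('A6', right=3) -> 'D6'; ofs_from('B7', down=2, right=1) -> 'C9'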
problem = {
    "num_vars": len(par_morris),
    "names": par_morris,
    "groups": None,
    "bounds": [[0, lev - 1]] * len(par_morris)
}

param_values = sample(problem, N=n_traj, grid_jump=grid_jump, num_levels=lev,
                      sample4uniformity=1000).astype(np.int64)

#%% Plot
#fig = plt.figure(figsize=(8, 6))
#sample_histograms(fig, param_values, problem, {'color': 'y'})
#plt.tight_layout()
#plt.savefig(fig_out_path, dpi=100)

#%% Create Dataframes
traj_real = pd.DataFrame(OrderedDict([(par, pars[par][param_values[:, i]])
                                      for i, par in enumerate(par_morris)]))
traj_id = pd.DataFrame(OrderedDict([(par, param_values[:, i])
                                    for i, par in enumerate(par_morris)]))
fixed_pars = pd.DataFrame(fixed_pars, index=["fix"])

# Generate a 2D array with, for each simulation, all aquitard levels
n_aqtds_all = (np.linspace(0, 1, num=lev) * aqtds_depth(traj_real["H_b"])[:, None])
n_aqtd_select = n_aqtds_all[np.arange(traj_id["N_aqt"].shape[0]),
                            traj_id["N_aqt"].values].astype(np.int64)
traj_real["N_aqt"] = n_aqtd_select

#%% Save as csv
traj_real.to_csv(traj_real_path)
traj_id.to_csv(traj_id_path)
fixed_pars.to_csv(fixed_pars_path)
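# The sample() call above passes grid_jump and sample4uniformity, which suggests an older
# or locally modified SALib. A minimal sketch of the same level-index sampling with the
# current SALib Morris API, using hypothetical parameter names and sizes:
import numpy as np
from SALib.sample.morris import sample as morris_sample

names = ["H_b", "N_aqt", "Kh"]          # assumed parameter names
lev, n_traj = 4, 10                     # assumed number of levels and trajectories
problem = {"num_vars": len(names), "names": names, "groups": None,
           "bounds": [[0, lev - 1]] * len(names)}
# with bounds [0, lev-1] and num_levels=lev the sampled points fall on integer levels
level_idx = morris_sample(problem, N=n_traj, num_levels=lev).astype(np.int64)
print(level_idx.shape)                  # (n_traj * (num_vars + 1), num_vars)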