import gzip

import pandas as pd

# NOTE(review): the original collapsed source calls `cl.commander_line` without a
# visible import of `cl` (the commanderline package) -- presumably imported at the
# real file top; confirm against the full file.  Fix applied here: `pd` (pandas)
# was used but never imported in this script.


def custom_db(db_gz, gt_23andMe_gz, n_snps=100000, out_file_gz='out_custom_db.txt.gz'):
    '''
    +-----------+
    | custom_db |
    +-----------+

    Script used to pull out SNPs present in database and 23andMe file,
    sort by F_ST, and write top n_snps to gzipped out_file

    Package dependencies:
    pandas
    commanderline
    '''
    print('Reading database...')
    db = pd.read_csv(db_gz, compression='gzip', sep=' ')

    print('Reading 23andMe file...')
    # 23andMe raw files are tab-separated with '#' header comments and no column
    # names; columns 1 and 2 hold chromosome and position.
    gt = pd.read_csv(gt_23andMe_gz, compression='gzip', usecols=[1, 2],
                     sep='\t', comment='#', header=None)
    gt.columns = ['chr', 'pos']

    print('Merging on chr&pos...')
    m = pd.merge(db, gt, on=['chr', 'pos'])

    print('Sorting by F_ST...')
    m = m.sort_values(by='fst', ascending=False)

    print('Filtering...')
    # Keep the n_snps highest-F_ST rows; dropping the last column removes 'fst'
    # (assumes 'fst' is the final database column -- confirm upstream format).
    m_out = m.iloc[:n_snps, :(len(m.columns) - 1)]

    print('Sorting by chr & pos, and writing to file...')
    with gzip.open(out_file_gz, 'wt') as f_out:
        m_out.sort_values(by=['chr', 'pos']).to_csv(f_out, sep=' ', index=False)


cl.commander_line(custom_db) if __name__ == '__main__' else None
# NOTE(review): collapsed fragment -- the opening if/elif belongs to a pipeline
# function whose `def` line is outside this view; indentation below is
# reconstructed and should be confirmed against the full file.

    if (red == 'avg'):
        avg = True
        # Fail fast: averaged output is known to be unreliable (see message).
        raise Exception('Average used, but its output is not always consistent - especially for diag!')
    elif (red == 'sum'):
        avg = False
    else:
        raise Exception('Error: Unknown argument: ' + red)

    # Output is done step-by-step
    # analysis.write_output_to_file(out_fname+'.txt.gz', cnst.const['out_delim'], avg)

    if generate_img:
        analysis.generate_img(out_fname + cnst.const['img_out_ext'])

    flat.print_log_msg('Done')


def print_opt_arg_error():
    # Usage hint printed on malformed command-line arguments; 2 is the
    # conventional exit status for CLI usage errors.
    print('For help use --help')
    return (2)


# if __name__ == '__main__':
#     main()

if __name__ == '__main__':
    cl.commander_line((pipeline_lean, pipeline, multiple_pipelines))
    commanderline
    '''
    # NOTE(review): collapsed fragment -- tail of validate_bootstrap(...); the
    # `def` line and docstring opening are outside this view, so indentation is
    # reconstructed.
    results=pd.DataFrame()

    with open(file_list, 'rt') as f_in:
        for i,f in enumerate(f_in):
            f=f.strip()
            d1=pd.read_csv(f, header=None, sep=' ')
            # Map fine-grained ancestry labels to super-populations
            # (presumably -- confirm ancestry_translation's domain).
            d1['super']=d1[0].apply(lambda el: ancestry_translation[el])
            d2=d1.groupby('super').sum()
            # Column i = estimates from the i-th listed file.
            d2.columns=[i]
            results=pd.concat([results, d2], axis=1)

    # Keep only rows that have a value in the point estimate (column 0).
    results=results.dropna(subset=[0])

    # Fraction of bootstrap replicate columns (1..N) in which each ancestry is
    # at or above the negligible threshold.
    bootstrap_replicate_count=results.iloc[:,1:].apply(lambda el: el>=negligible_threshold).T.sum()/(len(results.columns)-1)
    filt=bootstrap_replicate_count.apply(lambda el: el>=bootstrap_confidence)
    interm_results=results[filt][0]
    # Probability mass removed by the filter is reported as AMBIGUOUS.
    ambig=1-interm_results.sum()
    final_results=pd.concat([interm_results, pd.Series({'AMBIGUOUS':ambig})])
    print(final_results)


cl.commander_line(validate_bootstrap) if __name__ == '__main__' else None
import gzip

import pandas as pd

# NOTE(review): the collapsed source references `fst` (per-row F_ST computation)
# and `cl` (commanderline) without visible imports; both presumably come from
# file-top imports outside this view -- confirm against the full file.
# Fix applied here: `gzip` and `pd` were used but not imported in this script.


def calc_fst(ref_gz, out_gz='out_db_fst.txt.gz'):
    '''
    +----------+
    | calc_fst |
    +----------+

    Helper script to calculate F_ST for Ancestry-format reference files.
    Output is the input file with an additional column named 'fst'
    (SNPs with NaN for F_ST are removed)

    Details about Ancestry: https://bitbucket.org/joepickrell/ancestry

    Package dependencies:
    pandas
    commanderline
    '''
    print('Reading Ancestry-format reference file...')
    d = pd.read_csv(ref_gz, sep=' ')
    # Drop null columns, e.g., last column followed by a column delimiter will
    # result in a null column
    d = d.dropna(axis=1, how='all')

    print('Calculating F_ST...')
    # Columns 6+ are passed row-wise to fst(); presumably per-population allele
    # data -- confirm against the Ancestry reference format.
    d['fst'] = d.iloc[:, 6:].apply(fst, axis=1)
    # Drop SNPs without F_ST
    d = d.dropna(subset=['fst'])

    print('Writing output file...')
    with gzip.open(out_gz, 'wt') as f_out:
        d.to_csv(f_out, sep=' ', index=False)


cl.commander_line(calc_fst) if __name__ == '__main__' else None
# NOTE(review): collapsed fragment -- body of a bootstrap-validation function
# (validate_bootstrap, per the trailing guard); its `def` line and the
# initialization of `results` are outside this view, so indentation is
# reconstructed.

    with open(file_list, 'rt') as f_in:
        for i, f in enumerate(f_in):
            f = f.strip()
            d1 = pd.read_csv(f, header=None, sep=' ')
            # Map fine-grained ancestry labels to super-populations
            # (presumably -- confirm ancestry_translation's domain).
            d1['super'] = d1[0].apply(lambda el: ancestry_translation[el])
            d2 = d1.groupby('super').sum()
            # Column i = estimates from the i-th listed file.
            d2.columns = [i]
            results = pd.concat([results, d2], axis=1)

    # Keep only rows that have a value in the point estimate (column 0).
    results = results.dropna(subset=[0])

    # Fraction of bootstrap replicate columns (1..N) in which each ancestry is
    # at or above the negligible threshold.
    bootstrap_replicate_count = results.iloc[:, 1:].apply(
        lambda el: el >= negligible_threshold).T.sum() / (
            len(results.columns) - 1)
    filt = bootstrap_replicate_count.apply(
        lambda el: el >= bootstrap_confidence)
    interm_results = results[filt][0]
    # Probability mass removed by the filter is reported as AMBIGUOUS.
    ambig = 1 - interm_results.sum()
    final_results = pd.concat(
        [interm_results, pd.Series({'AMBIGUOUS': ambig})])
    print(final_results)


cl.commander_line(validate_bootstrap) if __name__ == '__main__' else None
def calc_fst(ref_gz, out_gz='out_db_fst.txt.gz'):
    '''
    +----------+
    | calc_fst |
    +----------+

    Compute per-SNP F_ST for an Ancestry-format reference file and write the
    input back out with an extra column named 'fst'.  Rows whose F_ST is NaN
    are dropped from the output.

    Details about Ancestry: https://bitbucket.org/joepickrell/ancestry

    Package dependencies:
    pandas
    commanderline
    '''
    print('Reading Ancestry-format reference file...')
    ref = pd.read_csv(ref_gz, sep=' ')

    # A trailing delimiter on each row yields an all-null column; discard any
    # fully-empty columns before computing statistics.
    ref = ref.dropna(axis=1, how='all')

    print('Calculating F_ST...')
    ref['fst'] = ref.iloc[:, 6:].apply(fst, axis=1)

    # Discard SNPs for which no F_ST could be computed.
    ref = ref.dropna(subset=['fst'])

    print('Writing output file...')
    with gzip.open(out_gz, 'wt') as sink:
        ref.to_csv(sink, sep=' ', index=False)


cl.commander_line(calc_fst) if __name__ == '__main__' else None
    Script used to pull out SNPs present in database and 23andMe file,
    sort by F_ST, and write top n_snps to gzipped out_file

    Package dependencies:
    pandas
    commanderline
    '''
    # NOTE(review): collapsed fragment -- tail of custom_db(...); the `def` line
    # and the docstring opening are outside this view, so indentation is
    # reconstructed.
    print('Reading database...')
    db = pd.read_csv(db_gz, compression='gzip', sep=' ')
    print('Reading 23andMe file...')
    # 23andMe raw files: tab-separated, '#' comment header, no column names;
    # columns 1 and 2 hold chromosome and position.
    gt = pd.read_csv(gt_23andMe_gz, compression='gzip', usecols=[1, 2],
                     sep='\t', comment='#', header=None)
    gt.columns = ['chr', 'pos']
    print('Merging on chr&pos...')
    m = pd.merge(db, gt, on=['chr', 'pos'])
    print('Sorting by F_ST...')
    m = m.sort_values(by='fst', ascending=False)
    print('Filtering...')
    # Keep the n_snps highest-F_ST rows; dropping the last column removes 'fst'
    # (assumes 'fst' is the final database column -- confirm upstream format).
    m_out = m.iloc[:n_snps, :(len(m.columns) - 1)]
    print('Sorting by chr & pos, and writing to file...')
    with gzip.open(out_file_gz, 'wt') as f_out:
        m_out.sort_values(by=['chr', 'pos']).to_csv(f_out, sep=' ',
                                                    index=False)


cl.commander_line(custom_db) if __name__ == '__main__' else None
# NOTE(review): collapsed fragment -- the opening print_metric/log calls belong
# to a function whose `def` line is outside this view; indentation below is
# reconstructed.

    print_metric(metric1)
    flat.print_log_msg('Breakpoint 2: ' + repr(breakpoint2))
    flat.print_log_msg('Metric 2:')
    print_metric(metric2)


def apply_metric(chr_name, begin, end, input_config, loci):
    # Build a Metric over the given chromosome span and loci, and return the
    # computed result dictionary.
    metric_out = metric.Metric(chr_name, input_config, loci, begin, end)
    out = metric_out.calc_metric()
    return out


def print_metric(metric_out):
    # Log the raw sum, the count including zero entries, and their ratio.
    flat.print_log_msg('Sum: ' + repr(metric_out['sum']))
    flat.print_log_msg('N (w/ zero\'s): ' + repr(metric_out['N_zero']))
    flat.print_log_msg('Metric: ' + repr(metric_out['sum'] / metric_out['N_zero']))


def print_opt_arg_error():
    # Usage hint printed on malformed command-line arguments; 2 is the
    # conventional exit status for CLI usage errors.
    print('For help use --help')
    return (2)


# if __name__ == '__main__':
#     main()

if __name__ == '__main__':
    cl.commander_line((pipeline))
# NOTE(review): collapsed fragment -- this plotting loop belongs to plot_gwas(...),
# whose `def` line is outside this view; indentation below is reconstructed.

    dist = -0.1
    for key in loaded:
        # print()
        # print(key, loaded[key])
        try:
            if 'loci' in loaded[key]:
                # Each key's loci are drawn as a scatter row on its own
                # horizontal line below y=0, stepping down 0.05 per key.
                dist -= 0.05
                breakpoint_list_y = [dist for a in loaded[key]['loci']]
                pt.scatter(np.array(loaded[key]['loci']),
                           np.array(breakpoint_list_y),
                           c=next(colors),
                           label=key,
                           marker=next(markers))
        except TypeError as e:
            # Non-subscriptable entries are skipped but reported.
            print(e)

    pt.legend()
    pt.show()


def plot_gwas_w_default_paths(chr_name, input_pickle_fname):
    # Convenience wrapper: assemble the GWAS file path for chr_name from the
    # configured 'gwas_plots'/'height' root/prefix/suffix, then delegate.
    plot_gwas(
        cnst.const['gwas_plots']['height']['root'] +
        cnst.const['gwas_plots']['height']['file_prefix'] + chr_name +
        cnst.const['gwas_plots']['height']['file_suffix'],
        input_pickle_fname)


if __name__ == '__main__':
    cl.commander_line((plot_gwas_w_default_paths, plot_gwas))
# input_config = cnst.const['orig_data_'+dataset] input_config = cnst.return_conf(dataset_path) partitions = flat.read_partitions(name, input_config) with open(input_pickle_fname, 'rb') as f_in: loaded = pickle.load(f_in) # print(loaded) loci = loaded[subset]['loci'] first = partitions[0][0] last = partitions[len(partitions) - 1][1] # print(loci) print('chr', '\t', 'start', '\t', 'stop') print(name, '\t', first, '\t', loci[0]) for i in range(0, len(loci) - 1): print(name, '\t', loci[i], '\t', loci[i + 1]) print(name, '\t', loci[len(loci) - 1], '\t', last + 1) if __name__ == '__main__': cl.commander_line((chr_bpoints_to_bed, ), print_argv_to_output=False, print_done=False)
# NOTE(review): collapsed fragment -- the opening if/elif belongs to a pipeline
# function whose `def` line is outside this view; indentation below is
# reconstructed and should be confirmed against the full file.

    if (red == 'avg'):
        avg = True
        # Fail fast: averaged output is known to be unreliable (see message).
        raise Exception('Average used, but its output is not always consistent - especially for diag!')
    elif (red == 'sum'):
        avg = False
    else:
        raise Exception('Error: Unknown argument: ' + red)

    # Output is done step-by-step
    # analysis.write_output_to_file(out_fname+'.txt.gz', cnst.const['out_delim'], avg)

    if generate_img:
        analysis.generate_img(out_fname + cnst.const['img_out_ext'])

    flat.print_log_msg('Done')


def print_opt_arg_error():
    # Usage hint printed on malformed command-line arguments; 2 is the
    # conventional exit status for CLI usage errors.
    print('For help use --help')
    return (2)


# if __name__ == '__main__':
#     main()

if __name__ == '__main__':
    cl.commander_line((pipeline_lean, ))