def main():
    """Write one FASTA file per genome window, with one record per input file.

    Command line:
        -w / --windowsize  window size passed to ``genome_window_iter`` (required)
        -o / --outputdir   directory the per-window files are written to (required)
        positional args    input file names, forwarded to ``genome_window_iter``

    For every window yielded by ``genome_window_iter`` a file named
    ``<name>-<start>-<end>.fa`` (start/end zero-padded to 9 digits) is created
    in the output directory. Each input file contributes one record whose
    header is ``file_base_name(file_name)``.

    Exits with a usage error (status 2) when a required option is missing.
    """
    usage = """ %prog [options] [inputFileGlob [outputFile]] """
    parser = OptionParser(usage=usage, version="%prog 1.0")
    parser.add_option("-w", "--windowsize",
                      dest="windowsize",
                      type="int",
                      default=None,
                      help="window size to write to different files")
    parser.add_option("-o", "--outputdir",
                      dest="outputdir",
                      type="str",
                      default=None,
                      help="output dir")

    (options, args) = parser.parse_args()
    file_name_list = args

    # Report missing options through the parser rather than with `assert`:
    # asserts are removed under `python -O`, and a usage message is more
    # helpful to the caller than an AssertionError traceback.
    if options.windowsize is None:
        parser.error("option -w/--windowsize is required")
    if options.outputdir is None:
        parser.error("option -o/--outputdir is required")

    if not os.path.exists(options.outputdir):
        os.makedirs(options.outputdir)

    # full_file_name_list = [l.strip() for l in file_name_list]

    # NOTE(review): the returned values are not used below; the call is kept
    # so that existing behavior is preserved.
    meta_data, populations, regions = simons_meta_data.get_meta_data()

    window_iter = genome_window_iter(*file_name_list,
                                     window_size=options.windowsize)

    for window in window_iter:
        # Each window holds one (name, start, end, seq) tuple per input file.
        names, starts, ends, seqs = list(zip(*window))

        # Sanity checks: comparing each tuple with itself shifted by one
        # verifies that all entries are equal, i.e. every input file reports
        # the same window coordinates.
        assert names[1:] == names[:-1]
        assert starts[1:] == starts[:-1]
        assert ends[1:] == ends[:-1]

        outfile = os.path.join(
            options.outputdir,
            "{}-{:09d}-{:09d}.fa".format(names[0], starts[0], ends[0]))

        with open(outfile, 'w') as f:
            for seq, file_name in zip(seqs, file_name_list):
                print(">{}\n{}\n".format(file_base_name(file_name), seq),
                      file=f)
# Put the sibling `notebooks` directory (relative to `script_dir`) at the front
# of the import path before importing `analysis_globals`.
# NOTE(review): presumably `analysis_globals` lives in that directory - confirm.
sys.path.insert(0, script_dir + '/../notebooks')
import analysis_globals

# Command-line interface: every option is optional as far as argparse is
# concerned; path-like options are converted to `Path`.
parser = argparse.ArgumentParser()
parser.add_argument("--dist-dir", dest="dist_dir", type=Path)
parser.add_argument("--meta-data-dir", dest="meta_data_dir", type=Path)
parser.add_argument("--out-file", dest="out_file", type=Path)
parser.add_argument("--dist-twice-out-file", dest="dist_twice_out_file", type=Path)
# Boolean flag, False unless given on the command line.
parser.add_argument("--include-ust-ishim", dest="include_ust_ishim", action='store_true', default=False)
# parser.add_argument("--result-dir", dest="result_dir", type=Path)
# parser.add_argument("--result-file-prefix", dest="result_file_prefix", type=str, default='dist_data')
args = parser.parse_args()

# easy loading of meta data in a consistent manner across code
individuals, populations, regions = simons_meta_data.get_meta_data(
    meta_data_dir=args.meta_data_dir, include_ust_ishim=args.include_ust_ishim)

# NOTE(review): this definition continues past the end of the visible source;
# only the float down-casting step and the start of the integer step are shown.
def optimize_data_frame(df, down_int='integer'):
    # down_int can be 'unsigned'
    converted_df = pandas.DataFrame()

    # Down-cast every float column via pandas.to_numeric(downcast='float')
    # and copy the result into the new frame under the same column names.
    floats_optim = (df
                    .select_dtypes(include=['float'])
                    .apply(pandas.to_numeric,downcast='float')
                   )
    converted_df[floats_optim.columns] = floats_optim

    ints_optim = (df
from pandas import DataFrame, Series

import simons_meta_data

# Make the sibling `notebooks` directory importable before loading
# `analysis_globals`.
script_dir = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, script_dir + '/../notebooks')
import analysis_globals

# Two positional arguments, both converted to Path.
parser = argparse.ArgumentParser()
parser.add_argument("dist_file", type=Path)
parser.add_argument("dist_twice_file", type=Path)
args = parser.parse_args()

dist_data = pandas.read_hdf(args.dist_file)

individuals, populations, regions = simons_meta_data.get_meta_data(
    meta_data_dir=analysis_globals.meta_data_dir)

# Map every column ending in '_1' to its '_2' counterpart and vice versa,
# so that renaming with this dict swaps the two members of each pair.
swap_dict = {}
for colname in dist_data.columns.values:
    for suffix, partner in (('_1', '_2'), ('_2', '_1')):
        if colname.endswith(suffix):
            swap_dict[colname] = colname[:-2] + partner

# Stack the selected columns on top of a copy with the paired columns swapped,
# then order the rows by first individual and window start.
cols = ['start', 'end', 'indiv_1', 'indiv_2', 'dist']
as_given = dist_data[cols]
swapped = dist_data[cols].rename(columns=swap_dict)
stacked = pandas.concat([as_given, swapped])
ordered = stacked.sort_values(['indiv_1', 'start'])
dist_data_twice = ordered.reset_index(drop=True)
import simons_meta_data

# Write `samples.ind`: a first line 'Chimp', then one line per individual with
# its id, sex ('M' when the genetic sex assignment is 'XY', otherwise 'F') and
# population id, separated by spaces.
individuals, populations, regions = simons_meta_data.get_meta_data()

# Use a context manager so the file is flushed and closed even if a lookup
# below raises; the original left the handle open for the rest of the run.
with open('samples.ind', 'w') as f:
    print('Chimp', file=f)
    for indiv in individuals:
        chromotype = individuals[indiv]['Genetic sex assignment']
        # Conditional expression instead of the `cond and a or b` idiom,
        # which silently breaks when the middle operand is falsy.
        sex = 'M' if chromotype == 'XY' else 'F'
        pop = individuals[indiv]['Population ID']
        print(indiv, sex, pop, file=f)
import gc

import simons_meta_data
from hg19_chrom_sizes import hg19_chrom_sizes as chromosome_lengths

# Command-line interface: every option is optional as far as argparse is
# concerned; all three values are converted to `Path`.
parser = argparse.ArgumentParser()
parser.add_argument("--dist-dir", dest="dist_dir", type=Path)
parser.add_argument("--meta-data-dir", dest="meta_data_dir", type=Path)
parser.add_argument("--out-file", dest="out_file", type=Path)
# parser.add_argument("--result-dir", dest="result_dir", type=Path)
# parser.add_argument("--result-file-prefix", dest="result_file_prefix", type=str, default='dist_data')
args = parser.parse_args()

# easy loading of meta data in a consistent manner across code
individuals, populations, regions = simons_meta_data.get_meta_data(meta_data_dir=args.meta_data_dir)

# NOTE(review): this definition continues past the end of the visible source;
# only the float step and the start of the integer step are shown.
def optimize_data_frame(df, down_int='integer'):
    # down_int can be 'unsigned'
    converted_df = pandas.DataFrame()

    # Down-cast every float column via pandas.to_numeric(downcast='float')
    # and copy the result into the new frame under the same column names.
    floats_optim = (df
                    .select_dtypes(include=['float'])
                    .apply(pandas.to_numeric,downcast='float')
                   )
    converted_df[floats_optim.columns] = floats_optim

    # Integer columns are down-cast the same way, using the `down_int` mode.
    ints_optim = (df
                  .select_dtypes(include=['int'])
                  .apply(pandas.to_numeric,downcast=down_int)