def qc_sd(args):
    """
    %prog sdtest input.matrix output.matrix

    run quality control on segregation distortions in each SNP.
    """
    # The docstring is handed to OptionParser below, so it stays in usage
    # form; implementation notes are kept in comments.
    parser = OptionParser(qc_sd.__doc__)
    parser.add_option("-i", "--input", help=SUPPRESS_HELP)
    parser.add_option("-o", "--output", help=SUPPRESS_HELP)
    parser.add_option('--population', default='RIL',
                      choices=('RIL', 'F2', 'BCFn'),
                      help="population type")
    parser.add_option('--sig_cutoff', default=1e-2, type='float',
                      help="set the chi square test cutoff. 0(less strigent) to 1(more strigent)")
    fmt_group = OptionGroup(parser, "format options")
    parser.add_option_group(fmt_group)
    fmt_group.add_option('--homo1', default="A",
                         help='character for homozygous genotype')
    fmt_group.add_option("--homo2", default='B',
                         help="character for alternative homozygous genotype")
    fmt_group.add_option('--hete', default='X',
                         help='character for heterozygous genotype')
    fmt_group.add_option('--missing', default='-',
                         help='character for missing value')

    opts, args = parser.parse_args(args)
    if len(args) != 2:
        sys.exit(not parser.print_help())

    # -i/-o (hidden options) take precedence over the positional file names.
    positional_in, positional_out = args
    inputmatrix = opts.input or positional_in
    outputmatrix = opts.output or positional_out

    if opts.sig_cutoff <= 0 or opts.sig_cutoff >= 1:
        eprint('the cutoff chi square test should be smaller than 1 and larger than 0')
        sys.exit(1)

    # Expected split between the two homozygous classes:
    # 0.75/0.25 for BCFn, 0.5/0.5 for every other population type.
    if opts.population == 'BCFn':
        frac0, frac2 = 0.75, 0.25
    else:
        frac0, frac2 = 0.5, 0.5

    chr_order, chr_nums = getChunk(inputmatrix)
    map_reader = pd.read_csv(inputmatrix, delim_whitespace=True,
                             index_col=[0, 1], iterator=True)

    kept = []
    for chrom in chr_order:
        print('{}...'.format(chrom))
        # Pull exactly the rows that belong to this chromosome.
        chrom_df = map_reader.get_chunk(chr_nums[chrom])
        coded = chrom_df.replace(
            [opts.homo1, opts.homo2, opts.hete, opts.missing], [0, 2, 1, 9])

        # Per-SNP counts of the two homozygous genotypes.
        n_homo1 = (coded == 0).sum(axis=1)
        n_homo2 = (coded == 2).sum(axis=1)
        n_total = n_homo1 + n_homo2
        counts = pd.DataFrame({
            'ob0': n_homo1,
            'ob2': n_homo2,
            'exp0': n_total * frac0,
            'exp2': n_total * frac2,
        })

        # A SNP is kept only if both classes have more than 5 observations
        # and the chi-square p-value reaches the cutoff.
        enough_obs = ((counts['ob0'] > 5) & (counts['ob2'] > 5)).values
        chi_result = chisquare([counts['ob0'], counts['ob2']],
                               [counts['exp0'], counts['exp2']])
        not_distorted = chi_result.pvalue >= opts.sig_cutoff
        kept.append(chrom_df.loc[(enough_obs & not_distorted), :])

    df1 = pd.concat(kept)
    before_snp_num = sum(chr_nums.values())
    after_snp_num = df1.shape[0]
    pct = after_snp_num / float(before_snp_num) * 100
    print('{} SNP markers before quality control.'.format(before_snp_num))
    print('{}({:.1f}%) markers left after the quality control.'.format(after_snp_num, pct))
    df1.to_csv(outputmatrix, sep='\t', index=True)
def bin(args):
    """
    %prog bin corrected.matrix output.matrix

    compress markers by merging consecutive markers with same genotypes
    """
    # The docstring is handed to OptionParser below, so it stays in usage form.
    p = OptionParser(bin.__doc__)
    p.add_option("-i", "--input", help=SUPPRESS_HELP)
    p.add_option("-o", "--output", help=SUPPRESS_HELP)
    # Adjacent string literals are used instead of a backslash continuation
    # inside the literal, which leaked indentation into the help output.
    p.add_option('--diff_num', default=0, type='int',
                 help='number of different genotypes between two consecutive '
                      'markers less than or equal to this value will be merged. '
                      'missing values will not be counted.')
    p.add_option('--missing', default='-',
                 help='character for missing value in genotype matrix file')
    p.add_option("--logfile", default='GC.bin.log',
                 help="specify the file saving running info")
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    # -i/-o (hidden options) take precedence over the positional file names.
    inmap, outmap = args
    inputmatrix = opts.input or inmap
    outputmatrix = opts.output or outmap

    if Path(outputmatrix).exists():
        eprint("ERROR: Filename collision. The future output file `{}` exists".format(outputmatrix))
        sys.exit(1)

    chr_order, chr_nums = getChunk(inputmatrix)
    # sep=r'\s+' is the documented, non-deprecated spelling of delim_whitespace=True.
    map_reader = pd.read_csv(inputmatrix, sep=r'\s+', index_col=[0, 1], iterator=True)

    Good_SNPs = []
    binning_info = []
    for chrom in chr_order:
        print('{}...'.format(chrom))
        df_chr_tmp = map_reader.get_chunk(chr_nums[chrom])
        if df_chr_tmp.shape[0] == 1:
            # A single marker has no neighbour to merge with.
            Good_SNPs.append(df_chr_tmp)
            continue

        represent_idx, block_idx, results = bin_markers(
            df_chr_tmp.loc[chrom], diff=opts.diff_num, missing_value=opts.missing)
        Good_SNPs.append(df_chr_tmp.loc[(chrom, results), :])
        if represent_idx:
            # One log row per merged block: representative marker and the
            # markers reported for that block by bin_markers.
            binning_info.append(pd.DataFrame({
                'chr': chrom,
                'representative_marker': represent_idx,
                'markers': block_idx,
            }))

    df1 = pd.concat(Good_SNPs)
    df1.to_csv(outputmatrix, sep='\t', index=True)

    before_snp_num = sum(chr_nums.values())
    after_snp_num = df1.shape[0]
    pct = after_snp_num / float(before_snp_num) * 100
    print('{} SNP markers before compression.'.format(before_snp_num))
    print('{}({:.1f}%) markers left after compression.'.format(after_snp_num, pct))

    if binning_info:
        df2 = pd.concat(binning_info)
        df2.to_csv(opts.logfile, sep='\t', index=False)
        print('Check {} for binning details.'.format(opts.logfile))
def dispatch(self, globals):
    """Run the sub-command named by ``sys.argv[1]``.

    ``globals`` maps action names to callables. The selected callable is
    invoked with the remaining command-line arguments (``sys.argv[2:]``).
    Help is printed, and nothing is run, when no action is given or when the
    action is not listed in ``self.valid_actions``.
    """
    from difflib import get_close_matches

    meta = 'ACTION'
    if len(sys.argv) == 1:
        self.print_help()
        # print_help() presumably terminates the program (TODO confirm);
        # return explicitly so a non-exiting implementation cannot fall
        # through to sys.argv[1] and raise IndexError.
        return

    action = sys.argv[1]
    if action not in self.valid_actions:
        eprint("[error] %s not a valid %s\n" % (action, meta))
        alt = get_close_matches(action, self.valid_actions)
        # eprint is called with the message only, like every other call site;
        # passing sys.stderr as an argument printed the stream object's repr.
        eprint("Did you mean one of these?\n\t%s\n" % (", ".join(alt)))
        self.print_help()
        return

    globals[action](sys.argv[2:])
def mstmap2allmaps(args):
    """
    %prog mstmap2allmaps mstmap_fn allmaps_fn

    convert mstmap results to the file format Allmaps required
    """
    # The docstring is handed to OptionParser below, so it stays in usage form.
    p = OptionParser(mstmap2allmaps.__doc__)
    p.add_option("-i", "--input", help=SUPPRESS_HELP)
    p.add_option("-o", "--output", help=SUPPRESS_HELP)
    p.add_option('--min_markers', default=10, type='int',
                 help='set the cutoff of marker numbers in a linkage group')
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    # -i/-o (hidden options) take precedence over the positional file names.
    mst_in, allmp_out = args
    inputmstmap = opts.input or mst_in
    outputallmp = opts.output or allmp_out

    if Path(outputallmp).exists():
        eprint("ERROR: Filename collision. The future output file `{}` exists".format(outputallmp))
        sys.exit(1)

    # The input is opened first so a missing input does not leave behind a
    # header-only output file; both handles are closed even on error.
    with open(inputmstmap) as fp, open(outputallmp, 'w') as fout:
        fout.write('Scaffold_ID\tScaffold_Position\tLG\tGenetic_Position\n')
        for header, seq in read_block(fp, "group "):
            lg_name = header.split()[-1]
            seq = list(seq)
            # NOTE(review): the 5 presumably discounts non-marker lines in
            # each group block -- confirm against the MSTmap output layout.
            seq_len = len(seq) - 5
            if seq_len <= opts.min_markers:
                # Report only the groups that are actually skipped.
                print('markers in {} is less than {}, omit...'.format(
                    lg_name, opts.min_markers))
                continue
            for s in seq:
                if s.strip() == '' or s[0] == ';':
                    continue
                marker, genetic_pos = s.split()
                # Marker names look like <scaffold>_<position>; split on the
                # last underscore so scaffold names may contain underscores.
                scaffold, _, pos = marker.rpartition('_')
                fout.write('{}\t{}\t{}\t{}\n'.format(scaffold, pos, lg_name, genetic_pos))
def dpp(args):
    """
    %prog training_data_dir label_fn model_results_dir

    Run dpp regression model
    """
    # The docstring is handed to OptionParser below, so it stays in usage form.
    parser = OptionParser(dpp.__doc__)
    parser.add_option('--problem_type', default='classification',
                      choices=('classification', 'regression'),
                      help='specify your problem type')
    parser.add_option('--tensorboard', default='infer',
                      help='tensorboard dir name')
    parser.add_option('--epoch', default=500,
                      help='number of epoches. set to 500 for leaf couting problem')
    parser.add_option('--split_ratio', default=0.2,
                      help='the ratio of training dataset used for testing')
    parser.add_option('--lr_n', default=1, type='int',
                      help='train model with differnt learning rates. if n=1: set lr to 0.001. if n>1: try differnt lr from 1e-2 to 1e-5 n times')
    parser.set_slurm_opts(gpu=True)

    opts, args = parser.parse_args(args)
    if len(args) != 3:
        sys.exit(not parser.print_help())
    training_dir, label_fn, model_name = args

    # 'infer' derives the tensorboard directory name from the model name.
    if opts.tensorboard == 'infer':
        tb_dir_name = 'tensorboard_{}'.format(model_name)
    else:
        tb_dir_name = opts.tensorboard
    out_fns = fns(model_name, tb_dir=tb_dir_name, n=opts.lr_n)

    # One output directory and one slurm job file per learning rate.
    for idx in range(opts.lr_n):
        model_dir = out_fns.model_name[idx]
        try:
            os.mkdir(model_dir)
        except FileExistsError:
            eprint("ERROR: Filename collision. The future output file `{}` exists".format(model_dir))
            sys.exit(1)

        # The job runs the schnablelab.CNN.dpp_<problem_type> module.
        cmd = 'python -m schnablelab.CNN.dpp_%s %s %s %s %s %s %s %s' % (
            opts.problem_type, training_dir, label_fn, model_dir,
            out_fns.tb_dirs[idx], opts.epoch, opts.split_ratio, out_fns.lrs[idx])
        header = Slurm_gpu_header % (opts.time, opts.memory,
                                     model_dir, model_dir, model_dir)
        script = header + 'module load anaconda\nsource activate MCY\n' + cmd

        slurm_fn = '%s.slurm' % model_dir
        with open(slurm_fn, 'w') as handle:
            handle.write(script)
        print('slurm file %s.slurm has been created, now you can sbatch your job file.' % model_dir)
def sortPos(args):
    """
    %prog sortPos input.map output.sorted.map

    sort markers based on position
    """
    # The docstring is handed to OptionParser below, so it stays in usage form.
    p = OptionParser(sortPos.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())
    inmap, outmap = args

    if Path(outmap).exists():
        # The message used to format the undefined name `outputmatrix`, which
        # raised NameError instead of reporting the collision.
        eprint("ERROR: Filename collision. The future output file `{}` exists".format(outmap))
        sys.exit(1)

    # sep=r'\s+' is the documented, non-deprecated spelling of delim_whitespace=True.
    df = pd.read_csv(inmap, sep=r'\s+')
    # Sort on the first two columns, in that order.
    idx_col = list(df.columns[0:2])
    df.sort_values(idx_col).to_csv(outmap, sep='\t', index=False)
    print('Done! Check %s file.' % outmap)
def format(args):
    """
    %prog format corrected.matrix

    convert corrected genotype matrix file to other formats(mstmap, joinmap, r/qtl) for the genetic mapping software.
    Example:
    `python -m schnablelab.imputation.GC format test.map --mstmap --mstmap_pop_type RIL2`
    will generate `test.mstmap` for MSTmap use.
    """
    # The docstring is handed to OptionParser below, so it stays in usage form.
    p = OptionParser(format.__doc__)
    p.add_option("-i", "--input", help=SUPPRESS_HELP)
    p.add_option("--mstmap", default=False, action="store_true",
                 help='convert to MSTmap format')
    p.add_option("--rqtl", default=False, action="store_true",
                 help='convert to R/qtl format')
    p.add_option("--joinmap", default=False, action="store_true",
                 help='convert to JoinMap format')

    q = OptionGroup(p, "format options for input matrix file")
    p.add_option_group(q)
    q.add_option('--homo1', default="A", choices=('a', 'A'),
                 help='character for homozygous genotype')
    q.add_option("--homo2", default='B', choices=('b', 'B'),
                 help="character for alternative homozygous genotype")
    q.add_option('--hete', default='X', choices=('h', 'H', 'X'),
                 help='character for heterozygous genotype')
    q.add_option('--missing', default='-', choices=('-', 'U'),
                 help='character for missing value')

    r = OptionGroup(p, "parameters for MSTmap")
    p.add_option_group(r)
    # Help texts use adjacent string literals; backslash continuations inside
    # a literal leaked source indentation into the help output.
    r.add_option('--mstmap_pop_type',
                 help='Possible values are DH and RILd, where d is any natural number. '
                      'For example, RIL6 means a RIL population at generation 6. '
                      'You should use RIL2 for F2. Use DH for BC1, DH and Hap.')
    r.add_option("--population_name", default='LinkageGroup',
                 help="gives a name for the mapping population. "
                      "It can be any string of letters (a-z, A-Z) or digits (0-9)")
    r.add_option('--distance_function', default='kosambi',
                 choices=('kosambi', 'haldane'),
                 help="choose Kosambi's and Haldane's distance functions")
    r.add_option('--cut_off_p_value', default=0.000001,
                 help='specifies the threshold to be used for clustering the markers into LGs')
    r.add_option('--no_map_dist', default=15,
                 help='check mstmap manual for details')
    r.add_option('--no_map_size', default=5,
                 help='check mstmap manual for details')
    r.add_option('--missing_threshold', default=0.4,
                 help='any marker with more than this value will be removed completely without being mapped')
    r.add_option('--estimation_before_clustering', default='no',
                 choices=('yes', 'no'),
                 help='if yes, MSTmap will try to estimate missing data before clustering the markers into linkage groups')
    r.add_option('--detect_bad_data', default='yes', choices=('yes', 'no'),
                 help='if yes turn on the error detection feature in MSTmap')
    r.add_option('--objective_function', default='COUNT',
                 choices=('COUNT', 'ML'),
                 help='specifies the objective function')

    s = OptionGroup(p, "parameters for JoinMap and R/qtl")
    p.add_option_group(s)
    s.add_option('--pop_type', default='RIL', choices=('RIL', 'F2'),
                 help='specify mapping population type. Contact me if you need supports for other population types')

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())
    inmap, = args
    inputmatrix = opts.input or inmap

    if (not opts.rqtl) and (not opts.joinmap) and (not opts.mstmap):
        eprint("ERROR: add at least one output format option.")
        sys.exit(1)

    # Every output file name is the input name with its last extension replaced.
    stem = inputmatrix.rsplit(".", 1)[0]

    if opts.mstmap:
        if not opts.mstmap_pop_type:
            eprint("ERROR: please choose population type for mstmap format.")
            sys.exit(1)
        if not (opts.mstmap_pop_type.startswith('RIL') or opts.mstmap_pop_type == 'DH'):
            eprint('ERROR: only RILd and DH supported in MSTmap')
            sys.exit(1)
        opf = stem + '.mstmap'  # output file
        if Path(opf).exists():
            eprint("ERROR: Filename collision. The future output file `{}` exists".format(opf))
            sys.exit(1)

        # sep=r'\s+' is the documented, non-deprecated spelling of delim_whitespace=True.
        df = pd.read_csv(inputmatrix, sep=r'\s+')
        # Collapse the first two columns into one '<col0>_<col1>' locus name.
        cols = ['locus_name'] + list(df.columns[2:])
        df['locus_name'] = df.iloc[:, 0].astype('str') + '_' + df.iloc[:, 1].astype('str')
        df = df[cols]
        print(df.head())
        snp_num, sm_num = df.shape[0], df.shape[1] - 1
        # Write the MSTmap parameter header first, then append the matrix.
        with open(opf, 'w') as f1:
            f1.write(mst_header.format(
                opts.mstmap_pop_type, opts.population_name,
                opts.distance_function, opts.cut_off_p_value,
                opts.no_map_dist, opts.no_map_size, opts.missing_threshold,
                opts.estimation_before_clustering, opts.detect_bad_data,
                opts.objective_function, snp_num, sm_num))
        df.to_csv(opf, sep='\t', index=False, mode='a')
        print('Done, check file {}!'.format(opf))

    if opts.joinmap:
        opf = stem + '.joinmap.xlsx'  # output file
        if Path(opf).exists():
            eprint("ERROR: Filename collision. The future output file `{}` exists".format(opf))
            sys.exit(1)

        df = pd.read_csv(inputmatrix, sep=r'\s+')
        # This branch recodes genotypes to a/b/h with '-' for missing.
        df = _format_recode(df, opts, 'a', 'b', 'h', '-')
        cols = ['locus_name', 'Classification'] + list(df.columns[2:])
        df['locus_name'] = df.iloc[:, 0].astype('str') + '_' + df.iloc[:, 1].astype('str')
        df['Classification'] = '(a,h,b)'
        df = df[cols]
        df.to_excel(opf)
        print('Done! Now you can load the genotype data into the JoinMap project from the MS-Excel spreadsheet {} to a dataset node.'.format(opf))

    if opts.rqtl:
        opf = stem + '.rqtl.csv'  # output file
        if Path(opf).exists():
            eprint("ERROR: Filename collision. The future output file `{}` exists".format(opf))
            sys.exit(1)

        df = pd.read_csv(inputmatrix, sep=r'\s+')
        # This branch recodes genotypes to A/B/H with '-' for missing.
        df = _format_recode(df, opts, 'A', 'B', 'H', '-')
        cols = ['id'] + list(df.columns[2:])
        df['id'] = df.iloc[:, 0].astype('str') + '_' + df.iloc[:, 1].astype('str')
        df = df[cols]
        # Insert a row of 1s directly under the header and blank its 'id' cell.
        df.loc[-1] = 1
        df.index = df.index + 1
        df = df.sort_index()
        df.iloc[0, 0] = np.nan
        df.to_csv(opf, index=False, na_rep='')
        print('Done, check file {}!'.format(opf))


def _format_recode(df, opts, homo1, homo2, hete, missing):
    """Return *df* with the user's genotype characters replaced by the given targets."""
    wanted = ((opts.homo1, homo1), (opts.homo2, homo2),
              (opts.hete, hete), (opts.missing, missing))
    # Only characters that actually differ from their target need replacing.
    pairs = [(src, dst) for src, dst in wanted if src != dst]
    if not pairs:
        return df
    need_reps = [src for src, _ in pairs]
    reps = [dst for _, dst in pairs]
    return df.replace(need_reps, reps)
def qc_missing(args):
    """
    %prog filtermissing input.matrix output.matrix

    run quality control of the missing genotypes in the input.matrix before starting the correction.
    """
    # The docstring is handed to OptionParser below, so it stays in usage form.
    p = OptionParser(qc_missing.__doc__)
    p.add_option("-i", "--input", help=SUPPRESS_HELP)
    p.add_option("-o", "--output", help=SUPPRESS_HELP)
    p.add_option('--cutoff_snp', default=0.5, type='float',
                 help="SNP with missing rate higher than this value will be removed")
    p.add_option('--rm_bad_samples', default=False, action="store_true",
                 help='remove bad samples after controlling the SNPs with high missing rate')
    p.add_option('--cutoff_sample', type='float',
                 help="sample missing rate higher than this value will be removed after controlling the SNP missing rate")
    q = OptionGroup(p, "format options")
    p.add_option_group(q)
    q.add_option('--homo1', default="A",
                 help='character for homozygous genotype')
    q.add_option("--homo2", default='B',
                 help="character for alternative homozygous genotype")
    q.add_option('--hete', default='X',
                 help='character for heterozygous genotype')
    q.add_option('--missing', default='-',
                 help='character for missing value')
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    # `is None` rather than truthiness: an explicit cutoff of 0.0 is valid.
    if opts.rm_bad_samples and opts.cutoff_sample is None:
        eprint('missing value cutoff for --cutoff_sample option must be specified when --rm_bad_samples added.')
        sys.exit(1)

    # -i/-o (hidden options) take precedence over the positional file names.
    inmap, outmap = args
    inputmatrix = opts.input or inmap
    outputmatrix = opts.output or outmap

    chr_order, chr_nums = getChunk(inputmatrix)
    # sep=r'\s+' is the documented, non-deprecated spelling of delim_whitespace=True.
    map_reader = pd.read_csv(inputmatrix, sep=r'\s+', index_col=[0, 1], iterator=True)

    Good_SNPs = []
    for chrom in chr_order:
        print('{}...'.format(chrom))
        df_chr_tmp = map_reader.get_chunk(chr_nums[chrom])
        df_chr_tmp_num = df_chr_tmp.replace(
            [opts.homo1, opts.homo2, opts.hete, opts.missing], [0, 2, 1, 9])
        sample_num = df_chr_tmp_num.shape[1]
        # Per-SNP missing rate, i.e. the share of samples coded 9.
        snp_missing_rate = (df_chr_tmp_num == 9).sum(axis=1) / sample_num
        Good_SNPs.append(df_chr_tmp.loc[snp_missing_rate <= opts.cutoff_snp, :])

    df1 = pd.concat(Good_SNPs)
    before_snp_num = sum(chr_nums.values())
    after_snp_num, before_sm_num = df1.shape
    pct = after_snp_num / float(before_snp_num) * 100
    print('{} SNP markers before quality control.'.format(before_snp_num))
    print('{}({:.1f}%) markers left after the quality control.'.format(after_snp_num, pct))

    if not opts.rm_bad_samples:
        df1.to_csv(outputmatrix, sep='\t', index=True)
        return

    print('start quality control on samples')
    # Per-sample missing rate over the SNPs that survived the first filter.
    sample_missing_rate = (df1 == opts.missing).sum(axis=0) / after_snp_num
    df2 = df1.loc[:, sample_missing_rate <= opts.cutoff_sample]
    after_sm_num = df2.shape[1]
    pct_sm = after_sm_num / float(before_sm_num) * 100
    print('{} samples before quality control.'.format(before_sm_num))
    # This line reports samples; it used to be labelled "markers".
    print('{}({:.1f}%) samples left after the quality control.'.format(after_sm_num, pct_sm))
    df2.to_csv(outputmatrix, sep='\t', index=True)
def correct(args):
    """
    %prog correct config.txt input.matrix

    Correct wrong genotype calls and impute missing values in biparental populations
    """
    # The docstring is handed to OptionParser below, so it stays in usage form.
    p = OptionParser(correct.__doc__)
    p.add_option("-c", "--configfile", help=SUPPRESS_HELP)
    p.add_option("-m", "--matrixfile", help=SUPPRESS_HELP)
    p.add_option('--itertimes', default=7, type='int',
                 help='maximum correction times to reach the stablized status')
    q = OptionGroup(p, "output options")
    p.add_option_group(q)
    q.add_option('--opp', default="'infer'",
                 help='specify the prefix of the output file names')
    q.add_option("--logfile", default='GC.correct.log',
                 help="specify the file saving running info")
    q.add_option('--debug', default=False, action="store_true",
                 help='trun on the debug mode that will generate a tmp file containing both original and corrected genotypes for debug use')
    # The cpus option stays registered for CLI compatibility; nothing in this
    # function reads it and samples are processed one after another.
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    # -m/-c (hidden options) take precedence over the positional file names.
    configfile, mapfile = args
    inputmatrix = opts.matrixfile or mapfile
    inputconfig = opts.configfile or configfile

    # "'infer'" (the quoted default) derives the output name from the input.
    if opts.opp == "'infer'":
        opf = inputmatrix.rsplit(".", 1)[0] + '.corrected.map'
    else:
        opf = '{}.map'.format(opts.opp)
    if Path(opf).exists():
        eprint("ERROR: Filename collision. The future output file `{}` exists".format(opf))
        sys.exit(1)

    logging.basicConfig(filename=opts.logfile, level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s:%(message)s")

    cargs = ParseConfig(inputconfig)
    if cargs.win_size % 2 == 0:
        eprint("ERROR: The slding window value cannot be even")
        sys.exit(1)
    logging.debug("Parameters in config file: {0}".format(cargs.__dict__))

    chr_order, chr_nums = getChunk(inputmatrix)
    # sep=r'\s+' is the documented, non-deprecated spelling of delim_whitespace=True.
    map_reader = pd.read_csv(inputmatrix, sep=r'\s+', index_col=[0, 1], iterator=True)

    genotype_chars = [cargs.gt_a, cargs.gt_b, cargs.gt_h, cargs.gt_miss]
    genotype_codes = [0, 2, 1, 9]

    tmp_chr_list = []
    for chrom in chr_order:
        logging.debug('{}...'.format(chrom))
        print('{}...'.format(chrom))
        df_chr_tmp = map_reader.get_chunk(chr_nums[chrom])
        marker_num, sample_num = df_chr_tmp.shape
        logging.debug('{} contains {} markers and {} samples.'.format(
            chrom, marker_num, sample_num))

        tmp_sm_list = []
        for sm in df_chr_tmp:
            logging.debug('Start correcting {}...'.format(sm))
            orig_seq = df_chr_tmp[sm]
            # Work on a positional index; the original index is restored below.
            seq_no_idx = orig_seq.reset_index(drop=True)
            seq_no_idx_num = seq_no_idx.replace(genotype_chars, genotype_codes)

            if seq_no_idx_num.shape[0] <= cargs.win_size:
                logging.debug('number of markers smaller than the window size, omit...')
                final_seq = seq_no_idx
            else:
                corrected_num = _correct_sequence(cargs, seq_no_idx_num, opts.itertimes)
                final_seq = corrected_num.replace(genotype_codes, genotype_chars)
            final_seq.index = orig_seq.index
            tmp_sm_list.append(final_seq)

        tmp_chr_list.append(pd.concat(tmp_sm_list, axis=1))

    df_corrected = pd.concat(tmp_chr_list)
    df_corrected.to_csv(opf, sep='\t', index=True)

    if opts.debug:
        logging.debug('generating the tmp file for debug use...')
        df_uncorrected = pd.read_csv(inputmatrix, sep=r'\s+', index_col=[0, 1])
        # Changed cells are written as 'corrected(original)'.
        df_debug = df_corrected.where(
            df_corrected == df_uncorrected,
            other=df_corrected + '(' + df_uncorrected + ')')
        df_debug.to_csv(opf + '.debug', sep='\t', index=True)
    print('Done!')


def _correct_sequence(cargs, seq_num, max_rounds):
    """Run up to *max_rounds* correction rounds on one numeric genotype sequence and return the result."""
    logging.debug('correction round 1...')
    latest = CorrectOO(cargs, seq_num)
    corrected_n = get_corrected_num(seq_num, latest.corrected)
    round_n = 2
    while round_n <= max_rounds:
        logging.debug('correction round %s...' % round_n)
        # Each round refines the previous round's output. The old loop always
        # re-fed the round-1 result, so later rounds could never differ.
        latest = CorrectOO(cargs, latest.corrected)
        corrected_n_new = get_corrected_num(seq_num, latest.corrected)
        round_n += 1
        # Stop once the count from get_corrected_num grows by at most 1%
        # (0.01 keeps the ratio defined when the previous count is 0).
        if (corrected_n_new - corrected_n) / float(corrected_n + 0.01) <= 0.01:
            break
        corrected_n = corrected_n_new
    # `latest` is always bound, including when max_rounds < 2; the old code
    # read an unbound (or stale) variable in that case.
    return latest.corrected
def vcf2map(args):
    """
    %prog vcf2map input.vcf output.matrix

    convert vcf format to genotype matrix format
    """
    # The docstring is handed to OptionParser below, so it stays in usage form.
    parser = OptionParser(vcf2map.__doc__)
    parser.add_option("-i", "--input", help=SUPPRESS_HELP)
    parser.add_option("-o", "--output", help=SUPPRESS_HELP)
    parser.add_option('--homo1', default="A",
                      help='character for homozygous genotype')
    parser.add_option("--homo2", default='B',
                      help="character for alternative homozygous genotype")
    parser.add_option('--hete', default='X',
                      help='character for heterozygous genotype')
    parser.add_option('--missing', default='-',
                      help='character for missing value')
    parser.add_option("--logfile", default='GC.vcf2map.info',
                      help="specify the log file")
    opts, args = parser.parse_args(args)
    if len(args) != 2:
        sys.exit(not parser.print_help())

    # -i/-o (hidden options) take precedence over the positional file names.
    positional_vcf, positional_out = args
    inputvcf = opts.input or positional_vcf
    outputmatrix = opts.output or positional_out

    if Path(outputmatrix).exists():
        eprint("ERROR: Filename collision. The future output file `{}` exists".format(outputmatrix))
        sys.exit(1)

    logging.basicConfig(filename=opts.logfile, level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s:%(message)s")

    # VCF genotype call -> single-character code used in the output matrix.
    right_gt = {
        '0|0': opts.homo1, '0/0': opts.homo1,
        '0|1': opts.hete, '1|0': opts.hete,
        '0/1': opts.hete, '1/0': opts.hete,
        '1|1': opts.homo2, '1/1': opts.homo2,
        '.|.': opts.missing, './.': opts.missing, '.': opts.missing,
    }
    useless_cols = ['ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
    index_cols = ['#CHROM', 'POS']

    # Count the leading '##' meta lines; the column header follows them.
    meta_lines = 0
    with open(inputvcf) as handle:
        for line in handle:
            if not line.startswith('##'):
                break
            meta_lines += 1

    chr_order, chr_nums = getChunk(inputvcf, ignore=meta_lines + 1)
    vcf_reader = pd.read_csv(inputvcf, header=meta_lines,
                             delim_whitespace=True,
                             usecols=lambda name: name not in useless_cols,
                             iterator=True)

    per_chrom = []
    for chrom in chr_order:
        logging.debug('{}...'.format(chrom))
        print('{}...'.format(chrom))
        block = vcf_reader.get_chunk(chr_nums[chrom]).set_index(index_cols)
        # Keep only the part before the first ':' of each cell, then
        # translate it; calls missing from the table become NaN.
        block = block.applymap(
            lambda cell: right_gt.get(cell.split(':')[0], np.nan))
        # Drop every marker that has at least one untranslatable call.
        per_chrom.append(block.dropna())

    df1 = pd.concat(per_chrom)
    df1.to_csv(outputmatrix, sep='\t', index=True)
    print('Done!')