def geno2geno(args):
    """ filter genotype lines against the included positions and write
        matching .geno.txt and .pos.txt files """

    genoparser = tabparser.GenotypeLineParser(args)
    sample_header = genoparser.get_sample_header()

    filename = args.outfile
    with open(filename + '.geno.txt', 'wt') as outfile, \
            open(filename + '.pos.txt', 'wt') as outpos:

        outfile.write(sample_header)
        outfile.write('\n')
        outpos.write(genoparser.get_position_header())
        outpos.write('\n')

        c = 0
        for posline, genoline in genoparser.parse_raw_lines():
            # split posline and check against the inclusion list
            tokens = posline.split()
            if (tokens[0], tokens[1]) in genoparser.include_positions:
                outfile.write(genoline)
                outpos.write(posline)
                c += 1

    cerr('I: writing %d positions' % c)
def prepare_stratified_samples(haplotypes, group_keys, k_fold,
                               haplotype_func=None):
    """ check the suitability of sample sets and modify haplotypes
        and group_keys properly """

    groups = []
    for group_key, count in zip(*np.unique(group_keys, return_counts=True)):
        # ensure that every group has at least 2 * k_fold members
        if count < k_fold * 2:
            groups.append((group_key, math.ceil(k_fold * 2 / count)))

    if len(groups) == 0:
        # nothing to modify
        return (haplotypes, group_keys)

    cerr('[I - prepare_stratified_samples() replicated group: %s]'
         % ' '.join(x[0] for x in groups))

    new_haplotypes = [haplotypes]
    new_group_keys = [group_keys]
    for group_key, m_factor in groups:
        indexes = np.where(group_keys == group_key)
        for i in range(m_factor):
            new_haplotypes.append(haplotypes[indexes])
            new_group_keys.append(group_keys[indexes])

    haplotypes = np.concatenate(new_haplotypes, axis=0)
    group_keys = np.concatenate(new_group_keys, axis=0)

    return (haplotypes, group_keys)
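# A minimal usage sketch for prepare_stratified_samples(), on hypothetical
# data (not part of the original module; assumes numpy imported as np):
def _demo_prepare_stratified_samples():
    haplotypes = np.zeros((5, 10), dtype=np.int8)   # 5 samples x 10 SNPs
    group_keys = np.array(['A', 'A', 'A', 'A', 'B'])
    # with k_fold = 3, any group smaller than 2 * 3 = 6 members (here both
    # A and B) is replicated until it reaches at least that size
    haps, keys = prepare_stratified_samples(haplotypes, group_keys, k_fold=3)
    cerr('[demo - %d -> %d samples]' % (len(group_keys), len(keys)))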
def parse_snps(self):
    """ this is a generator that yields (legends, genotype) """

    with open(self.genofile) as genofile, open(self.legendfile) as legendfile:

        # sanity check for sample number in genotype file
        header = next(genofile).strip()
        samples = header.split()
        if len(samples) != self.no_of_samples:
            cexit('ERR: no of samples in infile does not match the sample file')

        # skip the legend header
        next(legendfile)

        # generate genotype line by line
        for (idx, line) in enumerate(zip(genofile, legendfile)):
            geno_line, legend_line = line
            tokens = geno_line.strip().split()
            legends = legend_line.replace('"', '').strip().split()
            seqid = legends[0]
            pos = int(legends[1])
            if len(tokens) != self.no_of_samples:
                cerr('ERR: line %d - no of samples in infile does not match '
                     'the sample file' % (idx + 1))
            g = self.translate(tokens)
            yield (legends, g)
def dist2clonalqc(args):

    # read distance matrix
    df = pd.read_csv(args.infile, sep='\t')
    samples = df.columns
    D = df.values

    # read quality file or pickled ralt/nalt file
    if args.datafile:
        nalt_args = SimpleNamespace(infile=args.datafile, fmt=args.fmt, n=-1)
        nalt_parser = naltparser.NAltLineParser(nalt_args,
                                                with_group=False,
                                                with_position=False)
        region = nalt_parser.parse_whole()
        qual = np.count_nonzero(region.M == -1, axis=0)
    else:
        cexit('ERR: other input file has not been defined')

    clonal_samples = clonal_index(D, qual.max() - qual, samples, args.threshold)
    cerr('[I - removing %d clonal samples]' % len(clonal_samples))

    if args.outfile:
        np.savetxt(args.outfile, samples[clonal_samples], fmt='%s')
def prune_2(genotypes, positions, threshold=0.5, score=None):
    """ prune by r^2, except on CDS; only CDS within the same
        segment/region will be pruned against each other """

    if score is None:
        # we use MAC as the default score
        score = np.min(genotypes.count_alleles(), axis=1)

    N = len(genotypes)
    if N != len(score) or N != len(positions):
        cexit('E: length of genotypes != length of score or positions!')

    index = arrange_index(score)
    compress_index = np.ones(len(index), dtype=np.int8)

    # calculate r^2
    r_2 = calculate_r_2(genotypes)

    # walk through the index
    cerr('I: scanning r^2 matrix')
    for i in range(N):
        if not compress_index[i]:
            continue
        for j in index[i + 1:]:
            if r_2[i, j] > threshold:
                # keep SNP j if it sits on a different CDS region than SNP i
                if positions[j][4] and positions[j][4] != positions[i][4]:
                    continue
                compress_index[j] = 0

    return compress_index
def prune_1(genotypes, threshold=0.5, score=None):
    """ prune by r^2 with score as priority, returning an indexing array """

    N = len(genotypes)
    if score is None:
        # we use MAC as the default score
        score = np.min(count_allele(genotypes), axis=1)

    if N != len(score):
        cexit('E: length of genotypes != length of score! ({} vs {})'.format(
            N, len(score)))

    index = arrange_index(score)
    compress_index = np.ones(len(index), dtype=np.int8)

    # calculate r^2
    r_2 = calculate_r_2(genotypes)

    count = 0

    # walk through the index
    cerr('[I - pruning for {} SNPs]'.format(N))
    for i in range(N):
        if not compress_index[i]:
            continue
        for j in index[i + 1:]:
            if r_2[i, j] > threshold:
                compress_index[j] = 0
                count += 1

    pruned_index = np.nonzero(compress_index)[0]
    return pruned_index
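# A hypothetical usage sketch for prune_1() (not part of the original module);
# it assumes `genotypes` is an ndarray-like genotype matrix accepted by
# count_allele() and calculate_r_2(), and that integer-array indexing applies:
def _demo_prune_1(genotypes):
    # keep only SNPs whose pairwise r^2 with a higher-scored SNP is <= 0.2
    kept = prune_1(genotypes, threshold=0.2)
    cerr('[demo - kept %d of %d SNPs]' % (len(kept), len(genotypes)))
    return genotypes[kept]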
def calculate_required_aspect(data):

    total_width = 1.37 * len(data['REG'].unique())
    total_height = total_width / 1.42 / 2    # 1.42 ~ rough approximation of sqrt(2)
    cerr('[I - Using figure size: {:.2f} x {:.2f} inches]'.format(
        total_width, total_height))

    return total_width, total_height
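# A quick worked example of the sizing rule above, on hypothetical data
# (assumes pandas is available; not part of the original module):
def _demo_calculate_required_aspect():
    import pandas as pd
    data = pd.DataFrame({'REG': ['R%d' % (i % 10) for i in range(50)]})
    w, h = calculate_required_aspect(data)
    # width = 1.37 * 10 = 13.70 in, height = 13.70 / 1.42 / 2 ~ 4.82 in
    assert abs(w - 13.70) < 1e-6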
def align(seqs, method=None, matrix='DNA', degap=True):
    """ align a list of sequences in seqs, returning a list of
        aligned sequences """

    if len(seqs) == 2:
        # perform pairwise alignment
        from seqpy.core.pwaligner import calign

        if degap:
            s_0 = degapped(seqs[0])
            s_1 = degapped(seqs[1])
        else:
            s_0 = seqs[0]
            s_1 = seqs[1]

        if not method:
            method = 'global_cfe'
        a_0, a_1, score = calign.aligner(s_0.upper(), s_1.upper(),
                                         method=method, matrix=matrix)
        cerr('pairwise aligned with score: %f' % score)
        return (preserve_case(s_0, a_0), preserve_case(s_1, a_1), score)

    elif len(seqs) > 2:
        # perform multiple sequence alignment
        if method is None or method.startswith('muscle'):
            # muscle-based MSA is not implemented here yet
            pass

    else:
        raise RuntimeError('Alignment must involve 2 or more sequences')
def save(self, fmt, prefixname=None, autofilename=False, with_position=False):

    # infer the data type from the matrix dtype
    if self.M.dtype == np.int8:
        datatype = 'nalt'
    else:
        datatype = 'ralt'

    if autofilename:
        prefixname = '%s-%d-%d' % ('r' if datatype == 'ralt' else 'n',
                                   len(self.df_M.columns), len(self.M))

    outmatrix = prefixname + ('.ralt' if datatype == 'ralt' else '.nalt')

    if fmt == 'pickle':
        outmatrix = outmatrix + '.pickle.gz'
        self.df_M.to_pickle(outmatrix)
    elif fmt == 'npy':
        outmatrix = outmatrix + '.npy.gz'
        with gzopen(outmatrix, 'wb') as f:
            a = np.array([np.array(self.df_M.columns), self.df_M.values])
            np.save(f, a)
    else:
        outmatrix = outmatrix + '.txt.gz'
        self.df_M.to_csv(outmatrix, sep='\t', index=False)
    cerr('[I - writing genotype data to %s]' % outmatrix)

    if with_position:
        outpos = prefixname + '.pos.txt.gz'
        self.df_P.to_csv(outpos, sep='\t', index=False)
        cerr('[I - writing position data to %s]' % outpos)
def geno2filtindv(args):

    cerr('I: reading genotype file')
    genoparser = tabparser.GenotypeLineParser(args)

    cerr('I: generating haplotypes')
    haplotypes = genoparser.parse_haplotypes()

    cerr('I: scanning haplotypes')
    flags = [True] * len(haplotypes)
    for idx, haplo in enumerate(haplotypes):
        missingness = haplo.count(b'-') / len(haplo)
        if missingness > args.cutoff:
            flags[idx] = False

    cerr('I: filtering samples')
    genoparser2 = tabparser.GenotypeLineParser(args)
    with open(args.outfile, 'w') as outfile:
        samples = itertools.compress(genoparser2.samples, flags)
        outfile.write('\t'.join(samples))
        outfile.write('\n')
        for posline, genoline in genoparser2.parse_raw_lines():
            new_genotypes = itertools.compress(genoline.strip().split(), flags)
            outfile.write('\t'.join(new_genotypes))
            outfile.write('\n')

    cerr('I: writing for %d samples' % flags.count(True))
def __init__(self, vcffile, chroms=None, filters='', **kwargs):
    self.vcffile = vcffile
    self._hdl = open(self.vcffile)

    # parse filters
    cerr('Filters: %s' % filters)
    self.filters = {}
    for filter_item in filters.split(','):
        if not filter_item:
            continue
        if '=' in filter_item:
            k, v = filter_item.split('=', 1)
            k = k.strip()
            self._check_keyword(k)
            self.filters[k] = float(v.strip())
        else:
            filter_item = filter_item.strip()
            self._check_keyword(filter_item)
            self.filters[filter_item] = True

    self.sample_labels = None
    if chroms and ',' in chroms:
        chroms = [c.strip() for c in chroms.split(',')]
    self.chroms = chroms

    self.installed_filters = [
        self._filter_MissingThreshold,
        self._filter_HetThreshold,
        self._filter_MAF,
        self._filter_MAC,
    ]
    self.dp = 0 if 'DP' not in kwargs else int(kwargs['DP'])
    self.ad = 0 if 'AD' not in kwargs else int(kwargs['AD'])
    self.init_params(**kwargs)
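# A hypothetical construction example for the parser above; the class name
# VCFParser and the filter keywords are assumptions (whatever _check_keyword()
# accepts), only the filter-string syntax follows the parsing code:
#
#   parser = VCFParser('variants.vcf', chroms='Pf3D7_01_v3,Pf3D7_02_v3',
#                      filters='MAF=0.01,MAC', DP=5)
#
# 'key=value' items become float-valued entries in self.filters, and bare
# items become boolean flags.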
def D(level, text):
    # report the caller's function name, file and line number with the message
    cf = getouterframes(currentframe())[1]
    if level >= debug_level:
        cerr("[%s] %s [%s:%s]:: %s " % (time.strftime('%H:%M:%S'),
                                        cf[3], cf[1], cf[2], text))
def do_export_ralt(M, sample_idx, site_idx, indv_idx, args):

    cerr('[I - exporting sample and position indexes for %d samples]' % args.s)
    collected_samples = indv_idx[:args.s]
    filt_site_idx, inf_site_idx = filter_site_idx(M, collected_samples,
                                                  site_idx, mac=args.mac)
    np.savetxt('exhqc.indv.txt', collected_samples, fmt='%d')
    np.savetxt('exhqc.pos.txt', inf_site_idx, fmt='%d')
def main():
    greet()
    if len(sys.argv) == 1:
        usage()

    if sys.argv[1].endswith('.py'):
        # execute a script file
        seqpy.cerr('Attempting to run script: %s' % sys.argv[1])
        with open(sys.argv[1]) as fh:
            code = compile(fh.read(), sys.argv[1], 'exec')
        sys.argv = sys.argv[1:]
        _l = {}
        exec(code, None, _l)
        if 'main' in _l:
            globals().update(_l)
            main = _l['main']
            if 'init_argparser' in _l:
                init_argparser = _l['init_argparser']
                p = init_argparser()
                args = p.parse_args(sys.argv[1:])
                main(args)
            else:
                main()

    elif sys.argv[1] == '-i':
        # interactive mode
        pass

    else:
        from seqpy import cmds
        cmds.execute(sys.argv[1:])
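# A minimal example of a script file that the dispatcher above can run; the
# file name and option are hypothetical, but the entry points follow the
# names (init_argparser() and main(args)) looked up in main():
#
#   # myscript.py
#   import argparse
#
#   def init_argparser():
#       p = argparse.ArgumentParser()
#       p.add_argument('--infile', required=True)
#       return p
#
#   def main(args):
#       print('processing %s' % args.infile)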
def barplot(args):

    cerr('I: reading data...')
    df = pandas.read_table(args.infile)

    column = df.columns[args.column - 1]
    cerr('I: selecting column %s' % column)

    if args.asc:
        cerr('I: sorting ascending...')
        df = df.sort_values(column)
    elif args.desc:
        cerr('I: sorting descending...')
        df = df.sort_values(column, ascending=False)

    heights = df[column]

    cerr('I: plotting...')
    #plt.bar( np.arange(0, len(heights)), heights, 1.0)
    #plt.plot( heights )
    plt.scatter(np.arange(0, len(heights)), heights, 0.25)
    if args.xlabel:
        plt.xlabel(args.xlabel)
    if args.ylabel:
        plt.ylabel(args.ylabel)
    if args.title:
        plt.title(args.title)
    plt.savefig(args.outfile, dpi=args.dpi)
def parse_haplotypes(self, maxline=-1):
    """ return a list of haplotype byte strings such as:
        [ b'0000022020', b'0002020000' ]
    """

    if not self.posfile:
        self.parse_position_header()

    M = []
    for (idx, paired_line) in enumerate(zip(self.posfile, self.infile)):
        if maxline > 0 and idx >= maxline:
            break
        posline, genoline = paired_line
        if self.include_positions:
            posinfo = posline.strip('\n').split('\t')
            if (posinfo[0], posinfo[1]) not in self.include_positions:
                continue
        tokens = genoline.strip().split('\t')
        # take the first character (first allele) of each genotype token
        M.append([x[0] for x in tokens])

    cerr('I: haplotyping for %d SNP positions' % len(M))

    # transpose so that each row becomes one sample's haplotype
    M_t = [*zip(*M)]
    H = [''.join(x).encode('UTF-8') for x in M_t]
    return H
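# A small sketch of the transpose step above, on hypothetical data
# (not part of the original module):
def _demo_haplotype_transpose():
    # 2 SNP rows x 3 samples -> 3 sample haplotypes of length 2
    M = [['0', '2', '0'], ['1', '0', '0']]
    H = [''.join(x).encode('UTF-8') for x in zip(*M)]
    assert H == [b'01', b'20', b'00']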
def txt2select(args):

    df = pandas.read_table(args.infile, delimiter='\t', na_values=' nan')

    filtered_positions = {}
    for i in args.column.split(','):
        i = int(i) - 1
        column = df.columns[i]
        cerr('I: selecting with column %s' % column)
        if args.minthreshold is not None:
            df_filtered = df[df[column] > args.minthreshold]
        else:
            df_filtered = df.nlargest(args.topmax, column)
        for r in df_filtered.itertuples():
            filtered_positions[(r[1], int(r[2]))] = True

    sorted_positions = sorted(filtered_positions.keys())
    with open(args.outfile, 'w') as outfile:
        outfile.write('CHROM\tPOS\n')
        for k in sorted_positions:
            outfile.write('%s\t%d\n' % k)

    cerr('I: writing %d positions' % len(sorted_positions))
def cross_validate(models, haplotypes, group_keys, repeats, fold, outfile,
                   outsnp=None, logfile=None, outpred=None, procs=1):
    """ distribute the repeats over multiple processes """

    start_time = time.monotonic()
    cerr('[I - cross_validate() for %d model(s)]' % len(models))

    seed = np.random.randint(1e8)
    group_keys = np.asarray(group_keys)

    arguments = [(group_keys, fold, seed + n) for n in range(repeats)]

    worker_func = cross_validate_worker
    run_worker(models, haplotypes, arguments, worker_func, procs,
               outfile, outsnp, logfile, outpred)

    cerr('[I - cross_validate() finished in %6.2f minute(s) at %s]'
         % ((time.monotonic() - start_time) / 60, datetime.datetime.now()))
def get_position_indexes(self, poslines):

    # create a nested dictionary of d[chrom][pos] = index
    d = {}
    for i, line in enumerate(self.P):
        try:
            d[line[0]][line[1]] = i
        except KeyError:
            d[line[0]] = {line[1]: i}

    indexes = []
    counter = 0
    for line in poslines:
        if not line:
            continue
        try:
            counter += 1
            indexes.append(d[line[0]][int(line[1])])
        except KeyError:
            cerr('[I - warning: position not found: %s %s]' % (line[0], line[1]))

    if len(indexes) < counter:
        cerr('[I - warning: only found %d out of %d positions]'
             % (len(indexes), counter))
    return indexes
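# A small sketch of the nested lookup built above, on hypothetical rows
# (using setdefault instead of try/except, to the same effect):
def _demo_position_index():
    P = [('chr1', 100), ('chr1', 250), ('chr2', 100)]
    d = {}
    for i, (chrom, pos) in enumerate(P):
        d.setdefault(chrom, {})[pos] = i
    assert d['chr1'][250] == 1 and d['chr2'][100] == 2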
def pos2bed_microhaps(args, positions):

    if args.namecol < 0:
        cexit('ERR: microhaps mode needs --namecol option!')

    with open(args.outfile, 'w') as fout:
        mh_name = ''
        mh_seq = ''
        mh_1pos = -1
        mh_2pos = -1
        for entry in positions:
            seq = entry[0]
            pos = int(entry[1])
            name = entry[args.namecol]
            if name == mh_name and mh_seq == seq:
                # still within the same microhaplotype, extend the end position
                mh_2pos = pos
                continue
            if mh_name:
                fout.write('%s\t%d\t%d\t%s\n' % (mh_seq, mh_1pos, mh_2pos, mh_name))
            mh_name = name
            mh_seq = seq
            mh_1pos = pos - 1
            mh_2pos = pos
        # write the last microhaplotype
        fout.write('%s\t%d\t%d\t%s\n' % (mh_seq, mh_1pos, mh_2pos, mh_name))

    cerr('[I - writing microhap-based BED to %s]' % args.outfile)
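# A worked example of the grouping above, with hypothetical position entries
# (namecol pointing at the microhap name column):
#
#   ('Pf01', '100', ..., 'mh-A')
#   ('Pf01', '120', ..., 'mh-A')
#   ('Pf02', '500', ..., 'mh-B')
#
# produces the half-open BED intervals:
#
#   Pf01    99     120    mh-A
#   Pf02    499    500    mh-B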
def filter_mac(self, mac=1, inplace=True):

    # get position indexes whose MAC >= mac
    snpindex = self.get_snpindex(mac=mac)
    # assumption: self.M holds the current SNP matrix, so len(self.M) is the
    # pre-filter SNP count (the original referenced an undefined allele_mac)
    cerr('[I - filtering MAC = %d from %d SNPs to %d SNPs]'
         % (mac, len(self.M), len(snpindex)))
    return self.filter_positions(snpindex, inplace)
def seq2fst(args):

    # open and read the sequence file
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # open and read the group/meta file, using groupfile/metafile if available
    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}

        for seq in seqs:
            try:
                grp = group_parser.group_info[seq.label.decode('ASCII')]
            except KeyError:
                cerr('[W - sample %s is not assigned to any group]'
                     % seq.label.decode('ASCII'))
                continue
            if grp in group_seqs:
                group_seqs[grp].append(seq)
            else:
                ms = multisequence()
                ms.append(seq)
                group_seqs[grp] = ms
    else:
        cexit('[ERR - seq2fst.py requires group information!]')

    for grp_seq in group_seqs:
        cerr('[I - group %s has %d sample(s)]'
             % (grp_seq, len(group_seqs[grp_seq])))

    if args.sitefile:
        # perform site-wise FST
        FST_sites = calc_site_fst(group_seqs, args.nantozero)

        with open(args.sitefile, 'w') as fout:
            for (label, mat) in FST_sites:
                fout.write(label)
                fout.write('\t')
                np.savetxt(fout, mat, fmt='%5.4f', delimiter='\t', newline='\t')
                fout.write('\n')

        cerr('[I - site FST written to %s]' % args.sitefile)
        return

    FST_mat, groups = calc_fst(group_seqs)

    with open(args.outfile, 'w') as fout:
        fout.write('\t'.join(groups))
        fout.write('\n')
        np.savetxt(fout, FST_mat, fmt='%5.4f', delimiter='\t')
def vcf2ped(args):
    """ create a ped and map file based on vcf and metafile,
        suitable for isoRelate """

    # open group file
    group_parser = grpparser.GroupParser(args)

    # open VCF file
    cerr('[I: reading VCF...]')
    start_time = time.monotonic()
    vcfset = allel.read_vcf(args.infile,
                            fields=['samples', 'variants/CHROM',
                                    'variants/POS', 'calldata/GT'])
    cerr('[I: read %s sites, %s samples in %d secs]'
         % (len(vcfset['variants/CHROM']), len(vcfset['samples']),
            time.monotonic() - start_time))

    # assign groups
    samples = vcfset['samples']
    group_parser.assign_groups(samples)
    groups = group_parser.group_keys

    # write to PED
    with open(args.outprefix + '.ped', 'w') as outf:
        for i in range(len(samples)):
            outf.write('%s\t%s\t0\t0\t1\t0\t' % (groups[i], samples[i]))
            alleles = []
            for gt in vcfset['calldata/GT'][:, i]:
                allele_1, allele_2 = gt
                if allele_1 == allele_2:
                    if allele_1 == -1:
                        alleles += [0, 0]     # missing
                    elif allele_1 == 0:
                        alleles += [1, 1]     # homozygous reference
                    elif allele_1 == 1:
                        alleles += [2, 2]     # homozygous alternate
                    else:
                        alleles += [1, 1]
                else:
                    alleles += [1, 2]         # heterozygous
            outf.write('\t'.join(str(a) for a in alleles))
            outf.write('\n')

    # write to MAP
    with open(args.outprefix + '.map', 'w') as outf:
        last_pos = 0
        curr_chr = None
        for (chrom, pos) in zip(vcfset['variants/CHROM'],
                                vcfset['variants/POS']):
            if curr_chr != chrom:
                curr_chr = chrom
                last_pos = 0
            dist = (pos - last_pos) * 1e-6
            last_pos = pos
            outf.write('%s\t%s:%d\t%8.6f\t%d\n' % (chrom, chrom, pos, dist, pos))
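# For reference, a hypothetical sketch of the rows produced above:
#
#   .ped:  KEL  S001  0  0  1  0  1  1  2  2  1  2  ...
#   .map:  Pf3D7_01_v3  Pf3D7_01_v3:100  0.000100  100
#
# i.e. family ID = group key, and genetic distance is approximated as
# 1e-6 * bp distance from the previous site on the same chromosome.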
def filter_imiss(M, site_idx, sample_idx, imiss):

    cerr('[I - filtering for sample missingness < %4.3f]' % imiss)
    check_sanity(M, site_idx, sample_idx)

    indv_missingness = np.count_nonzero(M < 0, axis=0) / len(site_idx)
    # note: the threshold is relative to the worst sample, i.e. keep samples
    # whose missingness is at most (1 - imiss) of the maximum missingness
    indexes = np.where(indv_missingness <= (1.0 - imiss) * indv_missingness.max())

    M2 = M[:, indexes[0]]
    sample_idx2 = sample_idx[indexes[0]]
    cerr('[I - keeping %d from %d samples]' % (len(sample_idx2), len(sample_idx)))

    return M2, site_idx, sample_idx2
def fas2table(args):

    msa = load(args.infile)
    ref = load(args.reffile)
    table = generate_table(msa, ref)
    with open(args.outfile, 'w') as fout:
        for (label, muts) in table:
            fout.write('%s\t%s\n' % (label, ' '.join(muts)))
    cerr('[Writing table to %s]' % args.outfile)
def geno2fst(args):

    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.diploid_translator)

    cout('Grouping:')
    groups = lineparser.parse_grouping()
    for k in groups:
        cout(' %12s %3d' % (k, len(groups[k])))

    # FST indexed by group_keys
    FST = []
    group_keys = sorted(groups.keys())
    cout(group_keys)

    # output to file
    cout('Writing outfile...')
    outfile = open(args.outfile, 'w')
    outfile.write('CHROM\tPOS\tREGION\tMAX\tMEAN\tMEDIAN\tMAF\t%s\n'
                  % '\t'.join(group_keys))

    idx = 0
    for (posinfo, genolist) in lineparser.parse():
        idx += 1
        genoarray = allel.GenotypeArray([genolist])

        # calculate MAF
        ac = genoarray.count_alleles()
        num = np.min(ac)
        denom = np.sum(ac)
        maf = 0 if num == denom else num / denom

        # calculate FST per group against all other samples
        fst_sites = []
        for g in group_keys:
            ac_g = genoarray.count_alleles(subpop=groups[g])
            ac_ng = genoarray.count_alleles(
                subpop=list(lineparser.sample_idx - set(groups[g])))
            num, den = allel.stats.hudson_fst(ac_g, ac_ng)
            fst = num[0] / den[0]
            if not (0.0 <= fst <= 1.0):
                fst = 0
            fst_sites.append(fst)

        if idx % 100 == 0:
            cerr('I: writing position no %d' % idx)

        outfile.write('%s\t%s\t%s\t%5.4f\t%5.4f\t%5.4f\t%5.4f\t%s\n'
                      % (posinfo[0], posinfo[1], posinfo[4],
                         np.max(fst_sites), np.mean(fst_sites),
                         np.median(fst_sites), maf,
                         '\t'.join('%5.4f' % x for x in fst_sites)))
def filter_lmiss(M, site_idx, sample_idx, lmiss):

    cerr('[I - filtering for SNP missingness < %4.3f]' % lmiss)
    check_sanity(M, site_idx, sample_idx)

    site_missingness = np.count_nonzero(M < 0, axis=1) / len(sample_idx)
    # note: as in filter_imiss(), the threshold is relative to the worst site
    indexes = np.where(site_missingness <= (1.0 - lmiss) * site_missingness.max())

    M2 = M[indexes[0], :]
    site_idx2 = site_idx[indexes[0]]
    cerr('[I - keeping %d from %d sites]' % (len(site_idx2), len(site_idx)))

    return M2, site_idx2, sample_idx
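# A small numeric sketch of the relative threshold used by the two filters
# above, on hypothetical values: with site missingness [0.0, 0.1, 0.5] and
# lmiss = 0.8, the cut-off is (1 - 0.8) * 0.5 = 0.1, so the first two sites
# are kept and the worst one is dropped.
def _demo_relative_threshold():
    miss = np.array([0.0, 0.1, 0.5])
    kept = np.where(miss <= (1.0 - 0.8) * miss.max())[0]
    assert list(kept) == [0, 1]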
def ralt2nalt(args):

    ralt_parser = naltparser.NAltLineParser(args, datatype='ralt',
                                            with_group=False,
                                            with_position=False)
    region = ralt_parser.parse_whole()

    # convert to n_alt
    cerr('[I - converting to nalt format]')
    cerr('[ M dtype: {}]'.format(region.M.dtype))
    region.ralt_to_nalt(hetratio=args.hetratio if not args.major else -1)
    cerr('[ M dtype: {}]'.format(region.M.dtype))
    region.save(args.outfmt, prefixname=args.outfile,
                autofilename=args.autofilename, with_position=False)
    return

    # NOTE: the block below is unreachable because of the return above;
    # it is kept as the previous text-based output path.
    with open(args.outfile, 'w') as outfile:
        # write header
        outfile.write(ralt_parser.get_sample_header())
        outfile.write('\n')
        np.savetxt(outfile, region.M, fmt='%d', delimiter='\t')
    cerr('[I: finish writing to %s]' % args.outfile)
def ralt2iterqc(args):

    cerr('[I - reading input files]')
    start_time = time.monotonic()
    df = pd.read_csv(args.infile, sep='\t', dtype=float,
                     nrows=args.n if args.n > 0 else None)
    samples = df.columns
    sample_idx = np.arange(len(samples))
    M = df.values
    site_idx = np.arange(len(M))
    cerr('[I - reading %d sites for %d samples in %d secs]'
         % (len(site_idx), len(sample_idx), time.monotonic() - start_time))

    for i in range(args.iter):
        cerr('[I - ITER -> %d]' % (i + 1))
        site_N = len(site_idx)
        sample_N = len(sample_idx)
        if args.lmiss > 0:
            M, site_idx, sample_idx = filter_lmiss(M, site_idx, sample_idx,
                                                   args.lmiss)
        if args.imiss > 0:
            M, site_idx, sample_idx = filter_imiss(M, site_idx, sample_idx,
                                                   args.imiss)
        if args.mac > 0:
            M, site_idx, sample_idx = filter_mac(M, site_idx, sample_idx,
                                                 args.mac)
        if site_N == len(site_idx) and sample_N == len(sample_idx):
            cerr('[I - filtering has converged]')
            break
def assign_groups(self, samples):

    if not self.group_info:
        self.parse()

    groups = {}
    sample_idx = []
    group_keys = []
    for idx, code in enumerate(samples):
        grp_key = self.group_info[code]
        if grp_key in groups:
            groups[grp_key].append(idx)
        else:
            groups[grp_key] = [idx]
        sample_idx.append(idx)
        group_keys.append(grp_key)

    self.samples = samples
    self.sample_idx = set(sample_idx)
    self.groups = groups
    self.group_keys = group_keys

    if self.colourfile:
        # parse colour file
        self.colourfile.seek(0)
        next(self.colourfile)
        for line in self.colourfile:
            tokens = line.strip().split('\t')
            self.group_colours[tokens[0]] = tokens[1]

        # check whether all groups have been assigned colours
        for k in self.groups:
            if k not in self.group_colours:
                cexit('E: group %s is not assigned a colour' % k)
        cerr('[I: assigning manual colours to %d groups]'
             % len(self.group_colours))
    else:
        colour_wheel = cycle(colour_list)
        for k in sorted(self.groups.keys()):
            self.group_colours[k] = next(colour_wheel)
        if len(self.groups.keys()) > len(colour_list):
            cerr('W: warning, no of groups (%d) exceeds available colour list!'
                 % len(self.groups.keys()))

    return self.groups
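# A hypothetical usage sketch for assign_groups(); the sample codes and the
# group_info mapping are made up, and GroupParser construction is elided:
#
#   parser.group_info = {'S001': 'KEL', 'S002': 'KEL', 'S003': 'PNG'}
#   groups = parser.assign_groups(['S001', 'S002', 'S003'])
#   # groups == {'KEL': [0, 1], 'PNG': [2]}
#   # parser.group_colours now maps each group key to a colour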