def main(args):
    """Relabel every loaded sequence with a zero-padded running number.

    All files in ``args.files`` are loaded with ``bioio.load`` and merged
    into a single container.  Each sequence label is replaced by its
    1-based position formatted as ``%04d``.  When ``args.outfile`` is set
    the relabelled container is saved there; when ``args.tabfile`` is set a
    tab-separated ``new_label<TAB>old_label`` table is written to it.
    """
    merged = None
    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(loaded), path))
        if merged is None:
            merged = loaded
        else:
            merged += loaded

    # remember (new, old) label pairs in container order
    label_map = []
    for number, entry in enumerate(merged, 1):
        numbered = '%04d' % number
        label_map.append((numbered, entry.label))
        entry.label = numbered

    if args.outfile:
        bioio.save(merged, args.outfile, options=args.io_opts or [])

    if args.tabfile:
        with open(args.tabfile, 'w') as tab:
            tab.writelines('%s\t%s\n' % pair for pair in label_map)
def view(self, obj):
    """Show *obj* in a frame chosen from the attributes it exposes.

    An object with an ``edit_bases`` attribute is wrapped in a trace model
    and shown in a ``TraceFrame``; otherwise an object with a ``type``
    attribute is wrapped in an ``MSA`` model and shown in a
    ``SequenceFrame``.  Any other object is printed and otherwise ignored.

    The frame goes into the current main window when that window has no
    main frame yet, and into a newly created ``IMainWindow`` otherwise.
    """
    model = None
    frame_cls = None

    print(obj)
    if hasattr(obj, 'edit_bases'):
        print(obj)
        from insane.core.trace.frame import TraceFrame
        from insane.core.trace.model import tracemodel
        model = tracemodel(obj)
        frame_cls = TraceFrame
    elif hasattr(obj, 'type'):
        from insane.core.msa.frame import SequenceFrame
        from insane.core.msa.model import MSA
        model = MSA( obj )
        frame_cls = SequenceFrame
        cout('MSA prepared')

    # nothing displayable was recognised
    if not (frame_cls and model is not None):
        return

    frame = frame_cls( self.get_mainwin().default_env, model )
    cout('Frame created')

    current = self.get_mainwin()
    if current.mainframe() is not None:
        # the current window is occupied, so open the frame in a new one
        from insane.core.main.mainwin import IMainWindow
        fresh = IMainWindow()
        fresh.setWindowTitle( model.filename() + ' - seqpy/InSAnE' )
        fresh.show_centralwidget( frame )
        fresh.show()
        return

    current.setWindowTitle( model.filename() + ' - seqpy/InSAnE' )
    current.show_centralwidget( frame )
def start_app(arg):
    """Start the Qt GUI and block until its event loop exits.

    ``arg`` selects what the main window opens:

    * a list with at least two items -- ``arg[1]`` is passed to ``load``
      (the list is also handed to ``QApplication`` if one must be created);
    * a ``str`` -- passed to ``load``;
    * anything else -- passed to ``view``.
    """
    global app

    cout("Starting GUI\n")
    arg_is_list = type(arg) == list

    # reuse the module-level application, then any existing Qt instance,
    # and only create a new QApplication as a last resort
    if not app:
        app = QtGui.QApplication.instance()
        if not app:
            app = QtGui.QApplication(arg if arg_is_list else ['__builtin__'])

    window = IMainWindow()
    if arg_is_list and len(arg) >= 2:
        window.load(arg[1])
    elif type(arg) == str:
        window.load(arg)
    else:
        cout('viewing...')
        window.view(arg)
    window.show()
    app.exec_()

    # NOTE(review): RuntimeError is swallowed here, presumably because the
    # underlying Qt object may already be gone after exec_() -- confirm
    try:
        window.hide()
        del window
    except RuntimeError:
        pass
def main(args):
    """Translate the sequences of ``args.files`` and save them.

    When ``args.start_sequence`` is given it is upper-cased and
    ASCII-encoded, and each sequence is degapped, upper-cased and searched
    for that pattern.  Sequences with anything other than exactly one hit
    are skipped; the others are translated starting at the hit.  Without a
    start sequence, translation starts at ``args.start_codon``.  All
    translated clones are saved to ``args.outfile``.
    """
    translated_set = bioio.multisequence()
    if args.start_sequence:
        args.start_sequence = args.start_sequence.upper().encode('ASCII')

    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts)
        cout('reading %d sequences from %s' % (len(loaded), path))

        for source in loaded:
            clone = source.clone()

            if not args.start_sequence:
                clone.set_sequence(
                    funcs.translated(source, start_pos=args.start_codon))
                translated_set.append(clone)
                continue

            # the restriction-site search is reused to locate the start
            # pattern within the cleaned-up sequence
            cleaned = funcs.uppercased(funcs.degapped(source))
            hits = funcs.search_restriction_site(cleaned, args.start_sequence)
            if len(hits) != 1:
                continue
            offset = hits[0][0]
            print(cleaned[offset:offset + 30])
            clone.set_sequence(funcs.translated(cleaned, start_pos=offset + 1))
            translated_set.append(clone)

    bioio.save(translated_set, args.outfile)
def __init__(self, view, blink=True):
    """Create a caret anchored to *view*.

    *view* must not already own a caret (``view._caret`` is asserted to be
    ``None``).  Index, position, coordinate and size fields are all
    initialised to ``-1``; *blink* is stored as the blink flag.
    """
    super(SequenceCaret, self).__init__()
    cout('SequenceCaret.__init__() executed!')
    assert view._caret is None

    # the view this caret is anchored to
    self._view = view

    # current and next caret location, each as (idx, pos)
    self.cur_idx = self.cur_pos = -1
    self.next_idx = self.next_pos = -1

    # caret coordinate: x is translated from pos, y from idx
    self.cur_x = self.cur_y = -1

    # caret size
    self.w = self.h = -1

    # blinking state
    self._blink = blink
    self._counter = 0
    self._timerid = None
    self._visible = False
    self._revimg = self._norimg = None
def align(seqs, method=None, matrix='DNA'):
    """Align the sequences in *seqs*.

    With exactly two sequences a pairwise alignment is performed on the
    degapped, upper-cased sequences (``method`` defaults to
    ``'global_cfe'``) and the tuple ``(aligned_0, aligned_1, score)`` is
    returned, with the original letter case restored on both sequences.

    With more than two sequences nothing is computed in this block and
    ``None`` is returned (the multiple-alignment branch is a placeholder).

    Raises:
        RuntimeError: if fewer than two sequences are given.
    """
    if len(seqs) == 2:
        # perform pairwise alignment
        from seqpy.core.pwaligner import calign

        s_0 = degapped(seqs[0])
        s_1 = degapped(seqs[1])
        if not method:
            method = 'global_cfe'
        a_0, a_1, score = calign.aligner(s_0.upper(), s_1.upper(),
                                         method=method, matrix=matrix)
        cout('pairwise aligned with score: %f' % score)
        return (preserve_case(s_0, a_0), preserve_case(s_1, a_1), score)

    elif len(seqs) > 2:
        # perform multiple sequence alignment (placeholder, no result yet)
        if method is None or method.startswith('muscle'):
            pass

    else:
        # was spelled 'RuntimerError', which raised NameError instead
        raise RuntimeError('Alignment must involve 2 or more sequences')
def main( args ):
    """Condense the sequences of ``args.infile`` and save the result.

    The condensed set is written to ``args.outfile``; when ``args.report``
    is set it is additionally passed to ``write_report``.
    """
    loaded = bioio.load( args.infile, options = args.io_opts )
    cout('reading %d sequences from %s' % (len(loaded), args.infile))

    condensed = funcs.condensed( loaded )
    bioio.save( condensed, args.outfile )

    if not args.report:
        return
    write_report(condensed, args.report)
def main(args):
    """Merge, annotate, filter, sort, summarise and save sequences.

    Steps, in order:

    1. load every file in ``args.files`` and merge them into one container;
    2. append the source / isolate / definition attributes;
    3. optionally set labels to accession numbers (``args.accno``);
    4. optionally degap (``args.degap``);
    5. drop sequences shorter than ``args.minlen``, longer than
       ``args.maxlen`` or whose fraction of ``N`` exceeds ``args.maxN``
       (each limit is only active when greater than zero);
    6. optionally sort by length (descending) or by label (``args.sort``);
    7. optionally print a per-sequence base count (``args.summary``);
    8. optionally save to ``args.outfile``.
    """
    container = None
    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(loaded), path))
        if container is None:
            container = loaded
        else:
            container += loaded

    append_attributes(container, args.src, args.src_isolate, args.definition)

    if args.accno:
        set_label_to_accno(container)

    if args.degap:
        container.degap()

    if args.minlen > 0 or args.maxlen > 0 or args.maxN > 0:
        kept = bioio.multisequence()
        for entry in container:
            size = len(entry)
            # the checks short-circuit left to right, so the N fraction is
            # only computed for sequences that passed the length limits
            if ((args.minlen > 0 and size < args.minlen)
                    or (args.maxlen > 0 and size > args.maxlen)
                    or (args.maxN > 0
                        and entry.seq.count(b'N') / size > args.maxN)):
                continue
            kept.append(entry)
        container = kept

    if args.sort:
        if args.sort.startswith('len'):
            container.sort(len, reverse=True)
        elif args.sort.startswith('lab'):
            container.sort(lambda item: item.label)

    if args.summary:
        for entry in container:
            upper = entry.seq.upper()
            print(">%s\nA:%d\tC:%d\tG:%d\tT:%d\t-:%d" %
                  (entry.label.decode('ASCII'), upper.count(b'A'),
                   upper.count(b'C'), upper.count(b'G'), upper.count(b'T'),
                   upper.count(b'-')))

    if args.outfile:
        bioio.save(container, args.outfile, options=args.io_opts or [])
def main( args ):
    """Translate every sequence of ``args.files`` and save the result.

    Each sequence is cloned and the clone's sequence is replaced by the
    translation starting at ``args.start_codon``.  All clones are collected
    and saved to ``args.outfile``.
    """
    translated_set = bioio.multisequence()

    for path in args.files:
        loaded = bioio.load( path, options = args.io_opts )
        cout('reading %d sequences from %s' % (len(loaded), path))

        for source in loaded:
            clone = source.clone()
            protein = funcs.translated( source, start_pos = args.start_codon )
            clone.set_sequence( protein )
            translated_set.append( clone )

    bioio.save( translated_set, args.outfile )
def statseq( args ):
    """Report the base composition of every sequence in ``args.infile``.

    For each sequence the upper-cased data is scanned for A, C, G, T, N and
    '-' and one line with these counts, their sum (``L``) and the sequence
    label is sent to ``cout``.
    """
    symbols = (b'A', b'C', b'G', b'T', b'N', b'-')
    template = 'A: %3d C: %3d G: %3d T: %3d N: %3d -: %3d L: %3d | \t%s'

    for entry in bioio.load( args.infile, options = args.io_opts or [] ):
        upper = entry.seq.upper()
        tallies = [upper.count(symbol) for symbol in symbols]
        # L is the sum of the six counted symbols, not the raw length
        cout(template % (tuple(tallies) + (sum(tallies), entry.label)))
def geno2genediv( args ):
    """Write per-region haplotype diversity (He) and FST to ``args.outfile``.

    For every region yielded by ``lineparser.parse_genes()`` the encoded
    haplotypes are used to compute:

    * ``He``  -- 1 minus the sum of squared allele frequencies over all
      samples;
    * the same quantity per group, plus the mean, median, max and min of
      those per-group values;
    * ``dHe`` -- ``He`` minus the group-size-weighted mean of the per-group
      values;
    * ``FST`` -- ``dHe / He``.

    One tab-separated row is written per region.  The output file is
    handled by a ``with`` block so it is flushed and closed even if a
    region fails part-way through.
    """
    lineparser = tabparser.GenotypeLineParser( args )
    lineparser.set_translator(lineparser.diploid_translator)

    # set group
    groups = lineparser.parse_grouping()

    cout('Grouping:')
    group_keys = sorted(groups.keys())
    for k in group_keys:
        cout(' %12s %3d' % (k, len(groups[k])))

    with open(args.outfile, 'wt') as outfile:
        outfile.write('CHROM\tPOS\tREGION\tN_SNP\tN_HAPLO\tFST\tdHe\tHe\tMEAN\tMEDIAN\tMAX\tMIN\t%s\n' % '\t'.join( group_keys ))

        for idx, region in enumerate(lineparser.parse_genes()):
            haplotypes = set( region.haplotypes())
            enc_haplos = region.encode_haplotypes()
            haploarray = allel.HaplotypeArray( [enc_haplos] )

            cerr( 'I: calculating %d - %s' % (idx, region.name))

            # total He over all samples
            He = 1 - np.sum( haploarray.count_alleles().to_frequencies()**2 )

            # He per group, accumulated with the group size as weight
            values = []
            pHe = 0
            for g in group_keys:
                he_p = 1 - np.sum(
                    haploarray.count_alleles(subpop=groups[g]).to_frequencies()**2 )
                pHe += he_p * len(groups[g])
                values.append(he_p)

            dHe = He - pHe / sum( len(x) for x in groups.values() )
            FST = dHe/He

            params = ( FST, dHe, He, np.mean(values), np.median(values),
                       np.max(values), np.min(values))
            outfile.write('%s\t%s\t%s\t%d\t%d\t%s\t%s\n' % (
                region.P[0][0], region.P[0][1], region.name,
                len(region.P), len(haplotypes),
                '\t'.join( '%5.4f' % x for x in params),
                '\t'.join( '%5.4f' % x for x in values)))
def _filter_HetThreshold(self, snp_info, data_items):
    """Check the proportion of heterozygous samples at one SNP position.

    Returns ``False`` (after logging ``snp_info[2]``) when the
    'HetThreshold' filter is active and the fraction of samples whose
    genotype is not one of the listed homozygous / single-allele calls is
    at or above the configured threshold.  Returns ``True`` otherwise.
    Any genotype outside that list is counted, so missing calls such as
    './.' count as well.
    """
    if 'HetThreshold' not in self.filters:
        return True

    non_het_calls = ('0/0', '1/1', '2/2', '3/3', '0', '1', '2', '3')
    het_count = sum(1 for _idx, item in data_items
                    if item[0] not in non_het_calls)

    if het_count / len(data_items) >= self.filters['HetThreshold']:
        cout('SNP ID: %s did not pass heterozygosity threshold.' % snp_info[2])
        return False
    return True
def geno2pwfst(args):
    """Compute pair-wise FST between populations and write it as a matrix.

    The whole genotype file is read, allele counts are taken per group and
    ``allel.stats.blockwise_hudson_fst`` (block length 10) is evaluated for
    every ordered pair of groups.  The resulting symmetric matrix is
    written to ``args.outfile`` below a header line with the sorted group
    names.
    """
    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.diploid_translator)
    lineparser.parse_grouping()

    cout('Grouping:')
    groups = lineparser.groups
    for name in groups:
        cout(' %12s %3d' % (name, len(groups[name])))

    # the matrix rows/columns follow this sorted order
    group_keys = sorted(groups.keys())
    n_groups = len(group_keys)

    # read the whole genotype, and drop the raw data once it is converted
    cerr('I: reading genotype file')
    raw = lineparser.parse_all()

    cerr('I: generating genotype array')
    genoarray = allel.GenotypeArray(raw)
    del raw

    cerr('I: counting alleles')
    counts = {g: genoarray.count_alleles(subpop=groups[g]) for g in group_keys}

    cerr('I: calculating FST')
    matrix = np.zeros((n_groups, n_groups))
    for row, col in itertools.permutations(range(n_groups), 2):
        fst, _se, _vb, _vj = allel.stats.blockwise_hudson_fst(
            counts[group_keys[row]], counts[group_keys[col]], blen=10)
        # both cells are set on every visit; the later ordered pair wins
        matrix[row, col] = matrix[col, row] = fst

    with open(args.outfile, 'wt') as outfile:
        # write header:
        outfile.write('%s\n' % ('\t'.join(group_keys)))
        np.savetxt(outfile, matrix, delimiter='\t')
def seq2pi(args):
    """Compute ``calc_pi`` per group of sequences and report the results.

    Sequences are loaded from ``args.infile``.  When ``args.groupfile`` or
    ``args.metafile`` is given, sequences are grouped by the group
    information looked up under their ASCII-decoded label; sequences
    without a group are reported on stderr and skipped.  Otherwise all
    sequences form the single group ``'ALL'``.

    Each group's average and standard deviation are sent to ``cout`` and,
    when ``args.outfile`` is set, written as a tab-separated table.  The
    output file is closed in a ``finally`` clause so it is released even
    when ``calc_pi`` fails.
    """
    # open and read sequence file
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # open and read group/meta file using groupfile/metafile if available
    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}
        for seq in seqs:
            try:
                grp = group_parser.group_info[seq.label.decode('ASCII')]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]'
                     % seq.label.decode('ASCII'))
                continue
            if grp in group_seqs:
                group_seqs[grp].append(seq)
            else:
                ms = multisequence()
                ms.append(seq)
                group_seqs[grp] = ms
    else:
        group_seqs = {'ALL': seqs}

    print('Groups:')
    outf = open(args.outfile, 'w') if args.outfile else None
    try:
        if outf:
            outf.write('GROUP\tN\tPI\tSTDDEV\n')
        for g in group_seqs:
            avg, stddev = calc_pi(group_seqs[g])
            cout(' %20s [%3d]: %f +- %f' % (g, len(group_seqs[g]), avg, stddev))
            if outf:
                outf.write('%s\t%d\t%5.4f\t%5.4f\n'
                           % (g, len(group_seqs[g]), avg, stddev))
    finally:
        if outf:
            outf.close()

    if outf:
        # the closing bracket was missing from this message
        cerr('[I - result written to %s]' % args.outfile)
def geno2genediv(args):
    """Write a per-region, per-group diversity table to ``args.outfile``.

    For every region yielded by ``lineparser.parse_genes()`` and every
    group, ``allel.stats.hudson_fst`` is evaluated between the group and
    all remaining samples, and the returned denominator is recorded.  Each
    row holds the region position and name, the number of SNPs and
    haplotypes, the mean / median / max / min of the recorded values and
    then the values themselves.

    The output file is handled by a ``with`` block so it is flushed and
    closed even if a region fails part-way through.
    """
    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.diploid_translator)

    # set group
    groups = lineparser.parse_grouping()

    cout('Grouping:')
    group_keys = sorted(groups.keys())
    for k in group_keys:
        cout(' %12s %3d' % (k, len(groups[k])))

    with open(args.outfile, 'wt') as outfile:
        outfile.write(
            'CHROM\tPOS\tREGION\tN_SNP\tN_HAPLO\tMEAN\tMEDIAN\tMAX\tMIN\t%s\n' %
            '\t'.join(group_keys))

        for idx, region in enumerate(lineparser.parse_genes()):
            haplotypes = set(region.haplotypes())
            enc_haplos = region.encode_haplotypes()
            # the encoding is expected to number the distinct haplotypes
            # consecutively from zero
            assert len(haplotypes) == max(enc_haplos) + 1
            haploarray = allel.HaplotypeArray([enc_haplos])

            cerr('I: calculating %d - %s' % (idx, region.name))

            value = []
            for g in group_keys:
                ac_g = haploarray.count_alleles(subpop=groups[g])
                # every sample that is not in group g
                ac_ng = haploarray.count_alleles(
                    subpop=list(lineparser.sample_idx - set(groups[g])))
                # NOTE(review): only the denominator is recorded and `num`
                # is unused -- confirm this is the intended statistic
                num, den = allel.stats.hudson_fst(ac_g, ac_ng)
                value.append(den)

            params = (np.mean(value), np.median(value), np.max(value),
                      np.min(value))
            outfile.write('%s\t%s\t%s\t%d\t%d\t%s\t%s\n' %
                          (region.P[0][0], region.P[0][1], region.name,
                           len(region.P), len(haplotypes),
                           '\t'.join('%5.4f' % x for x in params),
                           '\t'.join('%5.4f' % x for x in value)))
def _filter_MissingThreshold(self, snp_info, data_items):
    """Check the proportion of samples with a missing call at one SNP.

    Returns ``False`` (after logging ``snp_info[2]``) when the
    'MissingThreshold' filter is active and the fraction of samples whose
    genotype is './.' or '.' is at or above the configured threshold.
    Returns ``True`` otherwise.
    """
    if 'MissingThreshold' not in self.filters:
        return True

    missing_calls = ('./.', '.')
    missing_count = sum(1 for _idx, item in data_items
                        if item[0] in missing_calls)

    if missing_count / len(data_items) >= self.filters['MissingThreshold']:
        cout('SNP ID: %s did not pass missing threshold.' % snp_info[2])
        return False
    return True
def geno2fst( args ):
    """Write per-position FST of each group against all other samples.

    For every position yielded by ``lineparser.parse()`` the minor allele
    fraction (smallest allele count over the total count, or 0 when both
    are equal) and, per group, Hudson's FST between the group and the
    remaining samples are computed.  FST values outside ``[0, 1]`` (NaN
    included) are replaced by 0.  Each row holds the position, the max /
    mean / median FST, the MAF and the per-group FST values.

    The output file is handled by a ``with`` block so it is flushed and
    closed even if a position fails part-way through.
    """
    lineparser = tabparser.GenotypeLineParser( args )
    lineparser.set_translator(lineparser.diploid_translator)

    cout('Grouping:')
    groups = lineparser.parse_grouping()
    for k in groups:
        cout(' %12s %3d' % (k, len(groups[k])))

    # the per-group output columns follow this sorted order
    group_keys = sorted(groups.keys())
    cout(group_keys)

    # output to file
    cout('Writing outfile...')
    with open(args.outfile, 'w') as outfile:
        outfile.write('CHROM\tPOS\tREGION\tMAX\tMEAN\tMEDIAN\tMAF\t%s\n' % '\t'.join(group_keys) )

        for idx, (posinfo, genolist) in enumerate(lineparser.parse(), 1):
            genoarray = allel.GenotypeArray( [genolist] )

            # calculate MAF
            ac = genoarray.count_alleles()
            minor = np.min(ac)
            total = np.sum(ac)
            maf = 0 if minor == total else minor / total

            # calculate FST per group against other samples
            fst_sites = []
            for g in group_keys:
                ac_g = genoarray.count_alleles(subpop = groups[g])
                ac_ng = genoarray.count_alleles(
                    subpop = list( lineparser.sample_idx - set(groups[g])))
                num, den = allel.stats.hudson_fst(ac_g, ac_ng)
                fst = num[0]/den[0]
                # also catches NaN, for which both comparisons are False
                if not (0.0 <= fst <= 1.0):
                    fst = 0
                fst_sites.append( fst )

            if idx % 100 == 0:
                cerr('I: writing position no %d' % idx)

            outfile.write('%s\t%s\t%s\t%5.4f\t%5.4f\t%5.4f\t%5.4f\t%s\n' %
                          (posinfo[0], posinfo[1], posinfo[4],
                           np.max(fst_sites), np.mean(fst_sites),
                           np.median(fst_sites), maf,
                           '\t'.join( '%5.4f' % x for x in fst_sites)))
def calc_fst(mseqs):
    """Compute Hudson's FST for every pair of groups in *mseqs*.

    Returns ``(FST_mat, groups)`` where ``groups`` lists the keys of
    *mseqs* and, for each pair ``i < j``, ``FST_mat[i, j]`` holds the
    NaN-ignoring mean of the per-site ratio ``num / den`` and
    ``FST_mat[j, i]`` holds its NaN-ignoring standard deviation.  Each
    pair is also reported through ``cout``.
    """
    groups = list(mseqs.keys())
    n_groups = len(groups)
    FST_mat = np.zeros((n_groups, n_groups))
    allele_counts = count_allele(mseqs)

    for upper, lower in itertools.combinations(range(n_groups), 2):
        first, second = groups[upper], groups[lower]
        # sites with a zero denominator produce NaN/inf silently
        with np.errstate(divide='ignore', invalid='ignore'):
            num, den = allel.hudson_fst(allele_counts[first],
                                        allele_counts[second])
            ratio = num / den
            FST_mat[upper, lower] = np.nanmean(ratio)
            FST_mat[lower, upper] = np.nanstd(ratio)
        cout('%5.4f +- %5.4f : %s <> %s' %
             (FST_mat[upper, lower], FST_mat[lower, upper], first, second))

    return FST_mat, groups
def groupinfo(args):
    """Print how many samples of ``args.infile`` fall into each group.

    The sample names are obtained according to ``args.fmt``:

    * ``'pickle'`` / ``'npy'`` -- from a ``NAltLineParser``;
    * ``'list'`` -- every whitespace-separated token of the file;
    * anything else -- the whitespace-separated tokens of the first line.

    The samples are assigned to groups by a ``GroupParser`` and the size of
    each group (sorted by name) plus the total are sent to ``cout``.
    """
    fmt = args.fmt
    if fmt in ['pickle', 'npy']:
        from seqpy.core.bioio import naltparser
        from types import SimpleNamespace
        nalt_args = SimpleNamespace(infile=args.infile, fmt=args.fmt, n=-1)
        nalt_parser = naltparser.NAltLineParser(nalt_args,
                                                with_group=False,
                                                with_position=False)
        samples = nalt_parser.samples
    elif fmt == 'list':
        with gzopen(args.infile) as handle:
            samples = handle.read().split()
    else:
        # only the first line is read and split into sample names
        with gzopen(args.infile) as handle:
            first_line = handle.readline()
            samples = first_line.strip().split()

    groups = grpparser.GroupParser(args).assign_groups(samples)

    cout('Groups:')
    total = 0
    for name in sorted(groups.keys()):
        size = len(groups[name])
        cout(' %3d - %s' % (size, name))
        total += size

    cout('Total: %d samples' % total)
def main():
    """Start the GUI, optionally loading the file named on the command line.

    ``cout`` is redirected to ``writelog`` first.  When ``sys.argv[1]``
    exists, loading it is scheduled 100 ms after start-up; otherwise the
    main window just receives focus.  The call blocks in the Qt event loop.
    """
    app = QtWidgets.QApplication(sys.argv)

    # patching seqpy.cout
    set_cout(writelog)
    cout('console log ready..')

    infile = sys.argv[1] if len(sys.argv) > 1 else None

    window = IMainWindow()
    window.show()

    if infile:
        # delay the load to allow all windows to be drawn first
        QtCore.QTimer.singleShot(100, lambda: window.load(infile))
    else:
        window.setFocus()

    app.exec_()
def main(args):
    """Merge sequence files, append attributes, summarise and save.

    Every file in ``args.files`` is loaded and merged into one container,
    to which the source / isolate / definition attributes are appended.
    With ``args.summary`` a per-sequence count of A, C, G, T and '-' is
    printed; with ``args.outfile`` the container is saved there.
    """
    container = None
    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(loaded), path))
        if container is None:
            container = loaded
        else:
            container += loaded

    append_attributes(container, args.src, args.src_isolate, args.definition)

    if args.summary:
        for entry in container:
            upper = entry.seq.upper()
            print(">%s\nA:%d\tC:%d\tG:%d\tT:%d\t-:%d" %
                  (entry.label.decode('ASCII'), upper.count(b'A'),
                   upper.count(b'C'), upper.count(b'G'), upper.count(b'T'),
                   upper.count(b'-')))

    if args.outfile:
        bioio.save(container, args.outfile, options=args.io_opts or [])
def geno2pairfst( args ):
    """Print the ten positions with the highest FST between two group sets.

    ``args.grp1`` and ``args.grp2`` are comma-separated group names; the
    samples of the named groups are merged into two sub-populations.  For
    every position Hudson's FST between them is computed (values outside
    ``[0, 1]``, NaN included, become 0) and the ten largest are sent to
    ``cout`` together with their position information.
    """
    lineparser = tabparser.GenotypeLineParser( args )
    lineparser.set_translator(lineparser.diploid_translator)

    cout('Grouping:')
    groups = lineparser.parse_grouping()
    for name in groups:
        cout(' %12s %3d' % (name, len(groups[name])))

    group_keys = sorted(groups.keys())
    cout(group_keys)

    def merged_members(spec):
        # flatten the sample lists of all comma-separated group names
        return list(itertools.chain.from_iterable(
            groups[name] for name in spec.split(',') ))

    # gathering groups
    grp1 = merged_members(args.grp1)
    grp2 = merged_members(args.grp2)

    ranked = []
    for (posinfo, genolist) in lineparser.parse():
        genoarray = allel.GenotypeArray( [genolist] )

        ac_g1 = genoarray.count_alleles(subpop = grp1)
        ac_g2 = genoarray.count_alleles(subpop = grp2)
        num, den = allel.stats.hudson_fst(ac_g1, ac_g2)

        fst = num[0]/den[0]
        if not (0.0 <= fst <= 1.0):
            fst = 0
        ranked.append( (fst, posinfo) )

    # highest FST first; ties fall back to comparing posinfo
    ranked.sort(reverse=True)

    for fst, posinfo in ranked[:10]:
        cout('%s\t%s\t%s\t%5.4f' % (posinfo[0], posinfo[1], posinfo[4], fst))
def file_open(self, filename=None):
    """Load *filename* and show it, asking the user when none is given.

    Without a filename a file dialog is shown; cancelling it returns
    silently.  A non-existent path or an object that ``bioio.load`` returns
    as falsy triggers an alert.  A progress indicator is shown while the
    file is being loaded.
    """
    if not filename:
        filename, _selected_filter = QtWidgets.QFileDialog.getOpenFileName(
            self.pane(), "Open project, alignment or trace file" )
        if not filename:
            # the dialog was cancelled
            return
    cout("Loading file %s" % filename)

    if not os.path.exists( filename ):
        alert('File %s does not exists. Please check your filename!' % filename)
        return

    indicator = progress('Opening ' + filename)
    indicator.repaint()

    loaded = bioio.load( filename )

    indicator.hide()
    del indicator

    if not loaded:
        alert('Error reading file ' + filename +'\nUnknown file format!')
        return
    self.view(loaded)
def geno2dhe(args):
    """Write per-position He, dHe and FST for haploid genotypes.

    The whole genotype file is read.  For every position ``He`` is 1 minus
    the sum of squared allele frequencies; the same is computed per group.
    ``dHe`` is ``He`` minus the group-size-weighted mean of the per-group
    values and ``FST`` is ``dHe / He``.  One tab-separated row per
    position is written to ``args.outfile``, ending with the per-group
    values in sorted group order.
    """
    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.haploid_translator)
    lineparser.parse_grouping()

    cout('Grouping:')
    groups = lineparser.groups
    for name in groups:
        cout(' %12s %3d' % (name, len(groups[name])))

    group_keys = sorted(groups.keys())
    cout(group_keys)

    # read the whole genotype, and drop the raw data once it is converted
    cerr('I: reading genotype file')
    raw = lineparser.parse_all()

    cerr('I: generating genotype array')
    genoarray = allel.GenotypeArray(raw)
    del raw

    def expected_het(allele_counts):
        # 1 - sum of squared allele frequencies, per position
        return 1 - np.sum(allele_counts.to_frequencies()**2, axis=1)

    cerr('I: calculating He')
    He = expected_het(genoarray.count_alleles())

    He_groups = {}
    pHe = None
    for g in groups:
        He_groups[g] = expected_het(genoarray.count_alleles(subpop=groups[g]))
        weighted = He_groups[g] * len(groups[g])
        pHe = weighted if pHe is None else pHe + weighted

    dHe = He - pHe / sum(len(members) for members in groups.values())
    FST = dHe / He

    cerr('I: writing output file')
    with open(args.outfile, 'wt') as outfile:
        outfile.write('CHROM\tPOS\tREGION\tFST\tdHe\tHe\t%s\n' %
                      '\t'.join(group_keys))

        for i in range(len(He)):
            posinfo = lineparser.position[i]
            per_group = '\t'.join('%5.4f' % He_groups[g][i] for g in group_keys)
            outfile.write('%s\t%s\t%s\t%5.4f\t%5.4f\t%5.4f\t%s\n' %
                          (posinfo[0], posinfo[1], posinfo[4], FST[i], dHe[i],
                           He[i], per_group))
def vcf2seq(args):
    """Convert a VCF file into a multisequence and save it.

    ``args.opts`` is appended to the built-in filter specification
    ``NoIndel,LowQual,MissingThreshold=0.05,HetThreshold=0.25,`` before the
    helper parses ``args.vcffile`` for ``args.chr``.  The per-chromosome
    counters of the helper are reported through ``cout`` and the resulting
    multisequence is saved to ``args.outfile``.
    """
    filter_spec = ('NoIndel,LowQual,MissingThreshold=0.05,HetThreshold=0.25,'
                   + args.opts)
    helper = VCF2SeqHelper(args.vcffile, args.chr, filter_spec)
    helper.parse()
    sequences = helper.get_multisequence()

    cout('Report:')
    for chrom, used in helper.chr_used.items():
        cout(' %s\t%d' % (chrom, used))

    cout('Writing to %s' % args.outfile)
    bioio.save(sequences, args.outfile)
def geno2dxy(args):
    """Write the pairwise dxy matrix of haploid genotypes.

    The whole genotype file is read and converted to per-call allele
    counts, which are passed to ``allel.pairwise_dxy`` with the running
    index as position.  The condensed result is expanded to a square
    matrix and written to ``args.outfile`` in binary mode below the sample
    header line.
    """
    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.haploid_translator)
    lineparser.parse_grouping()

    cout('Grouping:')
    groups = lineparser.groups
    for name in groups:
        cout(' %12s %3d' % (name, len(groups[name])))

    group_keys = sorted(groups.keys())
    cout(group_keys)

    # read the whole genotype, and drop the raw data once it is converted
    cerr('I: reading genotype file')
    raw = lineparser.parse_all()

    cerr('I: generating genotype array')
    genoarray = allel.GenotypeArray(raw)
    del raw

    cerr('I: generating allele count array')
    gac = genoarray.to_allele_counts()

    cerr('I: calculating pairwise dxy')
    condensed = allel.pairwise_dxy(range(len(gac)), gac)
    square = scipy.spatial.distance.squareform(condensed)

    cerr('I: writing to outfile')
    with open(args.outfile, 'wb') as outfile:
        header = lineparser.get_sample_header(True)
        outfile.write(header)
        outfile.write(b'\n')

        # write the matrix
        np.savetxt(outfile, square, delimiter='\t')
def geno2hierfst(args):
    """Select high-FST positions for each partition of a hierarchy file.

    Every non-empty, non-comment line of ``args.hierfile`` holds two
    tab-separated fields, each a comma-separated list of group names; the
    samples of the named groups form the two sides of one partition.

    For each partition Hudson's FST is computed per position (values
    outside ``[0, 1]``, NaN included, become 0).  Positions are taken in
    descending FST order until a value drops below ``args.minfst`` or the
    running sum exceeds ``args.cumfst``.  All selected positions are sent
    to ``cout``.
    """
    genoparser = tabparser.GenotypeLineParser(args)
    genoparser.set_translator(genoparser.diploid_translator)

    cerr('Grouping:')
    groups = genoparser.parse_grouping()
    for name in groups:
        cout(' %12s %3d' % (name, len(groups[name])))

    def merged_members(spec):
        # flatten the sample lists of all comma-separated group names
        return list(itertools.chain.from_iterable(
            groups[name] for name in spec.split(',')))

    hierarchy = []
    with open(args.hierfile) as hierfile:
        for raw_line in hierfile:
            stripped = raw_line.strip()
            if not stripped or stripped.startswith('#'):
                continue

            partitions = stripped.split('\t')
            print(partitions)
            side1 = merged_members(partitions[0])
            side2 = merged_members(partitions[1])
            hierarchy.append((side1, side2))

    cerr('[I: preparing %d hierarchy]' % len(hierarchy))

    cerr('[I: reading genotype file...]')
    genotypes = genoparser.parse_all()
    genoarray = allel.GenotypeArray(genotypes)
    del genotypes

    selected_positions = []
    for number, (grp1, grp2) in enumerate(hierarchy, 1):
        cerr('[I: processing hierarchy #%d]' % number)

        ac_g1 = genoarray.count_alleles(subpop=grp1)
        ac_g2 = genoarray.count_alleles(subpop=grp2)
        num, den = allel.stats.hudson_fst(ac_g1, ac_g2)
        fst = num / den

        ranked = []
        for position, score in zip(genoparser.position, fst):
            if not (0.0 <= score <= 1.0):
                score = 0
            ranked.append((score, position))
        ranked.sort(reverse=True)

        cumulative_fst = 0.0
        for (score, position) in ranked:
            if score < args.minfst or cumulative_fst > args.cumfst:
                break
            selected_positions.append((position, score))
            cumulative_fst += score

    for (position, score) in selected_positions:
        cout('%s\t%s\t%s\t%5.4f' % (position[0], position[1], position[4],
                                    score))
def parse_samples(self, snp_info, data_items):
    """Filter one SNP and append its base call to every sample sequence.

    The parent implementation is consulted first; a falsy result stops
    processing.  Then, depending on which names are present in
    ``self.filters``:

    * 'MissingThreshold' -- reject when the fraction of './.' genotypes is
      at least 0.05 (fixed here, the stored value is not read);
    * 'HetThreshold' -- reject when the fraction of genotypes other than
      '0/0', '1/1', '2/2' is at least 0.33 (fixed here as well);
    * 'MAF' -- reject when the fraction of '0/0' genotypes, folded to at
      most 0.5, is below ``self.filters['MAF']``.

    For an accepted SNP each sample receives ``ref[0]`` for '0/0',
    ``alt[0]`` for '1/1', ``alt[1]`` for '2/2' and ``ord('N')`` otherwise,
    and the per-chromosome counter is incremented.  Always returns None.
    """
    if not super().parse_samples(snp_info, data_items):
        return

    (chrom, _pos, posid, ref, alt, _qual, _filters, _info,
     _format) = snp_info

    def tally(accept):
        # number of samples whose genotype field satisfies `accept`
        return sum(1 for _idx, item in data_items if accept(item[0]))

    if 'MissingThreshold' in self.filters:
        # count missing haplotype
        missing = tally(lambda gt: gt == './.')
        if missing / len(data_items) >= 0.05:
            cout('SNP ID: %s did not pass missing threshold.' % posid)
            return

    if 'HetThreshold' in self.filters:
        # count heterozygosity
        hets = tally(lambda gt: gt not in ['0/0', '1/1', '2/2'])
        if hets / len(data_items) >= 0.33:
            cout('SNP ID: %s did not pass heterozygosity threshold.' % posid)
            return

    if 'MAF' in self.filters:
        # count MAF from the homozygous-reference fraction
        maf = tally(lambda gt: gt == '0/0') / len(data_items)
        if maf > 0.5:
            maf = 1 - maf
        if maf < self.filters['MAF']:
            print('SNP ID: %s did not pass MAF threshold.' % posid)
            return

    for (idx, item) in data_items:
        call = item[0]
        if call == '0/0':
            base = ref[0]
        elif call == '1/1':
            base = alt[0]
        elif call == '2/2':
            base = alt[1]
        else:
            base = ord('N')
        self.mseq[idx].append(base)

    # reporting purposes
    self.chr_used[chrom] += 1
def consolidate_predictions(args):
    """Compare pickled model predictions with the assigned groups.

    Predictions are unpickled from ``args.infile`` as a mapping
    ``model -> k -> data``.  For every model and ``k`` a data frame is
    built, the highest-scoring column per row is taken as the predicted
    group, and every row whose confidence is below ``args.threshold`` or
    whose prediction differs from its assigned group is reported through
    ``cout``.

    When ``args.outreport`` is set, a score and a row-normalised confusion
    matrix are collected per ``model|k`` and pickled to that file.  The
    report file is closed in a ``finally`` clause so the handle is released
    even when a model fails part-way through.
    """
    outreport = None

    if args.samplefile:
        samples = read_samplefile(args.samplefile, args.fmt)
    else:
        # NOTE(review): `samples[i]` below fails when no sample file was
        # given and a row has to be reported -- confirm this is acceptable
        samples = None

    group_parser = grpparser.GroupParser(args)
    group_parser.assign_groups(samples)
    # group_parser.group_keys contains [ 'grp1', 'grp2', etc]
    group_keys = group_parser.group_keys

    # SECURITY: unpickling executes arbitrary code embedded in the file;
    # only use this on prediction files from a trusted source
    with open(args.infile, 'rb') as f:
        predictions = pickle.load(f)

    if args.outreport:
        outreport = open(args.outreport, 'wb')

    try:
        from sklearn.metrics import confusion_matrix

        reports = {}
        normalize = True

        for model in predictions:
            model_pred = predictions[model]

            for k in model_pred:
                cerr('Preparing for model: {} k: {}'.format(model, k))

                df = generate_dataframe(model_pred[k])

                # column with the highest value per row
                group_indexes = np.argmax(df.values, axis=1)
                group_predictions = df.columns[group_indexes[:, None]]

                for i, best in enumerate(group_indexes):
                    predicted_group = df.columns[best]
                    prediction_confidence = df.values[i, best]
                    if (prediction_confidence < args.threshold
                            or predicted_group != group_keys[i]):
                        cout('{}: {} -> {} ({})'.format(
                            samples[i], group_keys[i], predicted_group,
                            prediction_confidence))

                if outreport:
                    score = lkmodels.calculate_scores(group_keys,
                                                      group_predictions)
                    confmat = confusion_matrix(group_keys, group_predictions)

                    if normalize:
                        # divide each row by its total
                        confmat = confmat.astype('float') / confmat.sum(
                            axis=1)[:, np.newaxis]
                        cerr("[I - Normalized confusion matrix]")
                    else:
                        cerr('[I - Confusion matrix, without normalization]')

                    reports['{}|{}'.format(model, k)] = {
                        'score': score,
                        'confmat': confmat
                    }

        if outreport:
            pickle.dump(reports, outreport)
            cerr('[I - writing pickled report to {}]'.format(args.outreport))
    finally:
        if outreport:
            outreport.close()
def main(args):
    """Condense the sequences of ``args.infile`` into ``args.outfile``."""
    source = bioio.load(args.infile, options=args.io_opts)
    cout('reading %d sequences from %s' % (len(source), args.infile))

    condensed = funcs.condensed(source)
    bioio.save(condensed, args.outfile)