def get_sample_header(self, bytestring=False):
    """Return the sample names joined by tab characters.

    bytestring: when true, the joined header is returned UTF-8 encoded as
    bytes; otherwise it is returned as str.

    cexit() is called when self.samples is empty/unset.
    """
    if not self.samples:
        cexit('E: need to parse sample header first')
    joined = '\t'.join(self.samples)
    return joined.encode('UTF-8') if bytestring else joined
def pos2bed_microhaps(args, positions):
    """Write one BED line per microhaplotype to args.outfile.

    Consecutive entries of `positions` that share both the sequence name
    (column 0) and the microhaplotype name (column args.namecol) are merged
    into a single line `seq<TAB>start<TAB>end<TAB>name`, where start is the
    first position of the run minus one and end is the last position of the
    run.

    args.namecol: column index of the microhaplotype name; cexit() is called
        when it is negative.
    positions: iterable of indexable records; column 1 must be convertible
        to int.
    """
    if args.namecol < 0:
        cexit('ERR: microhaps mode needs --namecol option!')

    bed_line = '%s\t%d\t%d\t%s\n'

    with open(args.outfile, 'w') as fout:
        mh_name = ''
        mh_seq = ''
        mh_1pos = -1
        mh_2pos = -1

        for entry in positions:
            seq = entry[0]
            pos = int(entry[1])
            name = entry[args.namecol]

            # same microhaplotype on the same sequence: just extend the end
            if name == mh_name and mh_seq == seq:
                mh_2pos = pos
                continue

            # a new microhaplotype starts here; flush the previous one
            if mh_name:
                fout.write(bed_line % (mh_seq, mh_1pos, mh_2pos, mh_name))

            mh_name = name
            mh_seq = seq
            mh_1pos = pos - 1
            # the end must restart with the run as well, otherwise a run made
            # of a single position keeps the end of the previous run (or -1)
            mh_2pos = pos

        # flush the last run; nothing is written when there was no input
        if mh_name:
            fout.write(bed_line % (mh_seq, mh_1pos, mh_2pos, mh_name))

    cerr('[I - writing microhap-based BED to %s]' % args.outfile)
def dist2clonalqc(args):
    """Pick clonal samples from a distance matrix and optionally save them.

    The tab-separated file args.infile supplies the distance values and, via
    its column labels, the sample names.  A per-sample count of -1 entries in
    the matrix parsed from args.datafile is turned into a score
    (max count - count) that is handed to clonal_index() together with
    args.threshold.  When args.outfile is set, the selected sample names are
    written there, one per line.
    """
    # read distance matrix
    dist_df = pd.read_csv(args.infile, sep='\t')
    samples = dist_df.columns
    D = dist_df.values

    # read quality file or pickled ralt/nalt file
    if not args.datafile:
        cexit('ERR: other input file has not been defined')
    else:
        parser_args = SimpleNamespace(infile=args.datafile, fmt=args.fmt, n=-1)
        parser = naltparser.NAltLineParser(
            parser_args, with_group=False, with_position=False)
        region = parser.parse_whole()
        # number of -1 entries counted along axis 0 of region.M
        qual = np.count_nonzero(region.M == -1, axis=0)

    score = qual.max() - qual
    clonal_samples = clonal_index(D, score, samples, args.threshold)
    cerr('[I - removing %d clonal samples]' % len(clonal_samples))

    if args.outfile:
        np.savetxt(args.outfile, samples[clonal_samples], fmt='%s')
def prune_2(genotypes, positions, threshold=0.5, score=None):
    """Prune by r^2 except on CDS; only CDS within the same segment/region
    will be pruned.

    genotypes: object providing len() and count_alleles(); it is also passed
        to calculate_r_2().
    positions: sequence parallel to genotypes; column 4 of each entry is the
        value compared to decide whether two sites belong together.
    threshold: r^2 value above which the second site of a pair is dropped.
    score: optional per-site score; defaults to the minimum allele count.

    Returns an int8 array with 1 for retained and 0 for pruned sites.
    cexit() is called when the three inputs differ in length.
    """
    # `score == None` is elementwise on arrays and cannot be used as a
    # condition, so the identity test is required here
    if score is None:
        # we use MAC as default score
        score = np.min(genotypes.count_alleles(), axis=1)

    N = len(genotypes)
    if N != len(score) or N != len(positions):
        cexit('E: length of genotypes != length of score nor positions!')

    index = arrange_index(score)
    compress_index = np.ones(len(index), dtype=np.int8)

    # calculate r^2
    r_2 = calculate_r_2(genotypes)

    # walk through index
    # NOTE(review): i is a position in genotype order while j is taken from
    # the score-arranged index -- confirm this traversal is the intended one.
    cerr('I: scanning r^2 matrix')
    for i in range(N):
        if not compress_index[i]:
            continue
        # the original looped over an undefined name (scoring_index); this
        # iterates the arranged index the same way prune_1() does
        for j in index[i+1:]:
            if r_2[i, j] > threshold:
                # check if this is a CDS region
                if positions[j][4] and positions[j][4] != positions[i][4]:
                    continue
                compress_index[j] = 0

    return compress_index
def prune_1(genotypes, threshold=0.5, score=None):
    """Prune by r^2 with score as priority, returning indexing array.

    genotypes: object providing len(); passed to count_allele() and
        calculate_r_2().
    threshold: r^2 value above which the second site of a pair is dropped.
    score: optional per-site score; defaults to the minimum allele count.

    Returns the indices of the retained sites (np.nonzero of the keep mask).
    cexit() is called when genotypes and score differ in length.
    """
    N = len(genotypes)

    # `score == None` is elementwise on arrays and cannot be used as a
    # condition, so the identity test is required here
    if score is None:
        # we use MAC as default score
        score = np.min(count_allele(genotypes), axis=1)

    if N != len(score):
        cexit('E: length of genotypes != length of score! ({} vs {})'.format(
            N, len(score)))

    index = arrange_index(score)
    compress_index = np.ones(len(index), dtype=np.int8)

    # calculate r^2
    r_2 = calculate_r_2(genotypes)

    # walk through index
    # NOTE(review): i is a position in genotype order while j is taken from
    # the score-arranged index -- confirm this traversal is the intended one.
    cerr('[I - pruning for {} SNPs]'.format(N))
    for i in range(N):
        if not compress_index[i]:
            continue
        for j in index[i+1:]:
            if r_2[i, j] > threshold:
                compress_index[j] = 0

    return np.nonzero(compress_index)[0]
def __init__(self, model_id, k, guide_tree=None, min_fst=0.9, ultimate_fst=1.0,
             priority=None, max_leaf_snp=0, snpindex=None, iteration=1,
             seed=None):
    """Initialise the selector.

    model_id, k, snpindex, iteration, seed: forwarded unchanged to the
        parent initialiser
    guide_tree: required; cexit() is called when it is None
    min_fst: the minimum FST before using select_2() (alternate SNP
        selection)
    ultimate_fst: the minimum FST to be prioritized for selection
    priority: index of SNPs to be prioritized for selection (after filtering
        by ultimate_fst)
    max_leaf_snp: stored as given
    """
    super().__init__(model_id, k, snpindex, iteration, seed)

    if guide_tree is None:
        cexit('[E - HierarchicalFSTSelector requires guide tree]')

    self.guide_tree = guide_tree
    self.min_fst = min_fst
    self.priority = priority
    self.max_leaf_snp = max_leaf_snp
    self.ultimate_fst = ultimate_fst
def check_sanity(M, site_idx, sample_idx):
    """Verify that the shape of M agrees with the site and sample indexes.

    M.shape[0] must equal len(site_idx) and M.shape[1] must equal
    len(sample_idx).  On a mismatch cexit() is called; for the sample
    mismatch the offending sizes are printed first.
    """
    dims = M.shape
    n_samples = len(sample_idx)

    # sanity checking
    if n_samples != dims[1]:
        print(n_samples, dims)
        cexit('[E - inconsistent M shape and no of samples!]')

    if len(site_idx) != dims[0]:
        cexit('[E - inconsistent M shape and no of sites!]')
def seq2fst(args):
    """Compute FST between groups of sequences and write it as text.

    Sequences are loaded from args.infile and binned by the group that
    GroupParser assigns to their (ASCII-decoded) label; sequences without a
    group are reported and skipped.  With args.sitefile set, site-wise FST is
    written there and the function returns; otherwise the FST matrix from
    calc_fst() is written to args.outfile below a header of group names.

    cexit() is called when neither args.groupfile nor args.metafile is given.
    """
    # open and read sequence file
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # open and read group/meta file using groupfile/metafile if available
    if not (args.groupfile or args.metafile):
        cexit('[ERR - seq2fst.py requires group information!]')
    else:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}
        for seq in seqs:
            label = seq.label.decode('ASCII')
            try:
                grp = group_parser.group_info[label]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]' % label)
                continue

            if grp not in group_seqs:
                group_seqs[grp] = multisequence()
            group_seqs[grp].append(seq)

    for grp, members in group_seqs.items():
        cerr('[I - group %s has %d sample(s)]' % (grp, len(members)))

    if args.sitefile:
        # perform FST site-wise
        FST_sites = calc_site_fst(group_seqs, args.nantozero)

        with open(args.sitefile, 'w') as fout:
            for label, mat in FST_sites:
                fout.write(label)
                fout.write('\t')
                # values of one record stay on one line: tab is used as the
                # "newline" and the real line break is written afterwards
                np.savetxt(fout, mat, fmt='%5.4f', delimiter='\t',
                           newline='\t')
                fout.write('\n')

        cerr('[I - site FST written to %s]' % (args.sitefile))
        return

    FST_mat, groups = calc_fst(group_seqs)

    with open(args.outfile, 'w') as fout:
        fout.write('\t'.join(groups))
        fout.write('\n')
        np.savetxt(fout, FST_mat, fmt='%5.4f', delimiter='\t')
def execute(args):
    """Dispatch to a command module under seqpy.cmds.

    args[0] names the module to import; the remaining items are parsed with
    the parser returned by that module's init_argparser() and the result is
    passed to the module's main().  usage() is called when args is empty, and
    seqpy.cexit() when init_argparser() returns a falsy value.
    """
    if not args:
        usage()

    command = args[0]
    module = importlib.import_module('seqpy.cmds.' + command)
    print(module)

    parser = module.init_argparser()
    if not parser:
        seqpy.cexit('Fatal ERR: init_argparser() does not return properly')

    parsed_args = parser.parse_args(args[1:])
    module.main(parsed_args)
def __init__(self, args):
    """Record the position file name and line limit taken from args.

    args.posfile is required (cexit() is called when it is None); args.n is
    stored as self.n.  All other attributes start out as None.
    """
    # positions
    if args.posfile is None:
        cexit('ERROR: required --posfile')

    self.posfilename = args.posfile

    # not populated by the constructor
    self.posfile = self.posfile_header = self.positions = None
    self.header = None

    self.n = args.n

    self.df = self.M = None
def assign_groups(self, samples):
    """Map every sample to its group and give each group a colour.

    samples: iterable of sample codes; each must be a key of self.group_info
        (self.parse() is invoked first when group_info is empty).

    Sets self.samples, self.sample_idx (set of sample positions),
    self.groups (group key -> list of sample positions) and self.group_keys
    (group key per sample, in input order), fills self.group_colours either
    from self.colourfile or from colour_list, and returns self.groups.
    """
    if not self.group_info:
        self.parse()

    members = {}
    keys_in_order = []
    for position, sample_code in enumerate(samples):
        key = self.group_info[sample_code]
        members.setdefault(key, []).append(position)
        keys_in_order.append(key)

    self.samples = samples
    self.sample_idx = set(range(len(keys_in_order)))
    self.groups = members
    self.group_keys = keys_in_order

    if not self.colourfile:
        # no colour file: hand out colours from the list in sorted key
        # order, wrapping around when the list is exhausted
        palette = cycle(colour_list)
        for key in sorted(self.groups.keys()):
            self.group_colours[key] = next(palette)

        if len(self.groups.keys()) > len(colour_list):
            cerr(
                "W: warning, no of groups (%d) exceeds available colour list!"
                % len(self.groups.keys()))
    else:
        # parse colour file: rewind, skip its first line, then read
        # tab-separated (group, colour) pairs
        self.colourfile.seek(0)
        next(self.colourfile)
        for line in self.colourfile:
            fields = line.strip().split('\t')
            self.group_colours[fields[0]] = fields[1]

        # checking whether all groups has been assigned with colours
        for key in self.groups:
            if key not in self.group_colours:
                cexit('E: group %s is not assigned' % key)

        cerr('[I: assigning manual colours to %d groups]'
             % (len(self.group_colours)))

    return self.groups
def parse_np_haplotypes(self, maxline=-1):
    """Return the haplotypes as an int8 numpy array, one row per sample.

    Only the first character of every genotype token is used; it is mapped
    '0' -> 0, '1' -> 1, '2' -> 2 and '-' -> -1, e.g.

        [ [0, 0, 0, 0, 2, 2, 0, 2, 0],
          [0, 0, 0, 2, 0, 0, -1, 0, -1] ]

    With self.include_positions set, only lines whose first two position
    columns are listed there are used and maxline is not consulted.
    Otherwise positions are parsed first and at most maxline lines (all
    positions when maxline <= 0) are read.
    """
    token2value = {'0': 0, '1': 1, '2': 2, '-': -1}
    S = len(self.samples)

    if not self.include_positions:
        # we need to parse positions first
        positions = self.parse_position()
        L = maxline if maxline > 0 else len(positions)
        M = np.zeros((S, L), np.int8)

        for column, genoline in enumerate(self.infile):
            if column >= L:
                break
            tokens = genoline.strip().split('\t')
            if len(tokens) != S:
                raise RuntimeError('E: inconsistent number of samples!')
            M[:, column] = [token2value[tokens[i][0]] for i in range(S)]

        return M

    M = np.zeros((S, len(self.include_positions)), np.int8)
    column = 0
    for posline, genoline in zip(self.posfile, self.infile):
        posinfo = posline.strip('\n').split('\t')
        if (posinfo[0], posinfo[1]) not in self.include_positions:
            continue
        tokens = genoline.strip().split('\t')
        if len(tokens) != S:
            cexit('E: inconsistent number of samples!')
        M[:, column] = [token2value[tokens[i][0]] for i in range(S)]
        column += 1

    return M
def __init__(self, model_id, k, guide_tree=None, min_fst=0.9, priority=None,
             max_leaf_snp=0, snpindex=None, iteration=1, seed=None):
    """Initialise the selector.

    model_id, k, snpindex, iteration and seed are forwarded unchanged to the
    parent initialiser.  guide_tree is required: cexit() is called when it is
    None.  guide_tree, min_fst, priority and max_leaf_snp are stored on the
    instance as given.
    """
    super().__init__(model_id, k, snpindex, iteration, seed)

    if guide_tree is None:
        cexit('[E - HierarchicalFSTSelector requires guide tree]')

    self.guide_tree, self.min_fst = guide_tree, min_fst
    self.priority, self.max_leaf_snp = priority, max_leaf_snp
def parse(self):
    """Generator yielding (pos_info, geno_array) for each data line.

    The position file and the genotype file are read in lockstep.  pos_info
    is the tab-split position line; geno_array is whatever self.translate()
    returns for the tab-split genotype line.  The position header is parsed
    first when self.posfile is not set yet.  cexit() is called when a
    genotype line does not have one token per sample.
    """
    if not self.posfile:
        self.parse_position_header()

    paired_lines = zip(self.posfile, self.infile)
    for line_no, (pos_line, geno_line) in enumerate(paired_lines):
        genotype_tokens = geno_line.strip().split('\t')
        position_fields = pos_line.strip('\n').split('\t')

        if len(genotype_tokens) != len(self.samples):
            cexit(
                'E: genotype file does not match sample number at line %d'
                % line_no)

        yield (position_fields, self.translate(genotype_tokens))
def lkest(args, config=None):
    """Run the function registered for args.mode and report the elapsed time.

    config: optional dictionary; a truthy 'mode' entry replaces args.mode.

    cexit() is called when the mode is not a key of the `mode` table.
    """
    # check for config as dictionary
    if config:
        # general
        args.mode = config.get('mode') or args.mode

    try:
        func = mode[args.mode]
    except KeyError:
        cexit('[E - mode %s does not exist!]' % args.mode)

    # run mode
    started = time.monotonic()
    func(args)
    elapsed_minutes = (time.monotonic() - started) / 60
    cerr('[I - finished in %6.2f minute(s) at %s]'
         % (elapsed_minutes, datetime.datetime.now()))
def get_model(args):
    """Build the SNP selector requested by args.method.

    'rand'    -> RandomSelector()
    'dt'      -> DecisionTreeSelector()
    'hfst'    -> HierarchicalFSTSelector(guide_tree=...)
    'hfst+dt' -> HHFSTDTSelector(guide_tree=...)
    'list'    -> FixSNPSelector(snpfile=args.snpfile)

    For the two guide-tree methods the tree is parsed from the file named by
    args.guidetree.  cexit() is called for any other method.
    """
    method = args.method

    if method == 'rand':
        return RandomSelector()

    if method == 'dt':
        return DecisionTreeSelector()

    if method in ('hfst', 'hfst+dt'):
        # close the guide-tree file once it has been parsed instead of
        # leaving the handle to the garbage collector
        with open(args.guidetree) as tree_file:
            guide_tree = parse_guide_tree(tree_file)
        if method == 'hfst':
            return HierarchicalFSTSelector(guide_tree=guide_tree)
        return HHFSTDTSelector(guide_tree=guide_tree)

    if method == 'list':
        return FixSNPSelector(snpfile=args.snpfile)

    cexit('ERR: please provide method')
def parse_all(self, maxline=-1):
    """Read the genotype data into one list and return it.

    Every line is tab-split and passed through self.translate(); the results
    are collected in input order, so make sure there is enough memory before
    calling this.  A positive maxline limits the number of lines translated.
    After reading, self.parse_position(maxline) is called.  cexit() is called
    when a line does not have one token per sample.
    """
    rows = []
    limited = maxline > 0

    for line_no, line in enumerate(self.infile):
        if limited and line_no >= maxline:
            break

        tokens = line.strip().split('\t')
        if len(tokens) != len(self.samples):
            cexit(
                'E: genotype file does not match sample number at line %d'
                % line_no)

        rows.append(self.translate(tokens))

    self.parse_position(maxline)
    return rows
def geno2prune(args):
    """Prune the sites of a genotype file and write the retained positions.

    The genotype file is parsed region by region (args.region selects
    'chrom', 'whole' or 'gene'), each region is pruned with the scheme given
    by args.scheme (1, 2, 21 or 3) at args.threshold, and the positions that
    survive are written tab-separated to args.outfile below the position
    header.  cexit() is called for an unknown scheme; an unknown region
    raises KeyError.
    """
    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.haploid_translator)

    cerr('I: start parsing genotype file...')

    # the context manager guarantees the output is flushed and closed, also
    # when pruning fails or cexit() terminates the run
    with open(args.outfile, 'w') as outfile:
        outfile.write(lineparser.get_position_header())
        outfile.write('\n')

        iter_func = {
            'chrom': lineparser.parse_chromosomes,
            'whole': lineparser.parse_whole,
            'gene': lineparser.parse_genes,
        }[args.region]

        for region in iter_func():
            cerr('I: generating genotype array for %s' % region.name)
            genoarray = allel.GenotypeArray(region.genotypes())

            cerr('I: pruning for %s' % region.name)
            if args.scheme == 1:
                index = pruner.prune_1(genoarray, args.threshold)
            elif args.scheme == 2:
                index = pruner.prune_2(genoarray, region.positions(),
                                       args.threshold)
            elif args.scheme == 21:
                index = pruner.prune_21(genoarray, region.positions(),
                                        args.threshold)
            elif args.scheme == 3:
                index = pruner.prune_3(genoarray, region.positions(),
                                       args.threshold)
            else:
                cexit('E: scheme type undefined!')

            # NOTE(review): itertools.compress() expects one truthy/falsy
            # selector per position -- confirm every prune_* scheme returns
            # such a mask rather than an array of indices.
            new_positions = itertools.compress(region.positions(), index)
            for pos in new_positions:
                outfile.write('%s\n' % '\t'.join(pos))
def train(args):
    """Fit a likelihood profile for args.code and store it in args.profile.

    The data are parsed with NAltLineParser, restricted to the positions
    listed in args.includepos (its first line is skipped), and fitted with
    SNPLikelihoodEstimator against the group keys.  The resulting profile is
    added to the pickled dictionary in args.profile, which is created when
    the file does not exist.  cexit() is called when a profile with the same
    code already exists and args.replace is not set.
    """
    # load profiles if exists
    nalt_parser = naltparser.NAltLineParser(args, datatype='nalt')
    nalt_parser.parse_grouping()
    group_keys = nalt_parser.group_parser.group_keys

    region = nalt_parser.parse_whole()
    # return value is not needed here; the call itself is kept
    nalt_parser.parse_samples()

    # read the position list through a context manager so the handle is
    # closed; the first line is dropped
    with open(args.includepos) as posfile:
        poslines = [line.split() for line in posfile][1:]

    region.filter_poslines(poslines, inplace=True, sort_position=False)
    haplotypes = region.haplotypes()

    cerr('[I - fitting for {}]'.format(args.code))
    classifier = lkest.SNPLikelihoodEstimator()
    classifier.fit(haplotypes, group_keys)

    profile = classifier.get_profile()
    profile.positions = region.P
    profile.code = args.code
    profile.remark = args.remark

    # NOTE(review): pickle.load() executes arbitrary code from the file;
    # only use profile files from a trusted source.
    try:
        with open(args.profile, 'rb') as f:
            profiles = pickle.load(f)
    except FileNotFoundError:
        profiles = {}

    if args.code in profiles and not args.replace:
        cexit('ERR: cannot replace ')

    profiles[args.code] = profile.to_dict()
    with open(args.profile, 'wb') as f:
        pickle.dump(profiles, f)

    cerr('[I - profiles saved to {}]'.format(args.profile))
def parse(self):
    """Build and return self.group_info, a mapping of sample -> group.

    With self.groupfile set, the file is read as YAML/JSON holding a mapping
    of group -> list of samples.  Otherwise self.metafile is read as a
    delimited table and self.column ("sample,group") selects the two columns,
    each given either by name or by 1-based number.  cexit() is called when
    neither file is available.
    """
    # create a dictionary of groups <> sample_idx
    if self.groupfile:
        # this is a YAML/JSON file, open with YAML
        import yaml

        # safe_load: yaml.load() without an explicit Loader can construct
        # arbitrary objects and is rejected by PyYAML >= 6
        grouping = yaml.safe_load(self.groupfile)

        groups = {}
        for g in grouping:
            for s in grouping[g]:
                groups[s] = g
        self.group_info = groups

    elif self.metafile:
        # this is a tab/comma delimited file
        metadf = pandas.read_csv(self.metafile, sep=self.delimiter)

        sample_column, group_column = self.column.split(',')
        # all-digit specifiers are 1-based column numbers
        if sample_column.isdigit():
            sample_column = metadf.columns[int(sample_column) - 1]
        if group_column.isdigit():
            group_column = metadf.columns[int(group_column) - 1]

        cerr('[I: reading metafile for column: %s %s]'
             % (sample_column, group_column))

        sampledf = metadf.loc[:, [sample_column, group_column]]

        # plain (sample, group) tuples in row order; avoids integer keys on
        # a label-indexed row, which pandas no longer treats as positions
        groups = {}
        for sample, group in sampledf.itertuples(index=False, name=None):
            groups[sample] = group
        self.group_info = groups

    else:
        cexit('E: need groupfile or metafile')

    return self.group_info
def parse_position_header(self):
    """Open the position file and keep its first line as the header.

    The file named by self.posfilename is opened with gzopen() and stored as
    self.posfile; its first line, stripped of surrounding whitespace, becomes
    self.posfile_header.  cexit() is called when no file name is set.
    """
    if not self.posfilename:
        cexit('E: need --posfile')

    handle = gzopen(self.posfilename)
    self.posfile = handle

    first_line = next(handle)
    self.posfile_header = first_line.strip()
# tabparser

from seqpy import cout, cerr, cexit, gzopen
from seqpy.cmds import arg_parser
from seqpy.core.bioio import grpparser

import numpy as np
import pandas
import attr

# requires scikit-allel
try:
    import allel
except Exception:
    # any failure while importing counts as "not properly installed", but
    # unlike a bare except this lets KeyboardInterrupt/SystemExit through
    cexit('ERR: require properly installed scikit-allel!')


def init_argparser(p=None):
    """Return an argument parser for the genotype file parser.

    p: an existing parser to extend; a new one is created when None.

    The options of the group parser are added first, followed by --posfile,
    --includepos and the positional infile.
    """
    if p is None:
        p = arg_parser('Genotype file parser')
    p = grpparser.init_argparser(p)

    p.add_argument('--posfile', default=None)
    p.add_argument('--includepos', default='')
    p.add_argument('infile')

    return p
def __init__(self, seed=None, guide_tree=None, min_fst = 0.9):
    """Initialise the selector.

    seed is forwarded to the parent initialiser.  guide_tree is required:
    cexit() is called when it is None.  guide_tree and min_fst are stored on
    the instance as given.
    """
    super().__init__(seed)

    if guide_tree is None:
        cexit('[E - HierarchicalFSTSelector requires guide tree]')

    self.guide_tree, self.min_fst = guide_tree, min_fst
from seqpy import cout, cerr, cexit from seqpy.cmds import arg_parser from itertools import cycle import numpy as np try: import pandas except: cexit('ERR: rquire proper pandas instalation [pip3 install pandas]') def init_argparser(p=None): if p is None: p = arg_parser('Group file parser') p.add_argument('--groupfile', default='') p.add_argument('--metafile', default='') p.add_argument('--colourfile', default='') p.add_argument('--column', default='1,2') return p # TODO: need to provide mechanism to select colour scheme # 12 colours from ColorBrewer2 colour_list = [ '#1f78b4', '#33a02c', '#e31a1c', '#ff7f00', '#6a3d9a', '#b15928', '#a6cee3', '#b2df8a', '#fb9a99', '#fdbf6f', '#cab2d6', '#ffff99'