def wrapper(args):
    """Command-line wrapper: validate arguments, load the input dataset,
    fit a model with main(), and write the resulting model dataframe."""
    # Up-front validation of the learn_model-specific arguments.
    qc.validate_input_arguments_for_learn_model(
        foreground=args.foreground,
        background=args.background,
        alpha=args.penalty,
        modeltype=args.modeltype,
        learningmethod=args.learningmethod,
        start=args.start,
        end=args.end,
        iteration=args.iteration,
        burnin=args.burnin,
        thin=args.thin,
        pseudocounts=args.pseudocounts,
    )
    # Read from the named input file when given, otherwise from stdin.
    if args.i:
        source = io.validate_file_for_reading(args.i)
    else:
        source = sys.stdin
    dataset = io.load_dataset(source)
    # Write to the named output file when given, otherwise to stdout.
    if args.out:
        sink = io.validate_file_for_writing(args.out)
    else:
        sink = sys.stdout
    fitted_df = main(
        dataset,
        lm=args.learningmethod,
        modeltype=args.modeltype,
        db=args.db_filename,
        LS_means_std=args.LS_means_std,
        iteration=args.iteration,
        burnin=args.burnin,
        thin=args.thin,
        start=args.start,
        end=args.end,
        runnum=args.runnum,
        initialize=args.initialize,
        foreground=args.foreground,
        background=args.background,
        alpha=args.penalty,
        pseudocounts=args.pseudocounts,
        verbose=args.verbose,
    )
    io.write(fitted_df, sink)
def wrapper(args):
    """Command-line wrapper: simulate sorting of a library via main() and
    write the dataset augmented with ct_0/ct_1/ct count columns.

    Raises SortSeqError if either total count is non-positive or if the
    input library already carries sorted-bin count columns.
    """
    T_LibCounts = args.totallibcounts
    T_mRNACounts = args.totalmRNAcounts
    if T_LibCounts <= 0 or T_mRNACounts <= 0:
        raise SortSeqError('Counts must be greater than zero')
    model_df = io.load_model(args.model)
    # Read the library from the named file or stdin (deduplicates the old
    # copy-pasted if/else read_csv branches).
    source = args.i if args.i else sys.stdin
    df = pd.io.parsers.read_csv(source, delim_whitespace=True)
    # Make sure the library is not already sorted.
    if len(utils.get_column_headers(df)) > 0:
        raise SortSeqError('Library already sorted!')
    libcounts, expcounts = main(df, model_df, T_LibCounts, T_mRNACounts,
                                start=args.start, end=args.end)
    # Add the simulated counts to the input dataframe.
    df['ct_0'] = pd.Series(libcounts, name='ct_0')
    df['ct_1'] = pd.Series(expcounts, name='ct_1')
    df['ct'] = df[['ct_0', 'ct_1']].sum(axis=1)
    # Keep pandas from truncating long sequence strings on output.
    pd.set_option('max_colwidth', int(1e8))
    # Validate dataframe for writing.
    df = qc.validate_dataset(df, fix=True)
    if args.out:
        # Fix: the output handle was previously opened and never closed.
        outloc = open(args.out, 'w')
        try:
            io.write(df, outloc)
        finally:
            outloc.close()
    else:
        io.write(df, sys.stdout)
def wrapper(args):
    """Command-line wrapper: load a batch-labelled dataset, apply the noise
    model via main(), and write the simulated dataset."""
    # Parse the "[a,b,...]"-style noise-parameter string into a list.
    # Previously a bare `except:` swallowed every error here; only the
    # absence of the string (None -> AttributeError) should mean "no params".
    try:
        npar = args.noiseparam.strip("[").strip("]").split(",")
    except AttributeError:
        npar = []
    nbins = args.nbins
    # Read the dataset from the named file or stdin.
    if args.i:
        df = pd.io.parsers.read_csv(
            args.i, delim_whitespace=True, dtype={"seqs": str, "batch": int})
    else:
        df = pd.io.parsers.read_csv(
            sys.stdin, delim_whitespace=True, dtype={"seqs": str, "batch": int})
    # Refuse input that already carries sorted-bin count columns.
    if len(utils.get_column_headers(df)) > 0:
        raise SortSeqError("Library already sorted!")
    model_df = io.load_model(args.model)
    output_df = main(df, model_df, args.noisemodel, npar, nbins,
                     start=args.start, end=args.end)
    # Keep pandas from truncating long sequence strings on output.
    pd.set_option("max_colwidth", int(1e8))
    # Validate dataframe for writing.
    output_df = qc.validate_dataset(output_df, fix=True)
    if args.out:
        # Fix: the output handle was previously opened and never closed.
        outloc = open(args.out, "w")
        try:
            io.write(output_df, outloc)
        finally:
            outloc.close()
    else:
        io.write(output_df, sys.stdout)
def wrapper(args): """ Commandline wrapper for main() """ output_df = main(wtseq=args.wtseq, mutrate=args.mutrate,\ numseq=args.numseqs,dicttype=args.type,tags=args.tags,\ tag_length=args.tag_length) outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout io.write(output_df,outloc)
def wrapper(args): """ Commandline wrapper for main() """ inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout input_df = io.load_dataset(inloc) output_df = main(input_df, bin=args.bin, start=args.start, end=args.end, err=args.err) io.write(output_df, outloc)
def wrapper(args):
    """Command-line wrapper: evaluate a model against a dataset via main()
    and write the result."""
    if args.i:
        source = io.validate_file_for_reading(args.i)
    else:
        source = sys.stdin
    dataset = io.load_dataset(source)
    model = io.load_model(args.model)
    result = main(dataset_df=dataset,
                  model_df=model,
                  left=args.left,
                  right=args.right)
    # As in the original, the output destination is resolved only after a
    # successful computation.
    sink = io.validate_file_for_writing(args.out) if args.out else sys.stdout
    io.write(result, sink, fast=args.fast)
def test_profile_mut_bin_noerr(self):
    """ Test the ability of mpathic.profile_mut to compute mutation rates.

    For every library_*/dataset_* fixture: a valid bin number must succeed
    on *_good dataset files, must raise SortSeqError on *_bad or library
    files, and an out-of-range bin number must always raise.
    """
    print '\nIn test_profile_mut_bin_noerr_...'
    library_files = glob.glob(self.input_dir+'library_*.txt')
    library_files += glob.glob(self.input_dir+'dataset_*.txt')
    good_bin_num = 2   # a bin expected to exist in the fixtures
    bad_bin_num = 5    # a bin expected to be out of range
    for file_name in library_files:
        print '\t%s ='%file_name,
        # e.g. ".../dataset_good.txt" -> "good"
        description = file_name.split('_')[-1].split('.')[0]
        executable = lambda:\
            profile_mut.main(io.load_dataset(file_name),bin=good_bin_num,err=False)
        print '(bin=%d)'%good_bin_num,
        # If bad or library, then profile_mut.main should raise SortSeqError
        if ('_bad' in file_name) or ('library' in file_name):
            try:
                self.assertRaises(SortSeqError,executable)
                print 'badtype,',
            except:
                print 'good (ERROR).'
                raise
        # If good, then profile_mut.main should produce a valid df
        elif '_good' in file_name:
            try:
                df = executable()
                qc.validate_profile_mut(df)
                out_file = self.output_dir+\
                    'profile_mut_bin_noerr_%s.txt'%description
                io.write(df,out_file)           # Test writing
                io.load_profile_mut(out_file)   # Test loading
                print 'good,',
            except:
                print 'bad (ERROR).'
                raise
        # There are no other options
        else:
            raise SortSeqError('Unrecognized class of file_name.')
        # Should always raise an error if bin num is too large
        executable = lambda:\
            profile_mut.main(io.load_dataset(file_name),bin=bad_bin_num)
        print '(bin=%d)'%bad_bin_num,
        try:
            self.assertRaises(SortSeqError,executable)
            print 'badtype.'
        except:
            print 'good (ERROR).'
            raise
def test_profile_freq_bincounts(self):
    """ Test the ability of mpathic.profile_freq to count frequencies.

    For every library_*/dataset_* fixture: a valid bin number must succeed
    on good/dataset files, must raise SortSeqError on *_bad or library
    files, and an out-of-range bin number must always raise.
    """
    print '\nIn test_profile_freq_bincounts...'
    library_files = glob.glob(self.input_dir + 'library_*.txt')
    library_files += glob.glob(self.input_dir + 'dataset_*.txt')
    good_bin_num = 2   # a bin expected to exist in the fixtures
    bad_bin_num = 5    # a bin expected to be out of range
    for file_name in library_files:
        print '\t%s =' % file_name,
        # e.g. ".../dataset_good.txt" -> "good"
        description = file_name.split('_')[-1].split('.')[0]
        executable = lambda:\
            profile_freq.main(io.load_dataset(file_name),bin=good_bin_num)
        print '(bin=%d)' % good_bin_num,
        # If bad or library, then profile_freq.main should raise SortSeqError
        if ('_bad' in file_name) or ('library' in file_name):
            try:
                self.assertRaises(SortSeqError, executable)
                print 'badtype,',
            except:
                print 'good (ERROR).'
                raise
        # If good, then profile_freq.main should produce a valid df
        elif ('_good' in file_name) or ('dataset' in file_name):
            try:
                df = executable()
                qc.validate_profile_freq(df)
                out_file = self.output_dir+\
                    'profile_freq_bin_%s.txt'%description
                io.write(df, out_file)           # Test writing
                io.load_profile_freq(out_file)   # Test loading
                print 'good,',
            except:
                print 'bad (ERROR).'
                raise
        # There are no other options
        else:
            raise SortSeqError('Unrecognized class of file_name.')
        # Should always raise an error if bin num is too large
        executable = lambda:\
            profile_freq.main(io.load_dataset(file_name),bin=bad_bin_num)
        print '(bin=%d)' % bad_bin_num,
        try:
            self.assertRaises(SortSeqError, executable)
            print 'badtype.'
        except:
            print 'good (ERROR).'
            raise
def test_preprocess(self): """ Test the ability of mpathic.preprocess to collate data in multiple sequence files """ print '\nIn test_preprocess...' file_names = glob.glob(self.input_dir + 'files_*.txt') # Make sure there are files to test self.assertTrue(len(file_names) > 0) for file_name in file_names: print '\t%s =' % file_name, description = file_name.split('_')[-1].split('.')[0] # If fasta or fastq, assume dna if ('fasta' in file_name) or ('fastq' in file_name): seq_type = 'dna' else: seq_type = None executable = lambda: preprocess.main(io.load_filelist(file_name), indir=self.input_dir, seq_type=seq_type) # If _good_, then preprocess.main should produce a valid df if ('_good' in file_name) or ('_fix' in file_name): try: df = executable() qc.validate_dataset(df) out_file = self.output_dir + 'dataset_%s.txt' % description io.write(df, out_file) # Test write io.load_dataset(out_file) # Test loading print 'good.' except: print 'bad (ERROR).' raise # If _bad, then preprocess.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError, executable) print 'badtype.' except: print 'good (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')
def test_preprocess(self): """ Test the ability of mpathic.preprocess to collate data in multiple sequence files """ print '\nIn test_preprocess...' file_names = glob.glob(self.input_dir+'files_*.txt') # Make sure there are files to test self.assertTrue(len(file_names)>0) for file_name in file_names: print '\t%s ='%file_name, description = file_name.split('_')[-1].split('.')[0] # If fasta or fastq, assume dna if ('fasta' in file_name) or ('fastq' in file_name): seq_type = 'dna' else: seq_type = None executable = lambda: preprocess.main(io.load_filelist(file_name),indir=self.input_dir, seq_type=seq_type) # If _good_, then preprocess.main should produce a valid df if ('_good' in file_name) or ('_fix' in file_name): try: df = executable() qc.validate_dataset(df) out_file = self.output_dir+'dataset_%s.txt'%description io.write(df,out_file) # Test write io.load_dataset(out_file) # Test loading print 'good.' except: print 'bad (ERROR).' raise # If _bad, then preprocess.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError,executable) print 'badtype.' except: print 'good (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')
def wrapper(args): """ Commandline wrapper for main() """ inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout if args.bins_df_name: bins_df = pd.io.parsers.read_csv(args.bins_df_name, delim_whitespace=True) else: bins_df = None input_df = io.load_dataset(inloc) output_df = main(input_df, bin=args.bin, start=args.start, end=args.end, bins_df=bins_df, pseudocounts=args.pseudocounts, return_profile=args.return_profile) io.write(output_df, outloc)
def wrapper(args): """ Wrapper for functions io.load_* and io.write """ # Determine input and output inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout try: # Get load function corresponding to file type func = filetype_to_loadfunc_dict[str(args.type)] # Run load function on input df = func(inloc) # Write df to stdout or to outfile io.write(df,outloc,fast=args.fast) except SortSeqError: raise
def wrapper(args): """ Commandline wrapper for main() """ inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout # Get filelist filelist_df = io.load_filelist(inloc) inloc.close() # Get tagkeys dataframe if provided if args.tagkeys: tagloc = io.validate_file_for_reading(args.tagkeys) tags_df = io.load_tagkey(tagloc) tagloc.close() else: tags_df = None output_df = main(filelist_df,tags_df=tags_df,seq_type=args.seqtype) io.write(output_df,outloc,fast=args.fast)
def generic_test(self,test_name,function_str,file_names,allbad=False): """ Standardizes tests for different dataframe loading functions. The argument function_str must have "%s" where file_name goes. Example: generic_test('test_io_load_tagkey','io.load_tagkey("%s"),file_names)' """ print '\nIn %s...'%test_name # Make sure there are files to test self.assertTrue(len(file_names)>0) # For each file, run test for file_name in file_names: executable = lambda: eval(function_str%file_name) print '\t%s ='%file_name, if not allbad and any([c in file_name for c in \ ['_good','_fix','_badio','_badtype']]): try: df = executable() self.assertTrue(df.shape[0]>=1) # Write df base_filename = file_name.split('/')[-1] io.write(df,self.output_dir+'loaded_'+base_filename) print 'good.' except: print 'bad (ERROR).' raise elif allbad or ('_bad' in file_name): try: self.assertRaises(SortSeqError,executable) print 'bad.' except: print 'good (ERROR).' raise else: print 'what should I expect? (ERROR)' raise print '\tDone.'
def generic_test(self, test_name, function_str, file_names, allbad=False): """ Standardizes tests for different dataframe loading functions. The argument function_str must have "%s" where file_name goes. Example: generic_test('test_io_load_tagkey','io.load_tagkey("%s"),file_names)' """ print '\nIn %s...' % test_name # Make sure there are files to test self.assertTrue(len(file_names) > 0) # For each file, run test for file_name in file_names: executable = lambda: eval(function_str % file_name) print '\t%s =' % file_name, if not allbad and any([c in file_name for c in \ ['_good','_fix','_badio','_badtype']]): try: df = executable() self.assertTrue(df.shape[0] >= 1) # Write df base_filename = file_name.split('/')[-1] io.write(df, self.output_dir + 'loaded_' + base_filename) print 'good.' except: print 'bad (ERROR).' raise elif allbad or ('_bad' in file_name): try: self.assertRaises(SortSeqError, executable) print 'bad.' except: print 'good (ERROR).' raise else: print 'what should I expect? (ERROR)' raise print '\tDone.'
def test_profile_info(self): """ Test the ability of mpathic.profile_info to compute mutation rates based on total count values """ print '\nIn test_profile_info...' file_names = glob.glob(self.input_dir+'dataset_*.txt') for err in [True,False]: for file_name in file_names: print '\t%s, err=%s ='%(file_name,str(err)), description = file_name.split('_')[-1].split('.')[0] executable = lambda: \ profile_info.main(io.load_dataset(file_name),err=err) # If good, then profile_info.main should produce a valid df if '_good' in file_name: try: df = executable() qc.validate_profile_info(df) out_file = self.output_dir+\ 'profile_info_%s_err_%s.txt'%(description,str(err)) io.write(df,out_file) io.load_profile_info(out_file) print 'good.' except: print 'bad (ERROR).' raise # If bad, then profile_info.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError,executable) print 'badtype.' except: print 'good (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')
def test_profile_info(self): """ Test the ability of mpathic.profile_info to compute mutation rates based on total count values """ print '\nIn test_profile_info...' file_names = glob.glob(self.input_dir + 'dataset_*.txt') for err in [True, False]: for file_name in file_names: print '\t%s, err=%s =' % (file_name, str(err)), description = file_name.split('_')[-1].split('.')[0] executable = lambda: \ profile_info.main(io.load_dataset(file_name),err=err) # If good, then profile_info.main should produce a valid df if '_good' in file_name: try: df = executable() qc.validate_profile_info(df) out_file = self.output_dir+\ 'profile_info_%s_err_%s.txt'%(description,str(err)) io.write(df, out_file) io.load_profile_info(out_file) print 'good.' except: print 'bad (ERROR).' raise # If bad, then profile_info.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError, executable) print 'badtype.' except: print 'good (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')
def test_profile_freq_totalcounts(self):
    """ Test the ability of mpathic.profile_freq to compute frequencies
    based on total count values.

    *_good fixtures must produce a profile that validates and round-trips;
    *_bad fixtures must raise SortSeqError.
    """
    print '\nIn test_profile_freq_totalcounts...'
    library_files = glob.glob(self.input_dir+'library_*.txt')
    library_files += glob.glob(self.input_dir+'dataset_*.txt')
    for file_name in library_files:
        print '\t%s ='%file_name,
        # e.g. ".../library_good.txt" -> "good"
        description = file_name.split('_')[-1].split('.')[0]
        executable = lambda: profile_freq.main(io.load_dataset(file_name))
        # If good, then profile_freq.main should produce a valid df
        if '_good' in file_name:
            try:
                df = executable()
                qc.validate_profile_freq(df)
                out_file = self.output_dir+\
                    'profile_freq_total_%s.txt'%description
                io.write(df,out_file)           # Test writing
                io.load_profile_freq(out_file)  # Test loading
                print 'good.'
            except:
                print 'bad (ERROR).'
                raise
        # If bad, then profile_freq.main should raise SortSeqError
        elif '_bad' in file_name:
            try:
                self.assertRaises(SortSeqError,executable)
                print 'badtype.'
            except:
                print 'good (ERROR).'
                raise
        # There are no other options
        else:
            raise SortSeqError('Unrecognized class of file_name.')
def test_profile_freq_totalcounts(self): """ Test the ability of mpathic.profile_freq to compute frequencies based on total count values """ print '\nIn test_profile_freq_totalcounts...' library_files = glob.glob(self.input_dir + 'library_*.txt') library_files += glob.glob(self.input_dir + 'dataset_*.txt') for file_name in library_files: print '\t%s =' % file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda: profile_freq.main(io.load_dataset(file_name)) # If good, then profile_freq.main should produce a valid df if '_good' in file_name: try: df = executable() qc.validate_profile_freq(df) out_file = self.output_dir+\ 'profile_freq_total_%s.txt'%description io.write(df, out_file) io.load_profile_freq(out_file) print 'good.' except: print 'bad (ERROR).' raise # If bad, then profile_freq.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError, executable) print 'badtype.' except: print 'good (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')
def test_profile_freq_seqslicing(self):
    """ Test the ability of mpathic.profile_freq to slice sequences
    properly, and to raise the correct errors.

    For each fixture, builds a family of slicing calls: three that are
    valid for ordinary sequences, one that lands past a short (promoter)
    sequence, and three that are always invalid (negative start, end past
    the sequence, start > end).
    """
    print '\nIn test_profile_freq_seqslicing...'
    library_files = glob.glob(self.input_dir + 'library_*.txt')
    library_files += glob.glob(self.input_dir + 'dataset_*.txt')
    for file_name in library_files:
        print '\t%s =' % file_name,
        # e.g. ".../library_good.txt" -> "good"
        description = file_name.split('_')[-1].split('.')[0]
        # Slices expected to succeed on valid input
        executable_good1 =\
            lambda: profile_freq.main(io.load_dataset(file_name),\
            start=2,end=10)
        executable_good2 =\
            lambda: profile_freq.main(io.load_dataset(file_name),\
            start=2)
        executable_good3 =\
            lambda: profile_freq.main(io.load_dataset(file_name),\
            end=2)
        # Slice past the end of short (promoter) sequences only
        executable_nopro =\
            lambda: profile_freq.main(io.load_dataset(file_name),\
            start=50,end=60)
        # Slices that must always fail
        executable_bad1 =\
            lambda: profile_freq.main(io.load_dataset(file_name),\
            start=-1)
        executable_bad2 =\
            lambda: profile_freq.main(io.load_dataset(file_name),\
            end=100)
        executable_bad3 =\
            lambda: profile_freq.main(io.load_dataset(file_name),\
            start=20,end=10)
        # If good, then sequences will be valid.
        # NOTE(review): this branch tests 'good' without the leading
        # underscore used elsewhere in this suite — presumably intentional,
        # but worth confirming against the fixture names.
        if 'good' in file_name:
            try:
                df = executable_good1()
                io.write(df,self.output_dir+\
                    'profile_freq_splice2-10_%s.txt'%description)
                executable_good2()
                executable_good3()
                self.assertRaises(SortSeqError, executable_bad1)
                self.assertRaises(SortSeqError, executable_bad2)
                self.assertRaises(SortSeqError, executable_bad3)
                # Promoter fixtures are too short for the 50-60 slice
                if '_pro' in file_name:
                    self.assertRaises(SortSeqError, executable_nopro)
                else:
                    df = executable_nopro()
                print 'ok.'
            except:
                print 'ok (ERROR).'
                raise
        # If bad, then profile_freq.main should raise SortSeqError
        elif '_bad' in file_name:
            try:
                self.assertRaises(SortSeqError, executable_good1)
                self.assertRaises(SortSeqError, executable_good2)
                self.assertRaises(SortSeqError, executable_good3)
                self.assertRaises(SortSeqError, executable_nopro)
                self.assertRaises(SortSeqError, executable_bad1)
                self.assertRaises(SortSeqError, executable_bad2)
                self.assertRaises(SortSeqError, executable_bad3)
                print 'ok.'
            except:
                print 'not ok (ERROR).'
                raise
        # There are no other options
        else:
            raise SortSeqError('Unrecognized class of file_name.')
def test_profile_mut_seqslicing(self):
    """ Test the ability of mpathic.profile_mut to slice sequences
    properly, and to raise the correct errors.

    For each fixture, builds a family of slicing calls: three that are
    valid for ordinary sequences, one that lands past a short (promoter)
    sequence, and three that are always invalid (negative start, end past
    the sequence, start > end).
    """
    print '\nIn test_profile_mut_seqslicing...'
    library_files = glob.glob(self.input_dir+'library_*.txt')
    library_files += glob.glob(self.input_dir+'dataset_*.txt')
    for file_name in library_files:
        print '\t%s ='%file_name,
        # e.g. ".../library_good.txt" -> "good"
        description = file_name.split('_')[-1].split('.')[0]
        # Slices expected to succeed on valid input
        executable_good1 =\
            lambda: profile_mut.main(io.load_dataset(file_name),\
            start=2,end=10)
        # NOTE(review): the freq version of this test uses start=2 here;
        # this one uses end=10 — possibly copy drift, confirm intent.
        executable_good2 =\
            lambda: profile_mut.main(io.load_dataset(file_name),\
            end=10)
        executable_good3 =\
            lambda: profile_mut.main(io.load_dataset(file_name),\
            end=2)
        # Slice past the end of short (promoter) sequences only
        executable_nopro =\
            lambda: profile_mut.main(io.load_dataset(file_name),\
            start=50,end=60)
        # Slices that must always fail
        executable_bad1 =\
            lambda: profile_mut.main(io.load_dataset(file_name),\
            start=-1)
        executable_bad2 =\
            lambda: profile_mut.main(io.load_dataset(file_name),\
            end=100)
        executable_bad3 =\
            lambda: profile_mut.main(io.load_dataset(file_name),\
            start=20,end=10)
        # If good, then sequences will be valid
        if '_good' in file_name:
            try:
                df = executable_good1()
                io.write(df,self.output_dir+\
                    'profile_mut_splice2-10_%s.txt'%description)
                executable_good2()
                executable_good3()
                self.assertRaises(SortSeqError,executable_bad1)
                self.assertRaises(SortSeqError,executable_bad2)
                self.assertRaises(SortSeqError,executable_bad3)
                # Promoter fixtures are too short for the 50-60 slice
                if '_pro' in file_name:
                    self.assertRaises(SortSeqError,executable_nopro)
                else:
                    df = executable_nopro()
                print 'ok.'
            except:
                print 'ok (ERROR).'
                raise
        # If bad, then profile_mut.main should raise SortSeqError
        elif '_bad' in file_name:
            try:
                self.assertRaises(SortSeqError,executable_good1)
                self.assertRaises(SortSeqError,executable_good2)
                self.assertRaises(SortSeqError,executable_good3)
                self.assertRaises(SortSeqError,executable_nopro)
                self.assertRaises(SortSeqError,executable_bad1)
                self.assertRaises(SortSeqError,executable_bad2)
                self.assertRaises(SortSeqError,executable_bad3)
                print 'ok.'
            except:
                print 'not ok (ERROR).'
                raise
        # There are no other options
        else:
            raise SortSeqError('Unrecognized class of file_name.')
def wrapper(args): """ Wrapper for function for scan_model.main() """ # Prepare input to main model_df = io.load_model(args.model) seqtype, modeltype = qc.get_model_type(model_df) L = model_df.shape[0] if modeltype == "NBR": L += 1 chunksize = args.chunksize if not chunksize > 0: raise SortSeqError("chunksize=%d must be positive" % chunksize) if args.numsites <= 0: raise SortSeqError("numsites=%d must be positive." % args.numsites) if args.i and args.seq: raise SortSeqError("Cannot use flags -i and -s simultaneously.") # If sequence is provided manually if args.seq: pos_offset = 0 contig_str = args.seq # Add a bit on end if circular if args.circular: contig_str += contig_str[: L - 1] contig_list = [(contig_str, "manual", pos_offset)] # Otherwise, read sequence from FASTA file else: contig_list = [] inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin for i, record in enumerate(SeqIO.parse(inloc, "fasta")): name = record.name if record.name else "contig_%d" % i # Split contig up into chunk)size bits full_contig_str = str(record.seq) # Add a bit on end if circular if args.circular: full_contig_str += full_contig_str[: L - 1] # Define chunks containing chunksize sites start = 0 end = start + chunksize + L - 1 while end < len(full_contig_str): contig_str = full_contig_str[start:end] contig_list.append((contig_str, name, start)) start += chunksize end = start + chunksize + L - 1 contig_str = full_contig_str[start:] contig_list.append((contig_str, name, start)) if len(contig_list) == 0: raise SortSeqError("No input sequences to read.") # Compute results outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout output_df = main(model_df, contig_list, numsites=args.numsites, verbose=args.verbose) # Write df to stdout or to outfile io.write(output_df, outloc, fast=args.fast)