def wrapper(args):
    """
    Commandline wrapper for main(): reads a dataset, runs the noisy
    sorting simulation, and writes the sorted dataset.
    """
    # Parse noise parameters, e.g. "[0.2,0.4]" -> ['0.2','0.4'].
    # A bare "except:" previously swallowed ALL errors here; only an
    # unset (None) noiseparam should yield an empty parameter list.
    if args.noiseparam is not None:
        npar = args.noiseparam.strip('[').strip(']').split(',')
    else:
        npar = []
    nbins = args.nbins

    # Run function: load input dataset from file or stdin
    if args.i:
        df = pd.io.parsers.read_csv(
            args.i, delim_whitespace=True,
            dtype={'seqs': str, 'batch': int})
    else:
        df = pd.io.parsers.read_csv(
            sys.stdin, delim_whitespace=True,
            dtype={'seqs': str, 'batch': int})

    # A dataset that already has bin-count columns has been sorted before
    if len(utils.get_column_headers(df)) > 0:
        raise SortSeqError('Library already sorted!')

    model_df = io.load_model(args.model)
    output_df = main(
        df, model_df, args.noisemodel, npar,
        nbins, start=args.start, end=args.end)

    # Write result to the requested file, or stdout by default
    if args.out:
        outloc = open(args.out, 'w')
    else:
        outloc = sys.stdout
    # Prevent pandas from truncating long sequence columns on output
    pd.set_option('max_colwidth', int(1e8))

    # Validate dataframe for writing
    output_df = qc.validate_dataset(output_df, fix=True)
    io.write(output_df, outloc)
def wrapper(args):
    """Commandline wrapper for main(): learn a model from a dataset."""
    # Sanity-check the learning-related command line options up front
    qc.validate_input_arguments_for_learn_model(
        foreground=args.foreground, background=args.background,
        alpha=args.penalty, modeltype=args.modeltype,
        learningmethod=args.learningmethod,
        start=args.start, end=args.end, iteration=args.iteration,
        burnin=args.burnin, thin=args.thin,
        pseudocounts=args.pseudocounts,)

    # Resolve input/output locations (default to stdin/stdout)
    source = io.validate_file_for_reading(args.i) if args.i else sys.stdin
    input_df = io.load_dataset(source)
    sink = io.validate_file_for_writing(args.out) if args.out else sys.stdout

    output_df = main(
        input_df,
        lm=args.learningmethod,
        modeltype=args.modeltype,
        db=args.db_filename,
        LS_means_std=args.LS_means_std,
        iteration=args.iteration,
        burnin=args.burnin,
        thin=args.thin,
        start=args.start,
        end=args.end,
        runnum=args.runnum,
        initialize=args.initialize,
        foreground=args.foreground,
        background=args.background,
        alpha=args.penalty,
        pseudocounts=args.pseudocounts,
        verbose=args.verbose)
    io.write(output_df, sink)
def wrapper(args): """ Commandline wrapper for main() """ inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout input_df = io.load_dataset(inloc) output_df = main(input_df,bin=args.bin,start=args.start,end=args.end) io.write(output_df,outloc)
def wrapper(args): """ Commandline wrapper for main() """ output_df = main(wtseq=args.wtseq, mutrate=args.mutrate,\ numseq=args.numseqs,dicttype=args.type,tags=args.tags,\ tag_length=args.tag_length) outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout io.write(output_df,outloc)
def wrapper(args):
    """Commandline wrapper for main()."""
    # Read dataset from file or stdin, and the model from its own file
    source = io.validate_file_for_reading(args.i) if args.i else sys.stdin
    dataset_df = io.load_dataset(source)
    model_df = io.load_model(args.model)
    result_df = main(
        dataset_df=dataset_df,
        model_df=model_df,
        left=args.left,
        right=args.right)
    sink = io.validate_file_for_writing(args.out) if args.out else sys.stdout
    io.write(result_df, sink, fast=args.fast)
def wrapper(args): """ Commandline wrapper for main() """ inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout input_df = io.load_dataset(inloc) output_df = main(input_df, start=args.start,end=args.end,\ err=args.err, method=args.method, pseudocount=args.pseudocount) io.write(output_df,outloc)
def test_profile_ct_bincounts(self):
    """ Test the ability of sortseq_tools.profile_ct to count frequencies """
    print '\nIn test_profile_ct_bincounts...'
    # Collect both library and dataset input files
    library_files = glob.glob(self.input_dir+'library_*.txt')
    library_files += glob.glob(self.input_dir+'dataset_*.txt')
    # bin 2 presumably exists in the dataset fixtures while bin 5
    # does not -- TODO confirm against the fixture files
    good_bin_num = 2
    bad_bin_num = 5
    for file_name in library_files:
        print '\t%s ='%file_name,
        description = file_name.split('_')[-1].split('.')[0]
        executable = lambda:\
            profile_ct.main(io.load_dataset(file_name),bin=good_bin_num)
        print '(bin=%d)'%good_bin_num,

        # If bad or library, then profile_ct.main should raise SortSeqError
        if ('_bad' in file_name) or ('library' in file_name):
            try:
                self.assertRaises(SortSeqError,executable)
                print 'badtype,',
            except:
                # The test itself failed: main() did NOT raise as expected
                print 'good (ERROR).'
                raise

        # If good, then profile_ct.main should produce a valid df
        elif ('_good' in file_name) or ('dataset' in file_name):
            try:
                df = executable()
                qc.validate_profile_ct(df)
                # Round-trip the profile through disk to exercise io too
                out_file = self.output_dir+\
                    'profile_ct_bin_%s.txt'%description
                io.write(df,out_file)
                io.load_profile_ct(out_file)
                print 'good,',
            except:
                print 'bad (ERROR).'
                raise

        # There are no other options
        else:
            raise SortSeqError('Unrecognized class of file_name.')

        # Should always raise an error if bin num is too large
        executable = lambda:\
            profile_ct.main(io.load_dataset(file_name),bin=bad_bin_num)
        print '(bin=%d)'%bad_bin_num,
        try:
            self.assertRaises(SortSeqError,executable)
            print 'badtype.'
        except:
            print 'good (ERROR).'
            raise
def test_preprocess(self):
    """ Test the ability of sortseq_tools.preprocess
    to collate data in multiple sequence files """
    print '\nIn test_preprocess...'
    file_names = glob.glob(self.input_dir+'files_*.txt')

    # Make sure there are files to test
    self.assertTrue(len(file_names)>0)
    for file_name in file_names:
        print '\t%s ='%file_name,
        description = file_name.split('_')[-1].split('.')[0]

        # If fasta or fastq, assume dna
        # (seq_type=None presumably lets main() infer the type -- TODO confirm)
        if ('fasta' in file_name) or ('fastq' in file_name):
            seq_type = 'dna'
        else:
            seq_type = None

        executable = lambda: preprocess.main(io.load_filelist(file_name),indir=self.input_dir, seq_type=seq_type)

        # If _good_, then preprocess.main should produce a valid df
        if ('_good' in file_name) or ('_fix' in file_name):
            try:
                df = executable()
                qc.validate_dataset(df)
                out_file = self.output_dir+'dataset_%s.txt'%description
                io.write(df,out_file)        # Test write
                io.load_dataset(out_file)    # Test loading
                print 'good.'
            except:
                print 'bad (ERROR).'
                raise

        # If _bad, then preprocess.main should raise SortSeqError
        elif '_bad' in file_name:
            try:
                self.assertRaises(SortSeqError,executable)
                print 'badtype.'
            except:
                print 'good (ERROR).'
                raise

        # There are no other options
        else:
            raise SortSeqError('Unrecognized class of file_name.')
def wrapper(args): """ Wrapper for functions io.load_* and io.write """ # Determine input and output inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout try: # Get load function corresponding to file type func = filetype_to_loadfunc_dict[str(args.type)] # Run load function on input df = func(inloc) # Write df to stdout or to outfile io.write(df,outloc,fast=args.fast) except SortSeqError: raise
def wrapper(args): """ Commandline wrapper for main() """ inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout # Get filelist filelist_df = io.load_filelist(inloc) inloc.close() # Get tagkeys dataframe if provided if args.tagkeys: tagloc = io.validate_file_for_reading(args.tagkeys) tags_df = io.load_tagkey(tagloc) tagloc.close() else: tags_df = None output_df = main(filelist_df,tags_df=tags_df,seq_type=args.seqtype) io.write(output_df,outloc,fast=args.fast)
def generic_test(self,test_name,function_str,file_names,allbad=False):
    """
    Standardizes tests for different dataframe loading functions.
    The argument function_str must have "%s" where file_name goes.
    Example:
    generic_test('test_io_load_tagkey','io.load_tagkey("%s"),file_names)'

    If allbad is True, every file is expected to fail to load.
    """
    print '\nIn %s...'%test_name

    # Make sure there are files to test
    self.assertTrue(len(file_names)>0)

    # For each file, run test
    for file_name in file_names:
        # NOTE(review): eval() on a format string -- acceptable for
        # trusted test code, but function_str must never be user input
        executable = lambda: eval(function_str%file_name)
        print '\t%s ='%file_name,

        # Files tagged good/fix/badio/badtype are expected to load
        # successfully (the bad* tags presumably mark files that are
        # bad for OTHER loaders -- TODO confirm naming convention)
        if not allbad and any([c in file_name for c in \
            ['_good','_fix','_badio','_badtype']]):
            try:
                df = executable()
                self.assertTrue(df.shape[0]>=1)

                # Write df
                base_filename = file_name.split('/')[-1]
                io.write(df,self.output_dir+'loaded_'+base_filename)
                print 'good.'
            except:
                print 'bad (ERROR).'
                raise

        # Remaining _bad files (or every file when allbad) must raise
        elif allbad or ('_bad' in file_name):
            try:
                self.assertRaises(SortSeqError,executable)
                print 'bad.'
            except:
                print 'good (ERROR).'
                raise
        else:
            print 'what should I expect? (ERROR)'
            # NOTE(review): bare raise with no active exception will
            # itself error out -- likely intended as a hard failure
            raise
    print '\tDone.'
def test_profile_info(self): """ Test the ability of sortseq_tools.profile_info to compute mutation rates based on total count values """ print '\nIn test_profile_info...' file_names = glob.glob(self.input_dir+'dataset_*.txt') for err in [True,False]: for file_name in file_names: print '\t%s, err=%s ='%(file_name,str(err)), description = file_name.split('_')[-1].split('.')[0] executable = lambda: \ profile_info.main(io.load_dataset(file_name),err=err) # If good, then profile_info.main should produce a valid df if '_good' in file_name: try: df = executable() qc.validate_profile_info(df) out_file = self.output_dir+\ 'profile_info_%s_err_%s.txt'%(description,str(err)) io.write(df,out_file) io.load_profile_info(out_file) print 'good.' except: print 'bad (ERROR).' raise # If bad, then profile_info.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError,executable) print 'badtype.' except: print 'good (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')
def test_profile_ct_totalcounts(self): """ Test the ability of sortseq_tools.profile_ct to count frequencies based on total count values """ print '\nIn test_profile_ct_totalcounts...' library_files = glob.glob(self.input_dir+'library_*.txt') library_files += glob.glob(self.input_dir+'dataset_*.txt') for file_name in library_files: print '\t%s ='%file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda: profile_ct.main(io.load_dataset(file_name)) # If good, then profile_ct.main should produce a valid df if '_good' in file_name: try: df = executable() qc.validate_profile_ct(df) out_file = self.output_dir+\ 'profile_ct_total_%s.txt'%description io.write(df,out_file) io.load_profile_ct(out_file) print 'good.' except: print 'bad (ERROR).' raise # If bad, then profile_ct.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError,executable) print 'badtype.' except: print 'good (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')
def test_profile_ct_seqslicing(self):
    """ Test the ability of sortseq_tools.profile_ct to slice
    sequences properly, and to raise the correct errors """
    print '\nIn test_profile_ct_seqslicing...'
    library_files = glob.glob(self.input_dir+'library_*.txt')
    library_files += glob.glob(self.input_dir+'dataset_*.txt')
    for file_name in library_files:
        print '\t%s ='%file_name,
        description = file_name.split('_')[-1].split('.')[0]

        # Slices expected to succeed on valid files
        executable_good1 =\
            lambda: profile_ct.main(io.load_dataset(file_name),\
            start=2,end=10)
        executable_good2 =\
            lambda: profile_ct.main(io.load_dataset(file_name),\
            start=2)
        executable_good3 =\
            lambda: profile_ct.main(io.load_dataset(file_name),\
            end=2)
        # Slice at positions 50-60; fails only for _pro files below
        executable_nopro =\
            lambda: profile_ct.main(io.load_dataset(file_name),\
            start=50,end=60)
        # Invalid slices: negative start, end past sequence, start > end
        executable_bad1 =\
            lambda: profile_ct.main(io.load_dataset(file_name),\
            start=-1)
        executable_bad2 =\
            lambda: profile_ct.main(io.load_dataset(file_name),\
            end=100)
        executable_bad3 =\
            lambda: profile_ct.main(io.load_dataset(file_name),\
            start=20,end=10)

        # If good, then sequences will be valid
        if 'good' in file_name:
            try:
                df = executable_good1()
                io.write(df,self.output_dir+\
                    'profile_ct_splice2-10_%s.txt'%description)
                executable_good2()
                executable_good3()
                self.assertRaises(SortSeqError,executable_bad1)
                self.assertRaises(SortSeqError,executable_bad2)
                self.assertRaises(SortSeqError,executable_bad3)
                # _pro files are presumably too short for the 50-60
                # slice while other fixtures are long enough -- TODO
                # confirm against the fixture sequence lengths
                if '_pro' in file_name:
                    self.assertRaises(SortSeqError,executable_nopro)
                else:
                    df = executable_nopro()
                print 'ok.'
            except:
                print 'ok (ERROR).'
                raise

        # If bad, then profile_ct.main should raise SortSeqError
        # for every slice, valid-looking or not
        elif '_bad' in file_name:
            try:
                self.assertRaises(SortSeqError,executable_good1)
                self.assertRaises(SortSeqError,executable_good2)
                self.assertRaises(SortSeqError,executable_good3)
                self.assertRaises(SortSeqError,executable_nopro)
                self.assertRaises(SortSeqError,executable_bad1)
                self.assertRaises(SortSeqError,executable_bad2)
                self.assertRaises(SortSeqError,executable_bad3)
                print 'ok.'
            except:
                print 'not ok (ERROR).'
                raise

        # There are no other options
        else:
            raise SortSeqError('Unrecognized class of file_name.')
def wrapper(args): """ Wrapper for function for scan_model.main() """ # Prepare input to main model_df = io.load_model(args.model) seqtype, modeltype = qc.get_model_type(model_df) L = model_df.shape[0] if modeltype=='NBR': L += 1 chunksize = args.chunksize if not chunksize>0: raise SortSeqError(\ 'chunksize=%d must be positive'%chunksize) if args.numsites <= 0: raise SortSeqError('numsites=%d must be positive.'%args.numsites) if args.i and args.seq: raise SortSeqError('Cannot use flags -i and -s simultaneously.') # If sequence is provided manually if args.seq: pos_offset=0 contig_str = args.seq # Add a bit on end if circular if args.circular: contig_str += contig_str[:L-1] contig_list = [(contig_str,'manual',pos_offset)] # Otherwise, read sequence from FASTA file else: contig_list = [] inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin for i,record in enumerate(SeqIO.parse(inloc,'fasta')): name = record.name if record.name else 'contig_%d'%i # Split contig up into chunk)size bits full_contig_str = str(record.seq) # Add a bit on end if circular if args.circular: full_contig_str += full_contig_str[:L-1] # Define chunks containing chunksize sites start = 0 end = start+chunksize+L-1 while end < len(full_contig_str): contig_str = full_contig_str[start:end] contig_list.append((contig_str,name,start)) start += chunksize end = start+chunksize+L-1 contig_str = full_contig_str[start:] contig_list.append((contig_str,name,start)) if len(contig_list)==0: raise SortSeqError('No input sequences to read.') # Compute results outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout output_df = main(model_df,contig_list,numsites=args.numsites,\ verbose=args.verbose) # Write df to stdout or to outfile io.write(output_df,outloc,fast=args.fast)