def test_profile_ct_bincounts(self):
    """
    Test the ability of sortseq_tools.profile_ct to count frequencies
    when a specific bin is requested via the bin keyword argument.

    Files matching library_*.txt and dataset_*.txt in self.input_dir are
    used as fixtures. Library files (and any *_bad* file) must make
    profile_ct.main raise SortSeqError when a bin is requested; good
    dataset files must yield a valid counts dataframe. Requesting a bin
    number absent from the data must always raise SortSeqError.
    """
    print '\nIn test_profile_ct_bincounts...'
    library_files = glob.glob(self.input_dir+'library_*.txt')
    library_files += glob.glob(self.input_dir+'dataset_*.txt')
    # bin 2 exists in the good fixtures; bin 5 does not (too large)
    good_bin_num = 2
    bad_bin_num = 5
    for file_name in library_files:
        # Trailing comma keeps the status report on one line (Python 2 print)
        print '\t%s ='%file_name,
        description = file_name.split('_')[-1].split('.')[0]
        executable = lambda: \
            profile_ct.main(io.load_dataset(file_name),bin=good_bin_num)
        print '(bin=%d)'%good_bin_num,

        # If bad or library, then profile_ct.main should raise SortSeqError
        if ('_bad' in file_name) or ('library' in file_name):
            try:
                self.assertRaises(SortSeqError,executable)
                print 'badtype,',
            except:
                # assertRaises itself failed: report and re-raise so the
                # test framework records the failure
                print 'good (ERROR).'
                raise

        # If good, then profile_ct.main should produce a valid df
        elif ('_good' in file_name) or ('dataset' in file_name):
            try:
                df = executable()
                qc.validate_profile_ct(df)
                # Round-trip through disk to exercise write/load as well
                out_file = self.output_dir+\
                    'profile_ct_bin_%s.txt'%description
                io.write(df,out_file)
                io.load_profile_ct(out_file)
                print 'good,',
            except:
                print 'bad (ERROR).'
                raise

        # There are no other options
        else:
            raise SortSeqError('Unrecognized class of file_name.')

        # Should always raise an error if bin num is too large
        executable = lambda: \
            profile_ct.main(io.load_dataset(file_name),bin=bad_bin_num)
        print '(bin=%d)'%bad_bin_num,
        try:
            self.assertRaises(SortSeqError,executable)
            print 'badtype.'
        except:
            print 'good (ERROR).'
            raise
def main(filelist_df,tags_df=None,indir='./',seq_type=None):
    """
    Merge the datasets listed in a filelist dataframe into one dataset.

    Arguments:
        filelist_df: dataframe with 'file' and 'bin' columns, validated by
            qc.validate_filelist. Each listed file is loaded as a dataset.
        tags_df: optional tag-key dataframe (validated by qc.validate_tagkey).
            When given, the sequence for each tag in the merged dataset is
            looked up and appended as a new column.
        indir: directory prefix prepended to each listed file name.
        seq_type: passed through to io.load_dataset.

    Returns:
        A merged dataset dataframe, validated by qc.validate_dataset.

    Raises:
        SortSeqError: if tags_df has other than one seq column, or if any
            looked-up seq is not a string (e.g. a tag missing from the key).
    """
    # Validate filelist
    qc.validate_filelist(filelist_df)

    # Read datasets into dictionary indexed by bin number.
    # File format (fasta / fastq / text) is autodetected from the extension.
    dataset_df_dict = {}
    for item in filelist_df.iterrows():
        fn = indir+item[1]['file']
        b = item[1]['bin']
        if re.search(fasta_filename_patterns,fn):
            df = io.load_dataset(fn,file_type='fasta',seq_type=seq_type)
        elif re.search(fastq_filename_patterns,fn):
            df = io.load_dataset(fn,file_type='fastq',seq_type=seq_type)
        else:
            df = io.load_dataset(fn,file_type='text',seq_type=seq_type)
        dataset_df_dict[b] = df

    # Merge datasets into one
    out_df = merge_datasets(dataset_df_dict)

    # Add seqs if given tags_df
    if tags_df is not None:
        qc.validate_tagkey(tags_df)
        tag_col = 'tag'

        # Warn (but do not fail) if the dataset contains tags that are not
        # in the tag key; the string-type check below catches hard failures.
        data_tags = set(out_df[tag_col])
        all_tags = set(tags_df[tag_col])
        if not (data_tags <= all_tags):
            sys.stderr.write('Some tags probably could not be identified.')

        # Get name of seq column; the tag key must have exactly one
        seq_cols = qc.get_cols_from_df(tags_df, 'seqs')
        if len(seq_cols) != 1:
            raise SortSeqError('Multiple seq columns; exactly 1 required.')
        seq_col = seq_cols[0]

        # Set tag to be index column of dataframe
        tags_df = tags_df.set_index(tag_col)

        # Look up the seq for each tag; compute once and reuse the result
        tags = out_df[tag_col]
        seqs = tags_df[seq_col][tags].values
        if not all(isinstance(x, str) for x in seqs):
            raise SortSeqError('Some looked-up seqs are not strings.')
        out_df[seq_col] = seqs

    qc.validate_dataset(out_df)
    return out_df
def wrapper(args):
    """ Commandline wrapper for main(): validates arguments, reads the
    input dataset, learns a model, and writes it out. """
    # Sanity-check the learning-related arguments before doing any work
    qc.validate_input_arguments_for_learn_model(
        foreground=args.foreground, background=args.background,
        alpha=args.penalty, modeltype=args.modeltype,
        learningmethod=args.learningmethod, start=args.start, end=args.end,
        iteration=args.iteration, burnin=args.burnin, thin=args.thin,
        pseudocounts=args.pseudocounts,)

    # Fall back to stdin / stdout when the flags are absent
    inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin
    input_df = io.load_dataset(inloc)
    outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout

    output_df = main(
        input_df,
        lm=args.learningmethod,
        modeltype=args.modeltype,
        db=args.db_filename,
        LS_means_std=args.LS_means_std,
        iteration=args.iteration,
        burnin=args.burnin,
        thin=args.thin,
        start=args.start,
        end=args.end,
        runnum=args.runnum,
        initialize=args.initialize,
        foreground=args.foreground,
        background=args.background,
        alpha=args.penalty,
        pseudocounts=args.pseudocounts,
        verbose=args.verbose)
    io.write(output_df, outloc)
def test_preprocess(self):
    """
    Test the ability of sortseq_tools.preprocess to collate data in
    multiple sequence files listed in files_*.txt filelists.
    """
    print '\nIn test_preprocess...'
    file_names = glob.glob(self.input_dir+'files_*.txt')

    # Make sure there are files to test
    self.assertTrue(len(file_names)>0)

    for file_name in file_names:
        # Trailing comma keeps the status on one line (Python 2 print)
        print '\t%s ='%file_name,
        description = file_name.split('_')[-1].split('.')[0]

        # If fasta or fastq, assume dna; otherwise let preprocess autodetect
        if ('fasta' in file_name) or ('fastq' in file_name):
            seq_type = 'dna'
        else:
            seq_type = None
        executable = lambda: preprocess.main(io.load_filelist(file_name),
            indir=self.input_dir, seq_type=seq_type)

        # If _good_, then preprocess.main should produce a valid df
        if ('_good' in file_name) or ('_fix' in file_name):
            try:
                df = executable()
                qc.validate_dataset(df)
                out_file = self.output_dir+'dataset_%s.txt'%description
                io.write(df,out_file)       # Test write
                io.load_dataset(out_file)   # Test loading
                print 'good.'
            except:
                # Report then re-raise so the test framework sees the failure
                print 'bad (ERROR).'
                raise

        # If _bad, then preprocess.main should raise SortSeqError
        elif '_bad' in file_name:
            try:
                self.assertRaises(SortSeqError,executable)
                print 'badtype.'
            except:
                print 'good (ERROR).'
                raise

        # There are no other options
        else:
            raise SortSeqError('Unrecognized class of file_name.')
def wrapper(args):
    """ Commandline wrapper for main(): reads a dataset, profiles counts
    for the requested bin/range, and writes the result. """
    # Resolve input and output locations, defaulting to stdin/stdout
    if args.i:
        inloc = io.validate_file_for_reading(args.i)
    else:
        inloc = sys.stdin
    if args.out:
        outloc = io.validate_file_for_writing(args.out)
    else:
        outloc = sys.stdout

    input_df = io.load_dataset(inloc)
    output_df = main(input_df, bin=args.bin, start=args.start, end=args.end)
    io.write(output_df, outloc)
def wrapper(args):
    """ Commandline wrapper for main(): evaluates a model on a dataset
    and writes the resulting dataframe. """
    # Input dataset comes from -i or stdin
    if args.i:
        inloc = io.validate_file_for_reading(args.i)
    else:
        inloc = sys.stdin
    dataset_df = io.load_dataset(inloc)
    model_df = io.load_model(args.model)

    output_df = main(dataset_df=dataset_df, model_df=model_df,
                     left=args.left, right=args.right)

    # Output location is validated only after main() succeeds,
    # matching the original call order
    if args.out:
        outloc = io.validate_file_for_writing(args.out)
    else:
        outloc = sys.stdout
    io.write(output_df, outloc, fast=args.fast)
def wrapper(args):
    """ Commandline wrapper for main(): reads a dataset, computes a
    profile over the requested range, and writes the result. """
    # Default to stdin/stdout when the corresponding flag is absent
    source = io.validate_file_for_reading(args.i) if args.i else sys.stdin
    sink = io.validate_file_for_writing(args.out) if args.out else sys.stdout

    dataset_df = io.load_dataset(source)
    result_df = main(
        dataset_df,
        start=args.start,
        end=args.end,
        err=args.err,
        method=args.method,
        pseudocount=args.pseudocount)
    io.write(result_df, sink)
def wrapper(args):
    """
    Commandline wrapper for main(): computes predictive information (and
    optionally its standard error) of a model on a dataset, and writes a
    one-row dataframe to args.out or stdout.
    """
    data_df = io.load_dataset(args.dataset)

    # Take model from the -m flag, or from standard input if absent
    if args.model:
        model_df = io.load_model(args.model)
    else:
        model_df = io.load_model(sys.stdin)

    MI,Std = main(
        data_df,model_df,start=args.start,
        end=args.end,err=args.err)

    # Assemble a one-row output dataframe; add the error column on request
    output_df = pd.DataFrame([MI],columns=['info'])
    if args.err:
        output_df = pd.concat([output_df,pd.Series(Std,name='info_err')],axis=1)

    if args.out:
        outloc = open(args.out,'w')
    else:
        outloc = sys.stdout

    # Keep long columns from being truncated in the text output
    pd.set_option('max_colwidth',int(1e8))
    try:
        output_df.to_string(
            outloc, index=False,col_space=10,float_format=utils.format_string)
    finally:
        # Close the handle we opened; never close stdout
        if args.out:
            outloc.close()
def test_profile_info(self):
    """
    Test the ability of sortseq_tools.profile_info to run on dataset
    fixtures, with and without error estimation (err=True/False).

    NOTE(review): the original docstring said "compute mutation rates
    based on total count values", which looks copied from a profile_mut
    test — confirm and reword.
    """
    print '\nIn test_profile_info...'
    file_names = glob.glob(self.input_dir+'dataset_*.txt')
    # Exercise both error-estimation modes on every fixture
    for err in [True,False]:
        for file_name in file_names:
            # Trailing comma keeps the status on one line (Python 2 print)
            print '\t%s, err=%s ='%(file_name,str(err)),
            description = file_name.split('_')[-1].split('.')[0]
            executable = lambda: \
                profile_info.main(io.load_dataset(file_name),err=err)

            # If good, then profile_info.main should produce a valid df
            if '_good' in file_name:
                try:
                    df = executable()
                    qc.validate_profile_info(df)
                    # Round-trip through disk to exercise write/load too
                    out_file = self.output_dir+\
                        'profile_info_%s_err_%s.txt'%(description,str(err))
                    io.write(df,out_file)
                    io.load_profile_info(out_file)
                    print 'good.'
                except:
                    print 'bad (ERROR).'
                    raise

            # If bad, then profile_info.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError,executable)
                    print 'badtype.'
                except:
                    print 'good (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
def test_profile_ct_totalcounts(self): """ Test the ability of sortseq_tools.profile_ct to count frequencies based on total count values """ print '\nIn test_profile_ct_totalcounts...' library_files = glob.glob(self.input_dir+'library_*.txt') library_files += glob.glob(self.input_dir+'dataset_*.txt') for file_name in library_files: print '\t%s ='%file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda: profile_ct.main(io.load_dataset(file_name)) # If good, then profile_ct.main should produce a valid df if '_good' in file_name: try: df = executable() qc.validate_profile_ct(df) out_file = self.output_dir+\ 'profile_ct_total_%s.txt'%description io.write(df,out_file) io.load_profile_ct(out_file) print 'good.' except: print 'bad (ERROR).' raise # If bad, then profile_ct.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError,executable) print 'badtype.' except: print 'good (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')
import sortseq_tools.io as io
import sortseq_tools.qc as qc
import re
import pdb
from sortseq_tools import SortSeqError

# Filetypes and corresponding load functions.
# Maps each supported filetype name to the io loader that parses it.
# The fasta/fastq dataset variants need extra keyword arguments, so they
# are wrapped in single-argument lambdas taking the file to load.
filetype_to_loadfunc_dict = {
    'filelist' : io.load_filelist,
    'profile_info' : io.load_profile_info,
    'profile_mut' : io.load_profile_mut,
    'profile_ct' : io.load_profile_ct,
    'profile_freq' : io.load_profile_freq,
    'dataset' : io.load_dataset,
    'dataset_fasta_dna' : \
        lambda f: io.load_dataset(f,file_type='fasta',seq_type='dna'),
    'dataset_fasta_rna' : \
        lambda f: io.load_dataset(f,file_type='fasta',seq_type='rna'),
    'dataset_fasta_protein' : \
        lambda f: io.load_dataset(f,file_type='fasta',seq_type='protein'),
    'dataset_fastq' : \
        lambda f: io.load_dataset(f,file_type='fastq'),
    'model' : io.load_model,
    'tagkey' : io.load_tagkey,
    'meanstd' : io.load_meanstd,
    'sitelist' : io.load_sitelist
    }

# Names of all recognized filetypes (the dict keys)
filetypes = filetype_to_loadfunc_dict.keys()

# Define commandline wrapper
def wrapper(args):
def test_profile_ct_seqslicing(self):
    """
    Test the ability of sortseq_tools.profile_ct to slice sequences
    properly, and to raise the correct errors for invalid start/end
    arguments.
    """
    print '\nIn test_profile_ct_seqslicing...'
    library_files = glob.glob(self.input_dir+'library_*.txt')
    library_files += glob.glob(self.input_dir+'dataset_*.txt')
    for file_name in library_files:
        # Trailing comma keeps the status on one line (Python 2 print)
        print '\t%s ='%file_name,
        description = file_name.split('_')[-1].split('.')[0]

        # Slices that must succeed on valid (non-protein) data
        executable_good1 = \
            lambda: profile_ct.main(io.load_dataset(file_name),
            start=2,end=10)
        executable_good2 = \
            lambda: profile_ct.main(io.load_dataset(file_name),
            start=2)
        executable_good3 = \
            lambda: profile_ct.main(io.load_dataset(file_name),
            end=2)
        # Slice beyond the length of protein sequences (ok otherwise)
        executable_nopro = \
            lambda: profile_ct.main(io.load_dataset(file_name),
            start=50,end=60)
        # Slices that must always fail: negative start, end past the
        # sequence, start after end
        executable_bad1 = \
            lambda: profile_ct.main(io.load_dataset(file_name),
            start=-1)
        executable_bad2 = \
            lambda: profile_ct.main(io.load_dataset(file_name),
            end=100)
        executable_bad3 = \
            lambda: profile_ct.main(io.load_dataset(file_name),
            start=20,end=10)

        # If good, then sequences will be valid
        if 'good' in file_name:
            try:
                df = executable_good1()
                io.write(df,self.output_dir+\
                    'profile_ct_splice2-10_%s.txt'%description)
                executable_good2()
                executable_good3()
                self.assertRaises(SortSeqError,executable_bad1)
                self.assertRaises(SortSeqError,executable_bad2)
                self.assertRaises(SortSeqError,executable_bad3)
                # Protein fixtures are too short for the 50-60 slice
                if '_pro' in file_name:
                    self.assertRaises(SortSeqError,executable_nopro)
                else:
                    df = executable_nopro()
                print 'ok.'
            except:
                # NOTE(review): this failure message reads 'ok (ERROR).'
                # while the sibling branch uses 'not ok (ERROR).' —
                # possibly a typo; confirm intent before changing.
                print 'ok (ERROR).'
                raise

        # If bad, then profile_ct.main should raise SortSeqError
        elif '_bad' in file_name:
            try:
                self.assertRaises(SortSeqError,executable_good1)
                self.assertRaises(SortSeqError,executable_good2)
                self.assertRaises(SortSeqError,executable_good3)
                self.assertRaises(SortSeqError,executable_nopro)
                self.assertRaises(SortSeqError,executable_bad1)
                self.assertRaises(SortSeqError,executable_bad2)
                self.assertRaises(SortSeqError,executable_bad3)
                print 'ok.'
            except:
                print 'not ok (ERROR).'
                raise

        # There are no other options
        else:
            raise SortSeqError('Unrecognized class of file_name.')
import pandas as pd import sortseq_tools.qc as qc import sortseq_tools.io as io import os import sortseq_tools.profile_ct as profile_ct import pdb from sortseq_tools import SortSeqError import cProfile import sortseq_tools.profile_info as profile_info import sortseq_tools.learn_model as learn_model import sortseq_tools.predictiveinfo as predictiveinfo import pstats #load in data sets for the test, we will just use the sort-seq crp-wt set df = io.load_dataset('input/mpra.txt') model_df = io.load_model('input/mpra_model') #Profile profile_info #stats_fn = 'Profile_profile_info' #stats_fn_hr = 'Profile_profile_info_hr' #Profile.run('''profile_info.main(df,method='nsb')''',stats_fn) #Reformat and print to human readable profile #p = pstats.Stats(stats_fn,stream=open(stats_fn_hr,'w')) #p.strip_dirs() #p.sort_stats('cumtime') #p.print_stats() df_copy = df.copy() #profile learn_model lm=LS
import pandas as pd import sortseq_tools.qc as qc import sortseq_tools.io as io import os import sortseq_tools.profile_ct as profile_ct import pdb from sortseq_tools import SortSeqError import cProfile import sortseq_tools.profile_info as profile_info import sortseq_tools.learn_model as learn_model import sortseq_tools.predictiveinfo as predictiveinfo import pstats #load in data sets for the test, we will just use the sort-seq crp-wt set df = io.load_dataset('input/dms_1_formatted') model_df = io.load_model('input/dms_1_model') #Profile profile_info #stats_fn = 'Profile_profile_info' #stats_fn_hr = 'Profile_profile_info_hr' #Profile.run('''profile_info.main(df,method='nsb')''',stats_fn) #Reformat and print to human readable profile #p = pstats.Stats(stats_fn,stream=open(stats_fn_hr,'w')) #p.strip_dirs() #p.sort_stats('cumtime') #p.print_stats() df_copy = df.copy() #profile learn_model lm=LS
import pandas as pd import sortseq_tools.qc as qc import sortseq_tools.io as io import os import sortseq_tools.profile_ct as profile_ct import pdb from sortseq_tools import SortSeqError import cProfile import sortseq_tools.profile_info as profile_info import sortseq_tools.learn_model as learn_model import sortseq_tools.predictiveinfo as predictiveinfo import pstats #load in data sets for the test, we will just use the sort-seq crp-wt set df = io.load_dataset('input/rnap-wt-format.txt') model_df = io.load_model('input/rnap_model') #Profile profile_info #stats_fn = 'Profile_profile_info' #stats_fn_hr = 'Profile_profile_info_hr' #Profile.run('''profile_info.main(df,method='nsb')''',stats_fn) #Reformat and print to human readable profile #p = pstats.Stats(stats_fn,stream=open(stats_fn_hr,'w')) #p.strip_dirs() #p.sort_stats('cumtime') #p.print_stats() df_copy = df.copy() #profile learn_model lm=LS