Ejemplo n.º 1
0
    def test_profile_ct_bincounts(self):
        """ Test the ability of sortseq_tools.profile_ct to count frequencies
        """

        print '\nIn test_profile_ct_bincounts...'
        library_files = glob.glob(self.input_dir+'library_*.txt')
        library_files += glob.glob(self.input_dir+'dataset_*.txt')
        good_bin_num = 2
        bad_bin_num = 5
        for file_name in library_files:
            print '\t%s ='%file_name,
            description = file_name.split('_')[-1].split('.')[0]
            executable = lambda:\
                profile_ct.main(io.load_dataset(file_name),bin=good_bin_num)
            print '(bin=%d)'%good_bin_num,

            # If bad or library, then profile_ct.main should raise SortSeqError
            if ('_bad' in file_name) or ('library' in file_name):
                try:
                    self.assertRaises(SortSeqError,executable)
                    print 'badtype,',
                except:
                    print 'good (ERROR).'
                    raise

            # If good, then profile_ct.main should produce a valid df
            elif ('_good' in file_name) or ('dataset' in file_name):
                try:
                    df = executable()
                    qc.validate_profile_ct(df)
                    out_file = self.output_dir+\
                        'profile_ct_bin_%s.txt'%description
                    io.write(df,out_file)
                    io.load_profile_ct(out_file)
                    print 'good,',

                except:
                    print 'bad (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')

            # Should always raise an error if bin num is too large
            executable = lambda:\
                profile_ct.main(io.load_dataset(file_name),bin=bad_bin_num)
            print '(bin=%d)'%bad_bin_num,
            try:
                self.assertRaises(SortSeqError,executable)
                print 'badtype.'
            except:
                print 'good (ERROR).'
                raise
Ejemplo n.º 2
0
def main(filelist_df,tags_df=None,indir='./',seq_type=None):
    """ Merges datasets listed in the filelist_df dataframe
    """

    # Validate filelist
    qc.validate_filelist(filelist_df)

    # Read datasets into dictionary indexed by bin number
    dataset_df_dict = {}
    for item in filelist_df.iterrows():
        # Autodetect fasta, fastq, or text file based on file extension
        fn = indir+item[1]['file']
        b = item[1]['bin']
        if re.search(fasta_filename_patterns,fn):
            df = io.load_dataset(fn,file_type='fasta',seq_type=seq_type)
        elif re.search(fastq_filename_patterns,fn):
            df = io.load_dataset(fn,file_type='fastq',seq_type=seq_type)
        else:
            df = io.load_dataset(fn,file_type='text',seq_type=seq_type)
        dataset_df_dict[b] = df

    # Merge datasets into one
    out_df = merge_datasets(dataset_df_dict)

    # Add seqs if given tags_df
    if not tags_df is None:
        qc.validate_tagkey(tags_df)
        tag_col = 'tag'

        # Test to make sure all tags in dataset are a subset of tags
        data_tags = set(out_df[tag_col])
        all_tags = set(tags_df[tag_col])
        if not (data_tags <= all_tags):
            sys.stderr.write('Some tags probably could not be identified.')

        # Get name of seq column       
        seq_cols = qc.get_cols_from_df(tags_df, 'seqs')
        if not len(seq_cols)==1:
            raise SortSeqError('Multiple seq columns; exaclty 1 required.')
        seq_col = seq_cols[0]

        # Set tag to be index column of dataframe
        tags_df = tags_df.set_index(tag_col)

        # Add seqs corresponding to each tag
        tags = out_df[tag_col]
        seqs = tags_df[seq_col][tags].values
        if not all([type(x)==str for x in seqs]):
            raise SortSeqError('Some looked-up seqs are not strings.')
        out_df[seq_col] = tags_df[seq_col][tags].values

    qc.validate_dataset(out_df)
    return out_df
Ejemplo n.º 3
0
def wrapper(args):

    #validate some of the input arguments
    qc.validate_input_arguments_for_learn_model(
        foreground=args.foreground,background=args.background,alpha=args.penalty,
        modeltype=args.modeltype,learningmethod=args.learningmethod,
        start=args.start,end=args.end,iteration=args.iteration,
        burnin=args.burnin,thin=args.thin,pseudocounts=args.pseudocounts,)

    inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin
    input_df = io.load_dataset(inloc)
    
    outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout
    #pdb.set_trace()

    output_df = main(input_df,lm=args.learningmethod,\
        modeltype=args.modeltype,db=args.db_filename,\
        LS_means_std=args.LS_means_std,\
        iteration=args.iteration,\
        burnin=args.burnin,thin=args.thin,start=args.start,end=args.end,\
        runnum=args.runnum,initialize=args.initialize,\
        foreground=args.foreground,background=args.background,\
        alpha=args.penalty,pseudocounts=args.pseudocounts,
        verbose=args.verbose)

    io.write(output_df,outloc)
Ejemplo n.º 4
0
    def test_preprocess(self):
        """ Test the ability of sortseq_tools.preprocess to collate data in multiple sequence files
        """

        print '\nIn test_preprocess...'
        file_names = glob.glob(self.input_dir+'files_*.txt')

        # Make sure there are files to test
        self.assertTrue(len(file_names)>0)

        for file_name in file_names:
            print '\t%s ='%file_name,
            description = file_name.split('_')[-1].split('.')[0]

            # If fasta or fastq, assume dna
            if ('fasta' in file_name) or ('fastq' in file_name):
                seq_type = 'dna'
            else:
                seq_type = None

            executable = lambda: preprocess.main(io.load_filelist(file_name),indir=self.input_dir, seq_type=seq_type)

            # If _good_, then preprocess.main should produce a valid df
            if ('_good' in file_name) or ('_fix' in file_name):
                try:
                    df = executable()
                    qc.validate_dataset(df)
                    out_file = self.output_dir+'dataset_%s.txt'%description
                    io.write(df,out_file)       # Test write
                    io.load_dataset(out_file)   # Test loading
                    print 'good.'
                except:
                    print 'bad (ERROR).'
                    raise

            # If _bad, then preprocess.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError,executable)
                    print 'badtype.'
                except:
                    print 'good (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
Ejemplo n.º 5
0
def wrapper(args):
    """ Commandline wrapper for main()
    """ 
    inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin
    outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout
    input_df = io.load_dataset(inloc)
    output_df = main(input_df,bin=args.bin,start=args.start,end=args.end)
    io.write(output_df,outloc)
Ejemplo n.º 6
0
def wrapper(args):
    inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin
    dataset_df = io.load_dataset(inloc)
    model_df = io.load_model(args.model)
    output_df = main(dataset_df=dataset_df, model_df=model_df,\
        left=args.left, right=args.right)
    outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout
    io.write(output_df,outloc,fast=args.fast)
Ejemplo n.º 7
0
def wrapper(args):
    """ Commandline wrapper for main()
    """
    inloc = io.validate_file_for_reading(args.i) if args.i else sys.stdin
    outloc = io.validate_file_for_writing(args.out) if args.out else sys.stdout
    input_df = io.load_dataset(inloc)
    output_df = main(input_df, start=args.start,end=args.end,\
        err=args.err, method=args.method, pseudocount=args.pseudocount)
    io.write(output_df,outloc)
Ejemplo n.º 8
0
def wrapper(args):
    
    data_df = io.load_dataset(args.dataset)    	    
    # Take input from standard input or through the -i flag.
    if args.model:
        model_df = io.load_model(args.model)
    else:
        model_df = io.load_model(sys.stdin)
    MI,Std = main(
        data_df,model_df,start=args.start,
        end=args.end,err=args.err)
    output_df = pd.DataFrame([MI],columns=['info'])
    if args.err:
        output_df = pd.concat([output_df,pd.Series(Std,name='info_err')],axis=1)
  
    if args.out:
        outloc = open(args.out,'w')
    else:
        outloc = sys.stdout
    pd.set_option('max_colwidth',int(1e8))
    output_df.to_string(
        outloc, index=False,col_space=10,float_format=utils.format_string)
Ejemplo n.º 9
0
    def test_profile_info(self):
        """ Test the ability of sortseq_tools.profile_info to compute mutation rates based on total count values
        """

        print '\nIn test_profile_info...'
        file_names = glob.glob(self.input_dir+'dataset_*.txt')
        for err in [True,False]:
            for file_name in file_names:
                print '\t%s, err=%s ='%(file_name,str(err)),
                description = file_name.split('_')[-1].split('.')[0]
                executable = lambda: \
                    profile_info.main(io.load_dataset(file_name),err=err)

                # If good, then profile_info.main should produce a valid df
                if '_good' in file_name:
                    try:
                        df = executable()
                        qc.validate_profile_info(df)
                        out_file = self.output_dir+\
                            'profile_info_%s_err_%s.txt'%(description,str(err))
                        io.write(df,out_file)
                        io.load_profile_info(out_file)
                        print 'good.'
                    except:
                        print 'bad (ERROR).'
                        raise

                # If bad, then profile_info.main should raise SortSeqError
                elif '_bad' in file_name:
                    try:
                        self.assertRaises(SortSeqError,executable)
                        print 'badtype.'
                    except:
                        print 'good (ERROR).'
                        raise

                # There are no other options
                else:
                    raise SortSeqError('Unrecognized class of file_name.')
Ejemplo n.º 10
0
    def test_profile_ct_totalcounts(self):
        """ Test the ability of sortseq_tools.profile_ct to count frequencies based on total count values
        """

        print '\nIn test_profile_ct_totalcounts...'
        library_files = glob.glob(self.input_dir+'library_*.txt')
        library_files += glob.glob(self.input_dir+'dataset_*.txt')
        for file_name in library_files:
            print '\t%s ='%file_name,
            description = file_name.split('_')[-1].split('.')[0]
            executable = lambda: profile_ct.main(io.load_dataset(file_name))

            # If good, then profile_ct.main should produce a valid df
            if '_good' in file_name:
                try:
                    df = executable()
                    qc.validate_profile_ct(df)
                    out_file = self.output_dir+\
                        'profile_ct_total_%s.txt'%description
                    io.write(df,out_file)
                    io.load_profile_ct(out_file)
                    print 'good.'
                except:
                    print 'bad (ERROR).'
                    raise

            # If bad, then profile_ct.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError,executable)
                    print 'badtype.'
                except:
                    print 'good (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
Ejemplo n.º 11
0
import sortseq_tools.io as io
import sortseq_tools.qc as qc
import re
import pdb
from sortseq_tools import SortSeqError

# Filetypes and corrsponding load functions
filetype_to_loadfunc_dict = {
    'filelist'      :   io.load_filelist,
    'profile_info'  :   io.load_profile_info,
    'profile_mut'   :   io.load_profile_mut,
    'profile_ct'    :   io.load_profile_ct,
    'profile_freq'  :   io.load_profile_freq,
    'dataset'       :   io.load_dataset,
    'dataset_fasta_dna'     :   \
        lambda f: io.load_dataset(f,file_type='fasta',seq_type='dna'),
    'dataset_fasta_rna'     :   \
        lambda f: io.load_dataset(f,file_type='fasta',seq_type='rna'),
    'dataset_fasta_protein' :   \
        lambda f: io.load_dataset(f,file_type='fasta',seq_type='protein'),
    'dataset_fastq' :   \
        lambda f: io.load_dataset(f,file_type='fastq'),
    'model'         :   io.load_model,
    'tagkey'        :   io.load_tagkey,
    'meanstd'       :   io.load_meanstd,
    'sitelist'      :   io.load_sitelist
}
filetypes = filetype_to_loadfunc_dict.keys()

# Define commandline wrapper
def wrapper(args):
Ejemplo n.º 12
0
    def test_profile_ct_seqslicing(self):
        """ Test the ability of sortseq_tools.profile_ct to slice sequences properly, and to raise the correct errors
        """

        print '\nIn test_profile_ct_seqslicing...'
        library_files = glob.glob(self.input_dir+'library_*.txt')
        library_files += glob.glob(self.input_dir+'dataset_*.txt')
        for file_name in library_files:
            print '\t%s ='%file_name,
            description = file_name.split('_')[-1].split('.')[0]
            executable_good1 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    start=2,end=10)
            executable_good2 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    start=2)
            executable_good3 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    end=2)
            executable_nopro =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    start=50,end=60)
            executable_bad1 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    start=-1)
            executable_bad2 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    end=100)
            executable_bad3 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    start=20,end=10)

            # If good, then sequences will be valid
            if 'good' in file_name:
                try:
                    df = executable_good1()
                    io.write(df,self.output_dir+\
                        'profile_ct_splice2-10_%s.txt'%description)
                    executable_good2()
                    executable_good3()
                    self.assertRaises(SortSeqError,executable_bad1)
                    self.assertRaises(SortSeqError,executable_bad2)
                    self.assertRaises(SortSeqError,executable_bad3)
                    if '_pro' in file_name:
                        self.assertRaises(SortSeqError,executable_nopro)
                    else:
                        df = executable_nopro()
                    print 'ok.'
                except:
                    print 'ok (ERROR).'
                    raise

            # If bad, then profile_ct.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError,executable_good1)
                    self.assertRaises(SortSeqError,executable_good2)
                    self.assertRaises(SortSeqError,executable_good3)
                    self.assertRaises(SortSeqError,executable_nopro)
                    self.assertRaises(SortSeqError,executable_bad1)
                    self.assertRaises(SortSeqError,executable_bad2)
                    self.assertRaises(SortSeqError,executable_bad3)
                    print 'ok.'
                except:
                    print 'not ok (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
import pandas as pd
import sortseq_tools.qc as qc
import sortseq_tools.io as io
import os
import sortseq_tools.profile_ct as profile_ct
import pdb
from sortseq_tools import SortSeqError
import cProfile
import sortseq_tools.profile_info as profile_info
import sortseq_tools.learn_model as learn_model
import sortseq_tools.predictiveinfo as predictiveinfo
import pstats

#load in data sets for the test, we will just use the sort-seq crp-wt set

df = io.load_dataset('input/mpra.txt')
model_df = io.load_model('input/mpra_model')

#Profile profile_info
#stats_fn = 'Profile_profile_info'
#stats_fn_hr = 'Profile_profile_info_hr'
#Profile.run('''profile_info.main(df,method='nsb')''',stats_fn)

#Reformat and print to human readable profile
#p = pstats.Stats(stats_fn,stream=open(stats_fn_hr,'w'))
#p.strip_dirs()
#p.sort_stats('cumtime')
#p.print_stats()

df_copy = df.copy()
#profile learn_model lm=LS
Ejemplo n.º 14
0
import pandas as pd
import sortseq_tools.qc as qc
import sortseq_tools.io as io
import os
import sortseq_tools.profile_ct as profile_ct
import pdb
from sortseq_tools import SortSeqError
import cProfile
import sortseq_tools.profile_info as profile_info
import sortseq_tools.learn_model as learn_model
import sortseq_tools.predictiveinfo as predictiveinfo
import pstats

#load in data sets for the test, we will just use the sort-seq crp-wt set

df = io.load_dataset('input/dms_1_formatted')
model_df = io.load_model('input/dms_1_model')

#Profile profile_info
#stats_fn = 'Profile_profile_info'
#stats_fn_hr = 'Profile_profile_info_hr'
#Profile.run('''profile_info.main(df,method='nsb')''',stats_fn)

#Reformat and print to human readable profile
#p = pstats.Stats(stats_fn,stream=open(stats_fn_hr,'w'))
#p.strip_dirs()
#p.sort_stats('cumtime')
#p.print_stats()

df_copy = df.copy()
#profile learn_model lm=LS
Ejemplo n.º 15
0
import pandas as pd
import sortseq_tools.qc as qc
import sortseq_tools.io as io
import os
import sortseq_tools.profile_ct as profile_ct
import pdb
from sortseq_tools import SortSeqError
import cProfile
import sortseq_tools.profile_info as profile_info
import sortseq_tools.learn_model as learn_model
import sortseq_tools.predictiveinfo as predictiveinfo
import pstats

#load in data sets for the test, we will just use the sort-seq crp-wt set

df = io.load_dataset('input/rnap-wt-format.txt')
model_df = io.load_model('input/rnap_model')

#Profile profile_info
#stats_fn = 'Profile_profile_info'
#stats_fn_hr = 'Profile_profile_info_hr'
#Profile.run('''profile_info.main(df,method='nsb')''',stats_fn)

#Reformat and print to human readable profile
#p = pstats.Stats(stats_fn,stream=open(stats_fn_hr,'w'))
#p.strip_dirs()
#p.sort_stats('cumtime')
#p.print_stats()

df_copy = df.copy()
#profile learn_model lm=LS