def combine_asr_tables(output_files,verbose=False):
    """ Combine all tables coming from asr output. Cuts 2nd column out and joins them together into single table.
    Assumes all output files have same row identifiers and that these are in the same order.
    """

    #Going to store an array of arrays here
    combined_table=[]

    #load in the first column (containing row ids). File doesn't matter since they should all have identical first columns.
    table=LoadTable(filename=output_files[0],header=True,sep='\t')
    row_ids = table.getRawData(columns=[table.Header[0]])
    combined_table.append([table.Header[0]])
    for row_id in row_ids:
        combined_table.append([row_id])

    #Now add the rest of the files to the table
    for i,output_file in enumerate(output_files):
        if verbose:
            print "Combining file {0} of {1}: {2}".format(i,len(output_files),output_file)
        #pull out the second column (first column with actual predictions)
        table=LoadTable(filename=output_file,header=True,sep='\t')
        predictions = table.getRawData(columns=[table.Header[1]])

        #Add the header for our column to the list of headers
        combined_table[0].append(table.Header[1])

        #Add rest of values in the column
        j=1
        for prediction in predictions:
            combined_table[j].append(prediction)
            j+=1

    return combined_table
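A minimal usage sketch (the file names are hypothetical placeholders): the nested list returned by combine_asr_tables can be fed straight into a cogent Table, as the parallel driver further down in this listing does.

from cogent import LoadTable

# hypothetical per-trait ASR output files, each tab-separated with identical row ids
asr_outputs = ['asr_trait1.tab', 'asr_trait2.tab']
combined = combine_asr_tables(asr_outputs, verbose=True)

# the first entry is the header row, the remaining entries are data rows
combined_table = LoadTable(header=combined[0], rows=combined[1:], sep='\t')
combined_table.writeToFile('combined_asr.tab', sep='\t')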
Example 2
def ace_for_picrust(tree_path,trait_table_path,method='pic',HALT_EXEC=False):
    '''Runs the Ace application controller given path of tree and trait table and returns a Table'''
    #initialize Ace app controller
    ace=Ace(HALT_EXEC=HALT_EXEC)

    tmp_output_count_path=get_tmp_filename()
    tmp_output_prob_path=get_tmp_filename()

    #quote file names
    tree_path='"{0}"'.format(tree_path)
    trait_table_path='"{0}"'.format(trait_table_path)
    
    as_string = " ".join([tree_path,trait_table_path,method,tmp_output_count_path,tmp_output_prob_path])
    #Run ace here
    result = ace(data=as_string)

    #Load the output into Table objects
    try:
        asr_table=LoadTable(filename=tmp_output_count_path,header=True,sep='\t')
    except IOError:
        raise RuntimeError("R reported an error on stderr: %s"
                           % "\n".join(result["StdErr"].readlines()))
    
    asr_prob_table=LoadTable(filename=tmp_output_prob_path,header=True,sep='\t')

    #Remove tmp files
    remove(tmp_output_count_path)
    remove(tmp_output_prob_path)

    return asr_table,asr_prob_table
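A brief, hypothetical invocation sketch: the tree and trait table paths below are placeholders, and 'pic' is the default method already used by the function above.

counts, probs = ace_for_picrust('reference_tree.newick', 'trait_table.tab', method='pic')
print counts.Header  # header of the ancestral state count table
print probs.Header   # header of the accompanying probability table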
Example 3
def make_bed_entries(mapped_read_path,
                     chrom_number,
                     feature_name,
                     output_bed_file,
                     max_read_length=None,
                     count_max_length=False,
                     strand=NULL_STRAND,
                     sep='\t',
                     is_sorted=True,
                     ui=None):
    """ translates a numpy array of mapped chromosome positions into a BED file.

    Arguments:
        - mapped_read_path: path to table containing read coordinates, frequency data
        - output_write_path: path to BED-6 format output file
        - max_read_length: maximum length of a read length
        - count_max_length: if max_read_length provided, all mapped seqs set
          to this length
        - strand: only reads from specified strand are added. Default is both.
        - sep: the delimiter in the read coordinates file
        - is_sorted: whether the read file is already sorted
    """

    data = LoadTable(mapped_read_path, sep=sep)
    assert list(data.Header) == ['start', 'length', 'strand', 'freq'],\
    "mapped read Table header doesn't match expected"

    if not is_sorted:
        data = data.sorted(columns='start')

    if count_max_length:
        assert max_read_length, 'must specify max_read_length to use'\
                                ' count_max_length'
    data = data.array.astype(int)
    total_data = data.shape[0]

    chrom = "chr%s" % (chrom_number)
    score = 30
    name = feature_name
    for i, row in enumerate(data):
        if ui is not None and i % 10 == 0:
            ui.display(
                'Converting mapped locations [%d / %d]' % (i, total_data),
                i / float(total_data))
        start = row[0]  # move from 1-based BWA calls to 0-based BED
        end = start + row[1]  # end is 1 beyond actual mapped end, so length = end - start
        strand = '+' if row[2] > 0 else '-'  # 1,-1 converted to '+','-'
        #print "\n Row values: %d\t%d\t%d\t%d\n" % (row[0], row[1], row[2], row[3])
        for counts in range(row[3]):
            bed_string = '%s\t%s\t%s\t%s\t%s\t%s\n' % \
                         (chrom, start, end, name, score, strand)
            #print bed_string
            output_bed_file.write(bed_string)

    print "Converted %d mapped locations" % (total_data)
Example 4
    def test_export_table(self):
        """correctly generates table file"""
        orig_data = dict(counts=[[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                         ranks=[0, 1, 2, 3, 4],
                         labels=['a', 'b', 'c', 'd', 'e'])
        coll = RegionCollection(**orig_data)

        expect = coll.toTable().getRawData()
        coll.writeToFile('testdata', as_table=True)
        got = LoadTable('testdata', sep='\t')
        self.assertEqual(got.getRawData(), expect)
        remove_files(['testdata'], error_on_missing=False)
Example 5
    def toTable(self):
        """builds a tab separated table for writeToFile"""
        header = []

        if self.labels is not None:
            labels = self.labels.tolist()
            header.append('gene')

        if self.ranks is not None:
            ranks = self.ranks.tolist()
            header.append('rank')

        save_data = self.counts.tolist()

        n_cols = len(save_data[0])
        window = n_cols/2
        posn = range(-window, window)
        header.extend(map(str, posn))

        for i in range(len(save_data)):
            if self.ranks is not None:
                save_data[i].insert(0, ranks[i])

            if self.labels is not None:
                save_data[i].insert(0, labels[i])
        
        out_table = LoadTable(header=header, rows=save_data, sep='\t')
        return out_table
Example 6
    def Pseudovalues(self):
        """Return a table of the Pseudovalues"""

        # if the statistics haven't been run yet.
        if self._pseudovalues is None:
            self.jackknife()

        # detailed table
        title = 'Pseudovalues'
        rows = []
        for index in range(self.n):
            row = [index]
            pseudovalues = self._pseudovalues[index]
            try:
                for value in pseudovalues:
                    row.append(value)
            except TypeError:
                row.append(pseudovalues)
            rows.append(row)

        header = ['i']
        pseudovalues = self._pseudovalues[0]

        try:
            num_datasets = len(pseudovalues)
            for i in range(num_datasets):
                header.append('Pseudovalue_%s-i' % i)
        except TypeError:
            header.append('Pseudovalue-i')

        return LoadTable(rows=rows, header=header, title=title)
Example 7
    def SubSampleStats(self):
        """Return a table of the sub-sample statistics"""

        # if the statistics haven't been run yet.
        if self._subset_statistics is None:
            self.jackknife()

        # generate table
        title = 'Subsample Stats'
        rows = []
        for index in range(self.n):
            row = []
            row.append(index)
            subset_statistics = self._subset_statistics[index]
            try:
                for value in subset_statistics:
                    row.append(value)
            except TypeError:
                row.append(subset_statistics)
            rows.append(row)

        header = ['i']
        subset_stats = self._subset_statistics[0]

        try:
            num_datasets = len(subset_stats)
            for i in range(num_datasets):
                header.append('Stat_%s-i' % i)
        except TypeError:
            header.append('Stat-i')

        return LoadTable(rows=rows, header=header, title=title)
Example 8
def main():
    rr = RunRecord('add_expression_db')
    rr.addCommands(sys.argv)

    args = script_info['args'].parse(window_title='Add Expression to DB')
    session = db_query.make_session(args.db_path)

    name = args.name
    description = args.description
    ref_file = args.expression_data
    sample_type = args.sample_type

    # Check that Sample and Reference File are both unique
    if name in db_query.get_sample_entries(session):
        rr.dieOnCritical('Sample name already exists', name)
    if ref_file in db_query.get_reffile_entries(session,
                                                reffile_name=ref_file):
        rr.dieOnCritical('ReferenceFile already loaded', ref_file)

    if sample_types[sample_type] == sample_types['abs_expr']:
        expr_table = gene_expr_to_table(
            args.expression_data,
            stable_id_label=args.gene_id_heading,
            probeset_label=args.probeset_heading,
            exp_label=args.expression_heading,
            allow_probeset_many_gene=args.allow_probeset_many_gene,
            validate=True,
            sep=args.sep)

    elif sample_types[sample_type] == sample_types['diff_expr']:
        # validation breaks with some of Rohan's diff files
        # he's included all probesets but only the mean score, once.
        expr_table = gene_expr_diff_to_table(
            args.expression_data,
            stable_id_label=args.gene_id_heading,
            probeset_label=args.probeset_heading,
            exp_label=args.expression_heading,
            sig_label=args.significance_heading,
            pval_label=args.p_value_heading,
            allow_probeset_many_gene=args.allow_probeset_many_gene,
            validate=False,
            sep=args.sep)
    elif sample_types[sample_type] == sample_types['target_genes']:
        expr_table = LoadTable(args.expression_data, sep=args.sep)

    else:
        rr.dieOnCritical('Unknown sample type', args.sample_type)

    success = add_data(session,
                       name,
                       description,
                       args.expression_data,
                       expr_table,
                       sample_type=args.sample_type,
                       reffile1=args.reffile1,
                       reffile2=args.reffile2)

    rr.addInfo(name + ' added to DB', success)
    rr.display()
Example 9
def PslToTable(data):
    """converts psl format to a table"""
    parser = MinimalPslParser(data)
    version = parser.next()
    header = parser.next()
    rows = [row for row in parser]
    table = LoadTable(header=header, rows=rows, title=version)
    return table
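A small usage sketch; 'alignment.psl' is a hypothetical BLAT/psl output file, and passing an open file object to MinimalPslParser is assumed here.

psl_table = PslToTable(open('alignment.psl'))
print psl_table.Header  # psl column names returned by the parser
print psl_table.Shape   # (number of alignments, number of columns)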
Example 10
    def _load(self, filename):
        """loads attributes from a gzipped, .npy data structure or a tab delimited
        cogent table"""
        try:
            infile = gzip.GzipFile(filename, 'r')
            data = numpy.load(infile)
            infile.close()

            # remember numpy.load() returns an array object
            # but calling .tolist() on that array returns a dict here
            data = data.tolist()
            for name in data:
                value = data[name]
                self.__dict__[name] = value
                if (name == 'ranks' or name == 'counts') and value is not None:
                    self.__dict__[name] = value.astype(float)
                if name == 'labels' and value is not None:
                    self.__dict__[name] = value.astype(str)

        except Exception as e:
            print "Trying to load from table"
            data = LoadTable(filename, sep='\t')

            # convert table to collection here.
            ls = []
            rs = []
            cs = []
            for row in data.getRawData():
                l = numpy.unicode(row[0])
                r = numpy.float(row[1])
                c = numpy.array(row[2:len(row)], dtype=numpy.float32)

                ls.append(l)
                rs.append(r)
                cs.append(c)

            self.labels = numpy.array(ls)
            self.ranks = numpy.array(rs)
            self.counts = numpy.array(cs)
        
        self.N = self.counts.shape[0]
Example 11
def update_trait_dict_from_file(table_file, header=None, input_sep="\t"):
    """Read a trait table file and return its trait names and a trait dictionary.

    table_file -- file name of a trait table.

    The first line should be a header line, with column headers equal to trait
    (e.g. gene family) names, while the row headers should be organism
    ids that match the tree.

    header -- optional list of trait names; if given, the table columns are
    checked against it and reordered to match.

    Returns (trait names, traits), where traits is a dictionary of float trait
    values keyed by organism id.
    """
    #First line should be headers
    table=LoadTable(filename=table_file,header=True,sep=input_sep)

    #do some extra stuff to match columns if a header is provided
    if header:
        #error checking to make sure traits in ASR table are a subset of traits in genome table
        if set(header) != set(table.Header[1:]):
            if set(header).issubset(set(table.Header[1:])):
                diff_traits = set(table.Header[1:]).difference(set(header))
                warn("Missing traits in given ASR table with labels:{0}. Predictions will not be produced for these traits.".format(list(diff_traits))) 
            else:
                raise RuntimeError("Given ASR trait table contains one or more traits that do not exist in given genome trait table. Predictions can not be made.")
            
        #Note: keep the first column heading (the name of the row id column) at the beginning, not sorted
        sorted_header=[table.Header[0]]
        sorted_header.extend(header)
        table = table.getColumns(sorted_header)
    
    traits = {}
    for fields in table: 
        try:
            traits[fields[0]] = map(float,fields[1:])
        except ValueError:
            err_str =\
                    "Could not convert trait table fields:'%s' to float" %(fields[1:])
            raise ValueError(err_str)
       
    return table.Header[1:],traits
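A hypothetical usage sketch: 'genome_traits.tab' stands in for a tab-delimited trait table whose first column holds organism ids and whose remaining columns hold trait values.

trait_names, traits = update_trait_dict_from_file('genome_traits.tab')
print trait_names        # trait (e.g. gene family) names from the table header
print traits.keys()[:5]  # organism ids, each mapped to a list of float trait values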
Example 13
def BowtieToTable(data, row_converter=row_converter):
    """Converts bowtie output to a table
    
    Arguments:
        - row_converter: if not provided, uses a default converter which casts
          the Offset and Other Matches fields to ints. If set to None, all
          returned data will be strings (this is faster).
    """
    parser = BowtieOutputParser(data, row_converter=row_converter)
    header = parser.next()
    rows = [row for row in parser]
    table = LoadTable(header=header, rows=rows)
    return table
Example 14
    def getMessageTable(self, last_n_lines=None, include_date=False):
        """
            Read the ChipPy.log file return as table, returning
            only the last n lines if passed an int.
        """

        log_file = open(self.log_path)
        records = []
        for line in log_file:
            line = line.strip()
            if len(line) > 0:
                if include_date:
                    records.append(line.split('\t')[0:])
                else:
                    records.append(line.split('\t')[1:])  # don't display date
        log_file.close()

        if records == []:
            return None

        if include_date:
            header = ['Date/time', 'code_block', 'level', 'message', 'value']
        else:
            header = ['code_block', 'level', 'message', 'value']

        if type(last_n_lines) is int:  # return only last n lines of log file
            try:
                table = LoadTable(header=header,
                                  rows=records[-last_n_lines:],
                                  sep='\t')
            except IndexError:
                table = None
        else:
            try:
                table = LoadTable(header=header, rows=records, sep='\t')
            except IndexError:
                table = None
        return table
Example 15
    def SummaryStats(self):
        """Return a summary table with the statistic value(s) calculated for the
        the full data-set, the jackknife statistics and standard errors."""

        # if the statistics haven't been run yet.
        if self._jackknifed_stat is None:
            self.jackknife()

        header = ['Sample Stat', 'Jackknife Stat', 'Standard Error']
        title = 'Summary Statistics'
        rows = np.vstack((self._sample_statistic, self._jackknifed_stat,
                          self._standard_error))
        rows = rows.transpose()
        return LoadTable(header=header, rows=rows, title=title)
Example 16
def tabulate(d, transpose=False, key_fun=None):
    """
    d is a dictionary, keyed by tuple(A, B).
    Goal is to put A in rows, B in columns, report data in table form.

    >>> d = {(1,'a'):3, (1,'b'):4, (2,'a'):5, (2,'b'):0}
    >>> print tabulate(d)
    ===========
    o    a    b
    -----------
    1    3    4
    2    5    0
    -----------
    >>> print tabulate(d, transpose=True)
    ===========
    o    1    2
    -----------
    a    3    5
    b    4    0
    -----------
    """
    from cogent import LoadTable

    pairs = d.keys()
    rows, cols = zip(*pairs)
    if transpose:
        rows, cols = cols, rows

    rows = sorted(set(rows))
    cols = sorted(set(cols))
    header = ["o"] + list(cols)
    table = []
    for r in rows:
        combo = [(r, c) for c in cols]
        if transpose:
            combo = [(c, r) for (r, c) in combo]
        data = [d[x] for x in combo]
        data = ["{0:.1f}".format(x) if isinstance(x, float) else x \
                    for x in data]
        if key_fun:
            data = [key_fun(x) for x in data]
        table.append([str(r)] + data)

    table = LoadTable(header=header, rows=table)

    return table
Example 17
def _get_count_sum_table_per_chrom(counts, genes, upstream_size):
    """returns table of total counts for upstream, exon, intron coords
    """
    rows = []
    header = ['region_type', 'ensembl_id', 'region_rank', 'counts', 'size']

    for gene in genes:
        # if no intron, we discard
        if len(gene.IntronCoords) == 0:
            continue

        rows += _get_exon_counts(gene, counts)
        rows += _get_intron_counts(gene, counts)
        rows += _get_upstream_counts(gene, counts, upstream_size)

    table = LoadTable(header=header, rows=rows)
    return table
Example 18
    def calcGStatistic(self, likelihoods, return_table=False):
        # A Goodness-of-fit statistic
        (self, likelihoods) = self.parallelReconstructColumns(likelihoods)

        unambig = (self.ambig == 1.0).nonzero()[0]
        observed = self.counts[unambig].astype(int)
        expected = likelihoods[unambig] * observed.sum()
        #chisq = ((observed-expected)**2 / expected).sum()
        G = 2 * observed.dot(numpy.log(observed/expected))

        if return_table:
            motifs = self.getSitePatterns(unambig)
            rows = list(zip(motifs, observed, expected))
            rows.sort(key=lambda row:(-row[1], row[0]))
            table = LoadTable(header=['Pattern', 'Observed', 'Expected'], rows=rows, row_ids=True)
            return (G, table)
        else:
            return G
Example 19
def wagner_for_picrust(tree_path,
                       trait_table_path,
                       gain=None,
                       max_paralogs=None,
                       HALT_EXEC=False):
    '''Runs count application controller given path of tree and trait table and returns a Table'''
    #initialize Count app controller
    count = Count(HALT_EXEC=HALT_EXEC)

    #set the parameters
    if gain:
        count.Parameters['-gain'].on(gain)
    if max_paralogs:
        count.Parameters['-max_paralogs'].on(max_paralogs)

    ###Have to manipulate the trait table some. Need to transpose it and strip ids surrounded in quotes.
    table = LoadTable(filename=trait_table_path, header=True, sep='\t')

    #get the first column (containing row ids)
    genome_ids = table.getRawData(table.Header[0])
    #remove single quotes from the id if they exist
    genome_ids = [str(id).strip('\'') for id in genome_ids]
    #transpose the matrix
    table = table.transposed(new_column_name=table.Header[0])
    #Change the headers
    table = table.withNewHeader(table.Header[1:], genome_ids)
    #write the modified table to a tmp file
    tmp_table_path = get_tmp_filename()
    table.writeToFile(tmp_table_path, sep='\t')

    #Run Count here
    result = count(data=(tree_path, tmp_table_path))

    #Remove tmp file
    remove(tmp_table_path)

    #tree=LoadTree(tree_path)
    tree = DndParser(open(tree_path))

    #parse the results into a Cogent Table
    asr_table = parse_wagner_parsimony_output(result["StdOut"].readlines(),
                                              remove_num_tips=len(tree.tips()))

    #transpose the table
    asr_table = asr_table.transposed(new_column_name='nodes')

    return asr_table
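A hypothetical invocation of the wrapper above; the paths are placeholders, and gain is simply forwarded to the Count application's -gain parameter as shown in the function body.

asr = wagner_for_picrust('reference_tree.newick', 'trait_table.tab', gain=1)
print asr.Header  # expected to be 'nodes' (set by the final transpose) followed by one column per trait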
Example 20
    def _get_stats(self, stat, transform=None, **kwargs):
        """returns a table for the indicated statistics"""
        if self._dists is None:
            return None
        rows = []
        for row_name in self.Names:
            row = [row_name]
            for col_name in self.Names:
                if row_name == col_name:
                    row.append('')
                    continue
                val = self._dists[(row_name, col_name)][stat]
                if transform is not None:
                    val = transform(val)
                row.append(val)
            rows.append(row)
        header = [r'Seq1 \ Seq2'] + self.Names
        table = LoadTable(header=header, rows=rows, row_ids=True,
                missing_data='*', **kwargs)
        return table
Example 21
def wagner_for_picrust(tree_path,trait_table_path,gain=None,max_paralogs=None,HALT_EXEC=False):
    '''Runs count application controller given path of tree and trait table and returns a Table'''
    #initialize Count app controller
    count=Count(HALT_EXEC=HALT_EXEC)

    #set the parameters
    if gain:
        count.Parameters['-gain'].on(gain)
    if max_paralogs:
        count.Parameters['-max_paralogs'].on(max_paralogs)

    ###Have to manipulate the trait table some. Need to transpose it and strip ids surrounded in quotes.
    table = LoadTable(filename=trait_table_path,header=True,sep='\t')

    #get the first column (containing row ids)
    genome_ids = table.getRawData(table.Header[0])
    #remove single quotes from the id if they exist
    genome_ids=[str(id).strip('\'') for id in genome_ids]
    #transpose the matrix
    table = table.transposed(new_column_name=table.Header[0])
    #Change the headers
    table=table.withNewHeader(table.Header[1:],genome_ids)
    #write the modified table to a tmp file
    tmp_table_path = get_tmp_filename()
    table.writeToFile(tmp_table_path,sep='\t')
       
    #Run Count here
    result = count(data=(tree_path,tmp_table_path))

    #Remove tmp file
    remove(tmp_table_path)

    #tree=LoadTree(tree_path)
    tree=DndParser(open(tree_path))
    
    #parse the results into a Cogent Table
    asr_table = parse_wagner_parsimony_output(result["StdOut"].readlines(),remove_num_tips=len(tree.tips()))

    #transpose the table
    asr_table = asr_table.transposed(new_column_name='nodes')

    return asr_table
Example 22
import sys
sys.path.extend(['..'])

from cogent import LoadTable
from cogent.util.unit_test import TestCase, main
from cogent.util.misc import remove_files

from chippy.parse.expr_data import _check_expr_headers, _check_diff_headers,\
        _validate_probes_scores, _remove_multimapped_probesets, _read_data_file

_sample_dump = LoadTable(header=['ENSEMBL', 'probeset', 'exp'],
        rows=[['id1',"0|1|2","13.6|13.4|13.6"],
        ['id2',"3|1","9.9|13.6"], # this gene should be lost when filtered
        ['id3',"4|5","12.7|13.4"],
        ['id4',"6","13.4"],
        ['id5',"7|8|3","6.0|6.0|4.5"],
        ['id6',"9|10|11|12","5.4|6.8|6.6|6.2"],
        ['id8',"13","12.7"],
        ['id9',"14","12.7"],
        ['id10',"15","12.7"]])

class TestExprParsing(TestCase):
    """test that excluding probesets works correctly"""

    def test_check_expr_headers(self):
        """ check that headers are identified corrects, as is the presence/
            absence of a probeset column label. Make sure it fails if the
            columns are incorrectly ordered or labelled.
        """
        header_row = ['ENSEMBL', 'probeset', 'exp']
        gene_col, probe_col, exp_col, probes_present = _check_expr_headers(
            header_row)
Example 23

def run_asr_in_parallel(tree, table, asr_method, parallel_method='sge',tmp_dir='jobs/',num_jobs=100, verbose=False):
    '''Runs the ancestral state reconstructions in parallel'''

    asr_script_fp = join(get_picrust_project_dir(),'scripts','ancestral_state_reconstruction.py')

    if(parallel_method=='sge'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_sge.py')
    elif(parallel_method=='multithreaded'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs.py')
    elif(parallel_method=='torque'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_jobs_torque.py')
    else:
        raise RuntimeError("Unknown parallel_method: %s" % parallel_method)

    if(verbose):
        print "Loading trait table..."

    #for each trait in the table, create a new tmp file with just that trait, create the job command and add it to a tmp jobs file
    table=LoadTable(filename=table, header=True, sep='\t')

    #get dimensions of the table
    dim=table.Shape

    created_tmp_files=[]
    output_files=[]
    ci_files=[]

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='jobs_asr_')
    jobs=open(jobs_fp,'w')
    created_tmp_files.append(jobs_fp)

    if(verbose):
        print "Creating temporary input files in: ",tmp_dir

    #iterate over each column
    for i in range(1,dim[1]):
        #create a new table with only a single trait
        single_col_table=table.getColumns([0,i])

        #write the new table to a tmp file
        single_col_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='in_asr_')
        single_col_table.writeToFile(single_col_fp,sep='\t')
        created_tmp_files.append(single_col_fp)

        #create tmp output files
        tmp_output_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='out_asr_')
        output_files.append(tmp_output_fp)
        tmp_ci_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='out_asr_ci_')
        ci_files.append(tmp_ci_fp)

        #create the job command
        cmd= "{0} -i {1} -t {2} -m {3} -o {4} -c {5}".format(asr_script_fp, single_col_fp, tree, asr_method, tmp_output_fp, tmp_ci_fp)

        #add the job command to the jobs file
        jobs.write(cmd+"\n")

    jobs.close()
    created_tmp_files.extend(output_files)
    created_tmp_files.extend(ci_files)

    if(verbose):
        print "Launching parallel jobs."

    #run the job command
    job_prefix='asr'
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix, num_jobs=num_jobs)

    if(verbose):
        print "Jobs are now running. Will wait until finished."

    #wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files)

    if(verbose):
        print "Jobs are done running. Now combining all tmp files."
    #Combine output files
    combined_table=combine_asr_tables(output_files)
    combined_ci_table=combine_asr_tables(ci_files)

    #create a Table object
    combined_table=Table(header=combined_table[0],rows=combined_table[1:])
    combined_ci_table=Table(header=combined_ci_table[0],rows=combined_ci_table[1:])

    #clean up all tmp files
    for file in created_tmp_files:
        remove(file)

    #return the combined table
    return combined_table,combined_ci_table
Example 24
def loadtable(header, rows):
    """thin convenience wrapper that builds a cogent Table from header and rows"""
    from cogent import LoadTable
    return LoadTable(header=header, rows=rows)
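A tiny demonstration of the wrapper above; the header and rows are arbitrary illustrative values mirroring the LoadTable(header=..., rows=...) calls used throughout this listing.

t = loadtable(['gene', 'count'], [['gene_a', 3], ['gene_b', 7]])
print t.Header        # ['gene', 'count']
print t.getRawData()  # nested list of the row values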
Example 25

def run_asr_in_parallel(tree, table, asr_method, parallel_method='sge',tmp_dir='jobs/',num_jobs=100, verbose=False):
    '''Runs the ancestral state reconstructions in parallel'''

    asr_script_fp = join(get_picrust_project_dir(),'scripts','ancestral_state_reconstruction.py')

    if(parallel_method=='sge'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_picrust_jobs_sge.py')
    elif(parallel_method=='multithreaded'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_picrust_jobs.py')
    elif(parallel_method=='torque'):
        cluster_jobs_fp=join(get_picrust_project_dir(),'scripts','start_parallel_picrust_jobs_torque.py')
    else:
        raise RuntimeError("Unknown parallel_method: %s" % parallel_method)

    if(verbose):
        print "Loading trait table..."

    #for each trait in the table, create a new tmp file with just that trait, create the job command and add it to a tmp jobs file
    table=LoadTable(filename=table, header=True, sep='\t')

    #get dimensions of the table
    dim=table.Shape

    created_tmp_files=[]
    output_files=[]
    ci_files=[]

    #create a tmp file to store the job commands (which we will pass to our parallel script to run)
    jobs_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='jobs_asr_')
    jobs=open(jobs_fp,'w')
    created_tmp_files.append(jobs_fp)

    if(verbose):
        print "Creating temporary input files in: ",tmp_dir

    #iterate over each column
    for i in range(1,dim[1]):
        #create a new table with only a single trait
        single_col_table=table.getColumns([0,i])

        #write the new table to a tmp file
        single_col_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='in_asr_')
        single_col_table.writeToFile(single_col_fp,sep='\t')
        created_tmp_files.append(single_col_fp)

        #create tmp output files
        tmp_output_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='out_asr_')
        output_files.append(tmp_output_fp)
        tmp_ci_fp=get_tmp_filename(tmp_dir=tmp_dir,prefix='out_asr_ci_')
        ci_files.append(tmp_ci_fp)

        #create the job command
        cmd= "{0} -i {1} -t {2} -m {3} -o {4} -c {5}".format(asr_script_fp, single_col_fp, tree, asr_method, tmp_output_fp, tmp_ci_fp)

        #add the job command to the jobs file
        jobs.write(cmd+"\n")

    jobs.close()
    created_tmp_files.extend(output_files)
    created_tmp_files.extend(ci_files)

    if(verbose):
        print "Launching parallel jobs."

    #run the job command
    job_prefix='asr'
    submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix, num_jobs=num_jobs)

    if(verbose):
        print "Jobs are now running. Will wait until finished."

    #wait until all jobs finished (e.g. simple poller)
    wait_for_output_files(output_files)

    if(verbose):
        print "Jobs are done running. Now combining all tmp files."
    #Combine output files
    combined_table=combine_asr_tables(output_files)
    combined_ci_table=combine_asr_tables(ci_files)

    #create a Table object
    combined_table=Table(header=combined_table[0],rows=combined_table[1:])
    combined_ci_table=Table(header=combined_ci_table[0],rows=combined_ci_table[1:])

    #clean up all tmp files
    for file in created_tmp_files:
        remove(file)

    #return the combined table
    return combined_table,combined_ci_table