def main():
    """Main application.

    Parses every CSV path given on the command line with
    ``hmmer.parse_csv`` and hands the parsed tables to ``write_counts_csv``.
    """
    # one parsed table per command-line argument, in argument order
    input_paths = sys.argv[1:]
    recarrays = list(map(hmmer.parse_csv, input_paths))

    # protein domain counts csv
    output_path = '../csv/domains/tap_domain_frequencies.csv'
    write_counts_csv(recarrays, output_path)
def classify_species(filepath, results, domains, protein_families):
    """Classifies a species using the results from hmmsearch.

    The classification is written to
    ``../csv/classifications/classification_<target>.csv`` and also stored
    in ``results`` under the target name.

    Arguments
    ---------
    filepath : str
        Path to the hmmsearch CSV table. Its base name, without extension,
        is used as the target name.
    results : dict
        Master dict of classifications; ``results[target]`` is set in place.
    domains :
        Passed through unchanged to ``load_contigs``.
    protein_families :
        Passed through unchanged to ``classify_contigs``.
    """
    recarray = hmmer.parse_csv(filepath)

    # output base filename
    target = os.path.splitext(os.path.basename(filepath))[0]

    # Load contigs with matching domains
    contigs = load_contigs(target, recarray, domains)

    filename = "classification_%s.csv" % target
    output_path = os.path.join("../csv/classifications", filename)

    # The context manager guarantees the CSV is flushed and closed, even if
    # classification fails part-way through.
    with open(output_path, 'wt') as fp:
        csv_writer = csv.writer(fp)
        csv_writer.writerow(['contig', 'family', 'type'])

        # classify contigs
        classifications = classify_contigs(contigs, protein_families)

        # collapse related contigs and write classification to csv; this
        # must happen while the file is still open
        collapsed = collapse_contig_classifications(target, classifications,
                                                    csv_writer)

    # convert to recarray and add to master dict
    datatypes = [('contig', '|S32'), ('family', '|S64'), ('type', '|S2')]
    results[target] = np.array(collapsed, dtype=datatypes)
def main():
    """Main application.

    Builds a list of parsed HMMER tables, one per command-line argument,
    and passes it to ``write_counts_csv`` together with the output path.
    """
    parsed_tables = []

    # parse csv files for each species
    for species_csv in sys.argv[1:]:
        parsed_tables.append(hmmer.parse_csv(species_csv))

    # protein domain counts csv
    write_counts_csv(
        parsed_tables,
        '../csv/domains/tap_domain_frequencies.csv',
    )
def main():
    """Main.

    Parses each CSV path given on the command line and calls
    ``hmmer.plot_evalue_histogram`` once per parsed table.
    """
    # every table is parsed up front, before any plotting starts
    tables = [hmmer.parse_csv(path) for path in sys.argv[1:]]

    for path, table in zip(sys.argv[1:], tables):
        # base name without extension doubles as the label and title prefix
        label = os.path.splitext(os.path.basename(path))[0]
        hmmer.plot_evalue_histogram(table, label, "%s TAP domain" % label)
# Example no. 5
# 0
def analyze_hmmer_table(go_terms, pfam2go, hmmer_table, go_level=1):
    """Analyzes a given HMMER3 search table for Pfam/GO term matches.

    Arguments
    ---------
    go_terms : goatools.obo_parser.GODag
        A dictionary of GO terms
    pfam2go : dict
        Pfam/GO mapping; each value is a sequence of entries whose first
        element is a key into ``go_terms``
    hmmer_table : str
        filepath to a HMMER3 search output table
    go_level : int
        (Optional) the GO level to summarize

    Returns
    -------
    dict
        Maps the name of each GO category found at ``go_level`` to the
        number of Pfam/GO associations that fall under it. The key
        ``"unknown"`` counts table rows whose Pfam domain has no entry in
        ``pfam2go``.
    """
    def _parent_rank(term):
        # Shallowest parent first; the name breaks ties so the choice is
        # deterministic even when ``parents`` is an unordered set.
        return (term.level, term.name)

    summary = {
        "unknown": 0
    }

    # parse HMMER output and iterate through its rows
    for contig in hmmer.parse_csv(hmmer_table):
        pfam_domain = contig[2]

        # check to see if domain is in Pfam2go
        if pfam_domain not in pfam2go:
            summary['unknown'] += 1
            continue

        # otherwise walk the associated GO terms
        for t in pfam2go[pfam_domain]:
            node = go_terms[t[0]]

            # Climb to the ancestor at the desired level. A term's level is
            # its shortest distance to the root, so only the shallowest
            # parent is guaranteed to be exactly one level up; an arbitrary
            # parent may lie on a longer path and stop above go_level.
            # Terms already at or above go_level are counted as themselves.
            while node.level > go_level and node.parents:
                node = min(node.parents, key=_parent_rank)

            # add to summary table
            category = node.name
            summary[category] = summary.get(category, 0) + 1

    return summary
# Example no. 6
# 0
def main():
    """Main.

    For every HMMER CSV given on the command line, collects the set of
    ``query_name`` values, prints a one-line count summary, and finally
    calls ``write_correlation_csv`` twice on the collected sets (once with
    defaults, once with ``True`` as the second argument).
    """
    # species name -> set of query names seen in that species' table
    targets = {}

    for csv_path in sys.argv[1:]:
        # species name is the file's base name without its extension
        base = os.path.basename(csv_path)
        species = os.path.splitext(base)[0]

        # parse HMMER output and keep only the distinct query names
        hits = hmmer.parse_csv(csv_path)
        unique_queries = set(hits['query_name'])
        targets[species] = unique_queries

        # output basic statistics
        stats = (species, hits.shape[0], len(unique_queries))
        print("(%s) # Domains: %d (%d unique)" % stats)

    # pairwise comparisons
    write_correlation_csv(targets)
    write_correlation_csv(targets, True)
def main():
    """Main.

    Reads each HMMER CSV named on the command line, records the distinct
    ``query_name`` values per input file, reports the row and distinct
    counts, then runs ``write_correlation_csv`` with and without its
    second argument set to ``True``.
    """
    targets = {}
    input_files = sys.argv[1:]

    # read in files
    for path in input_files:
        # species name: file name stripped of directory and extension
        name, _ = os.path.splitext(os.path.basename(path))

        # parse HMMER output and store in data dict
        table = hmmer.parse_csv(path)
        targets[name] = set(table['query_name'])

        # output basic statistics
        total = table.shape[0]
        distinct = len(targets[name])
        print("(%s) # Domains: %d (%d unique)" % (name, total, distinct))

    # pairwise comparisons
    write_correlation_csv(targets)
    write_correlation_csv(targets, True)