def classify_species(filepath, results, domains, protein_families):
    """Classifies a species using the results from hmmsearch.

    The parsed hmmsearch table is turned into contigs, the contigs are
    classified, and the collapsed classifications are both written to a
    CSV file and stored in ``results``.

    Arguments
    ---------
    filepath : str
        Path to the hmmsearch output table. Its base name without the
        extension is used as the target name.
    results : dict
        Master dict, updated in place: ``results[target]`` is set to a
        NumPy structured array with fields ``contig``, ``family`` and
        ``type``.
    domains :
        Passed through to ``load_contigs``.
    protein_families :
        Passed through to ``classify_contigs``.

    Returns
    -------
    None
        Output goes to ``results`` and to
        ``../csv/classifications/classification_<target>.csv`` (the path
        is resolved relative to the current working directory).
    """
    recarray = hmmer.parse_csv(filepath)

    # output base filename
    target = os.path.splitext(os.path.basename(filepath))[0]

    # Load contigs with matching domains
    contigs = load_contigs(target, recarray, domains)

    filename = "classification_%s.csv" % target
    outfile = os.path.join("../csv/classifications", filename)

    # The context manager guarantees the CSV is flushed and closed even if
    # classification or collapsing raises part-way through.
    with open(outfile, 'wt') as fp:
        csv_writer = csv.writer(fp)
        csv_writer.writerow(['contig', 'family', 'type'])

        # classify contigs
        classifications = classify_contigs(contigs, protein_families)

        # collapse related contigs and write classification to csv
        collapsed = collapse_contig_classifications(target, classifications,
                                                    csv_writer)

    # convert to recarray and add to master dict
    datatypes = [('contig', '|S32'), ('family', '|S64'), ('type', '|S2')]
    results[target] = np.array(collapsed, dtype=datatypes)
def analyze_hmmer_table(go_terms, pfam2go, hmmer_table, go_level=1):
    """Tallies GO categories for the Pfam hits in a HMMER3 search table.

    Arguments
    ---------
    go_terms : goatools.obo_parser.GODag
        A dictionary of GO terms
    pfam2go : dict
        Pfam/GO mapping; each value is iterated and the first item of
        every entry is used as the key into ``go_terms``
    hmmer_table : str
        filepath to a HMMER3 search output table
    go_level : int
        (Optional) the GO level to summarize

    Returns
    -------
    dict
        Maps category name to the number of times it was counted. The
        "unknown" key counts rows whose domain is not in ``pfam2go``.
    """
    counts = {"unknown": 0}

    for row in hmmer.parse_csv(hmmer_table):
        # the third field of each parsed row is used as the Pfam domain key
        domain = row[2]

        if domain in pfam2go:
            for entry in pfam2go[domain]:
                term = go_terms[entry[0]]

                # follow the first parent once per level above go_level;
                # terms already at or below that level are left unchanged
                ancestor = term
                for _ in range(term.level - go_level):
                    ancestor = ancestor.parents[0]

                label = ancestor.name
                counts[label] = counts.get(label, 0) + 1
        else:
            # no GO mapping for this domain
            counts['unknown'] += 1

    return counts