def prepare_fasta(id, _run_id, load_ids, species, seq_type, molecule_type, genome_type, outdir):
    """
    Write one FASTA file per load, adapted for OrthoFinder.

    For every load ID, the sequences returned by `database.load_seqs` are
    written to `fastas_<_run_id>/<taxon>.fa`, where `<taxon>` is
    `species[load_id].name`. Each sequence is labelled `<taxon>@<record id>`.

    The number of loads and sequences is logged to diagnostics, and the
    names 'fasta_dir' and 'fasta' are passed to `ingest`.

    Args:
        id: Not used in this function.
        _run_id: Run identifier, used to name the output directory.
        load_ids: Iterable of load IDs to export.
        species: Mapping from load ID to an object with a `name` attribute.
        seq_type, molecule_type, genome_type: Forwarded unchanged to
            `database.load_seqs`.
        outdir: Not used in this function.
    """
    # NOTE(review): `ingest` is given local variable *names*; presumably it
    # resolves them against this function's locals, so `fasta_dir` and
    # `fasta` must keep these exact names -- confirm against ingest().
    fasta_dir = utils.safe_mkdir('fastas_%s' % _run_id)

    nloads = 0
    nseqs = 0
    for nloads, load_id in enumerate(load_ids, 1):
        taxon = species[load_id].name
        # `fasta` is rebound on each pass; after the loop it holds the path
        # of the file written for the last load.
        fasta = os.path.join(fasta_dir, taxon + '.fa')
        with open(fasta, 'w') as handle:
            entries = database.load_seqs(
                load_id, taxon, seq_type, molecule_type, genome_type)
            for entry in entries:
                label = "@".join((taxon, str(entry.id)))
                nseqs += 1
                utils.write_fasta(handle, entry.seq, label)

    if nseqs == 0:
        utils.die("no sequences were written to the FASTA file")

    diagnostics.log('nloads', nloads)
    diagnostics.log('nseqs', nseqs)

    ingest('fasta_dir', 'fasta')
def lookup_species(load_ids):  # Taken directly from homologize.py
    """
    Look up the species data for each run.

    For every load ID, one row is fetched from the `catalog` and `runs`
    tables (joined on `catalog.id = runs.id`, filtered by `runs.run_id`).
    The row is wrapped in a `SpeciesData` and stored in a dict keyed by
    load ID. Each row, and finally the whole dict, is logged to diagnostics,
    and the name 'species' is passed to `ingest`.

    `utils.die` is called when no row is found for a load ID, or when the
    species name column of the row is NULL.

    Args:
        load_ids: Iterable of run IDs to look up.
    """
    # Columns are selected in the order SpeciesData receives them:
    #
    # name        The species name          (catalog.species)
    # ncbi_id     The NCBI taxon id         (catalog.ncbi_id)
    # itis_id     The ITIS taxon id         (catalog.itis_id)
    # catalog_id  The agalma catalog id     (catalog.id)
    #
    # NOTE(review): the field names above come from the original comment;
    # SpeciesData itself is defined outside this block -- confirm.
    query = """
        SELECT catalog.species, catalog.ncbi_id, catalog.itis_id, catalog.id
        FROM catalog, runs
        WHERE runs.run_id=? AND catalog.id=runs.id;"""

    species = {}
    for load_id in load_ids:
        row = biolite.database.execute(query, (load_id,)).fetchone()
        if not row:
            utils.die("Couldn't find species data for run ID %s" % load_id)
        name, catalog_id = row[0], row[3]
        if name is None:
            utils.die("Species name is empty for catalog ID '%s'" % catalog_id)
        species[load_id] = SpeciesData(*row)
        diagnostics.log(str(load_id), row)

    diagnostics.log("species", species)

    ingest('species')
def load_orthofinder_cluster(_run_id, ogfile):
    """
    Load cluster file from mcl (via orthofinder) into homology database.

    Each line of `ogfile` is treated as one cluster. The line is split on
    '@' and whitespace, and only the all-digit tokens are kept as sequence
    IDs. Clusters with at least 4 sequence IDs are inserted into the
    `homology` table, one row per sequence. Smaller clusters are only
    counted in the size histogram.

    Any existing `homology` rows for `_run_id` are deleted first. The delete
    and the inserts are issued between an explicit BEGIN and COMMIT.

    The cluster-size histogram is reported through `utils.info`, and both
    the histogram and the number of stored sequences are logged to
    diagnostics.

    Args:
        _run_id: Run identifier stored in `homology.run_id`.
        ogfile: Path to the cluster file, one cluster per line.
    """
    # Compiled once: the same pattern is applied to every line.
    separators = re.compile(r'[@\s]')

    nseqs = 0
    hist = {}

    database.execute("BEGIN")
    database.execute("DELETE FROM homology WHERE run_id=?;", (_run_id,))
    with open(ogfile, 'r') as f:
        cluster_id = 0
        for line in f:
            # Build a real list: on Python 3 filter() returns an iterator,
            # which has no len() and could only be consumed once.
            cluster = [s for s in separators.split(line) if s.isdigit()]
            n = len(cluster)
            hist[n] = hist.get(n, 0) + 1
            if n >= 4:
                nseqs += n
                for seq_id in cluster:
                    database.execute("""
                        INSERT INTO homology (run_id, component_id, sequence_id)
                        VALUES (?,?,?);""",
                        (_run_id, cluster_id, seq_id))
                # NOTE(review): incremented only for stored clusters, so
                # component IDs are consecutive -- confirm this nesting
                # against the original indentation.
                cluster_id += 1
    database.execute("COMMIT")

    utils.info(
        "histogram of gene cluster sizes:\n",
        '\n '.join("%d\t:\t%d" % (k, hist[k]) for k in sorted(hist)))

    diagnostics.log('nseqs', nseqs)
    diagnostics.log('histogram', hist)
def load_mcl_cluster(_run_id, cluster_file):
    """
    Load cluster file from mcl into homology database.

    Each line of `cluster_file` is treated as one cluster of
    whitespace-separated tokens. Only tokens whose first character is a
    digit are kept as sequence IDs. Clusters with at least 4 sequence IDs
    are inserted into the `homology` table, one row per sequence. Smaller
    clusters are only counted in the size histogram.

    Any existing `homology` rows for `_run_id` are deleted first. The delete
    and the inserts are issued between an explicit BEGIN and COMMIT.

    The cluster-size histogram is reported through `utils.info`, and both
    the histogram and the number of stored sequences are logged to
    diagnostics.

    Args:
        _run_id: Run identifier stored in `homology.run_id`.
        cluster_file: Path to the mcl cluster file, one cluster per line.
    """
    nseqs = 0
    hist = {}

    database.execute("BEGIN")
    database.execute("DELETE FROM homology WHERE run_id=?;", (_run_id,))
    with open(cluster_file, "r") as f:
        cluster_id = 0
        for line in f:
            # Build a real list: on Python 3 filter() returns an iterator,
            # which has no len() and could only be consumed once.
            # split() with no argument never yields empty tokens, so s[0]
            # is always safe.
            cluster = [s for s in line.rstrip().split() if s[0].isdigit()]
            n = len(cluster)
            hist[n] = hist.get(n, 0) + 1
            if n >= 4:
                nseqs += n
                for seq_id in cluster:
                    database.execute(
                        """
                        INSERT INTO homology (run_id, component_id, sequence_id)
                        VALUES (?,?,?);""",
                        (_run_id, cluster_id, seq_id),
                    )
                # NOTE(review): incremented only for stored clusters, so
                # component IDs are consecutive -- confirm this nesting
                # against the original indentation.
                cluster_id += 1
    database.execute("COMMIT")

    utils.info(
        "histogram of gene cluster sizes:\n",
        "\n ".join("%d\t:\t%d" % (k, hist[k]) for k in sorted(hist)),
    )

    diagnostics.log("nseqs", nseqs)
    diagnostics.log("histogram", hist)
def write_fasta(id, _run_id, load_ids, species, seq_type, molecule_type, genome_type, outdir):
    """
    Write sequences from the Agalma database to a FASTA file.

    All sequences returned by `database.load_seqs` for every load are
    written to a single file, `allvall_blast_<id>_<_run_id>/all.fa`, using
    the record ID as the FASTA identifier. In parallel, a tab-separated
    `nodes.txt` is written into `outdir`: a header line followed by one line
    per sequence.

    The nodes path and the load, sequence and base counts are logged to
    diagnostics, and the names 'blast_dir', 'fasta' and 'nodes' are passed
    to `ingest`. `utils.die` is called if no sequence was written.

    Args:
        id: Identifier used, with `_run_id`, to name the output directory.
        _run_id: Run identifier used to name the output directory.
        load_ids: Iterable of load IDs to export.
        species: Mapping from load ID to an object with a `name` attribute.
        seq_type, molecule_type, genome_type: Forwarded unchanged to
            `database.load_seqs`.
        outdir: Directory in which `nodes.txt` is created.
    """
    # NOTE(review): `ingest` is given local variable *names*; presumably it
    # resolves them against this function's locals, so `blast_dir`, `fasta`
    # and `nodes` must keep these exact names -- confirm against ingest().
    blast_dir = utils.safe_mkdir("allvall_blast_%s_%s" % (id, _run_id))
    fasta = os.path.join(blast_dir, "all.fa")

    # The nodes file contains a header, which describes the attributes, as
    # well as a line for every node (transcript) in the analysis.
    nodes = os.path.join(outdir, "nodes.txt")

    nloads = 0
    nseqs = 0
    nbases = 0

    with open(nodes, "w") as fnodes, open(fasta, "w") as ffasta:
        # file.write() with an explicit newline replaces the Python 2-only
        # `print >> fnodes` statement; the bytes written are the same.
        fnodes.write("label\tid\tassembly\tassembly_number\n")
        for load_id in load_ids:
            nloads += 1
            taxon = species[load_id].name
            for record in database.load_seqs(
                    load_id, taxon, seq_type, molecule_type, genome_type):
                nseqs += 1
                nbases += len(record.seq)
                # The id of the node (the second column) is a unique
                # identifier and because we already have one in the table,
                # use that here. The last column is the 1-based position of
                # the load in `load_ids`.
                fnodes.write(
                    "%s\t%d\t%s\t%d\n"
                    % (record.header, record.id, load_id, nloads))
                utils.write_fasta(ffasta, record.seq, record.id)

    if not nseqs:
        utils.die("no sequences were written to the FASTA file")

    diagnostics.log_path(nodes, "nodes")
    diagnostics.log("nloads", nloads)
    diagnostics.log("nseqs", nseqs)
    diagnostics.log("nbases", nbases)

    ingest("blast_dir", "fasta", "nodes")
def parse_edges(_run_id, seq_type, blast_hits, min_overlap, min_bitscore, min_nodes):
    """
    Parse BLAST hits into edges weighted by bitscore.

    Every file matching the glob pattern `blast_hits` is read line by line.
    Each line must hold five whitespace-separated fields:
    `qseqid sseqid bitscore qlen length`. A hit becomes a graph edge when
    all of the following hold:

    * the query and subject IDs differ (not a self hit);
    * `length / qlen` is strictly greater than `min_overlap`;
    * the score ratio value (the hit's bitscore divided by the bitscore of
      the first hit seen for that query) is strictly greater than
      `min_bitscore`.

    If an edge between the two nodes already exists, its score becomes the
    larger of the two bitscores. The edges of every connected component with
    at least `min_nodes` nodes are then written to
    `allvall_edges_<_run_id>.abc` as `from<TAB>to<TAB>score` lines.

    The filter counters and the number of nodes written are logged to
    diagnostics, and the name 'edge_file' is passed to `ingest`.
    `utils.die` is called if no component was large enough to be written.

    Args:
        _run_id: Run identifier used to name the edge file.
        seq_type: Not used in this function (see comment in the body).
        blast_hits: Glob pattern matching the tabular BLAST hit files.
        min_overlap: Exclusive lower bound on `length / qlen`.
        min_bitscore: Exclusive lower bound on the score ratio value.
        min_nodes: Minimum number of nodes for a component to be written.
    """
    graph = nx.Graph()
    # NOTE(review): `ingest` is given the local variable *name*; presumably
    # it resolves it against this function's locals, so `edge_file` must
    # keep this exact name -- confirm against ingest().
    edge_file = "allvall_edges_%s.abc" % _run_id
    nseqs = 0
    nedges = {
        "all": 0,
        "non-self": 0,
        "passed-overlap": 0,
        "passed-bitscore": 0,
        "passed-bitscore-unique": 0,
    }
    max_bitscore = None
    last_query = ""  # For identification of a new query in the table

    for path in glob(blast_hits):
        # `with` closes each hit file as soon as it has been read; the bare
        # open() used previously leaked one handle per matched file.
        with open(path) as hits:
            for line in hits:
                # fields:
                # qseqid sseqid bitscore qlen length
                id_from, id_to, bitscore, qlen, length = line.rstrip().split()

                # Sometimes blast outputs a bad query id if the query is
                # longer than 10Kb
                if id_from.startswith("Query_"):
                    utils.info("discarding bad hit with query id '%s'" % id_from)
                    continue

                bitscore = float(bitscore)
                # From here on `length` is the overlap proportion (`length`
                # field over `qlen`), not a residue count. No nucleotide
                # vs. amino-acid (x3) correction is applied, which is why
                # `seq_type` is unused.
                length = float(length) / float(qlen)

                # Filter out self hits, low scoring hits, and short hits
                nedges["all"] += 1

                if id_from != last_query:
                    # Per the original author: the self-hit is not always
                    # at the top, as the file is sorted by e-value and ties
                    # happen; however the self-hit always has the same
                    # bitscore as the first hit, so the first hit of a query
                    # is used as the reference score.
                    max_bitscore = bitscore
                    last_query = id_from

                if id_from != id_to:
                    nedges["non-self"] += 1
                    if length > min_overlap:
                        nedges["passed-overlap"] += 1
                        # SRV: BLAST Score Ratio Values. Blom et al.
                        # BMC Bioinformatics 2009, 10:154
                        # doi:10.1186/1471-2105-10-154
                        srv = float(bitscore) / float(max_bitscore)
                        if srv > min_bitscore:
                            nedges["passed-bitscore"] += 1
                            # If an edge already exists between the nodes,
                            # update its score to be the max
                            if graph.has_edge(id_from, id_to):
                                e = graph.edge[id_from][id_to]
                                e["score"] = max(e["score"], bitscore)
                            else:
                                nedges["passed-bitscore-unique"] += 1
                                graph.add_node(id_from)
                                graph.add_node(id_to)
                                graph.add_edge(id_from, id_to, score=bitscore)

    diagnostics.prefix.append("nedges")
    diagnostics.log_dict(nedges)
    diagnostics.prefix.pop()

    with open(edge_file, "w") as out:
        for subgraph in nx.connected_component_subgraphs(graph):
            nnodes = subgraph.number_of_nodes()
            if nnodes >= min_nodes:
                nseqs += nnodes
                for id_from, id_to in subgraph.edges_iter():
                    # file.write() with an explicit newline replaces the
                    # Python 2-only `print >> f` statement; the bytes
                    # written are the same.
                    out.write(
                        "%s\t%s\t%f\n"
                        % (id_from, id_to, subgraph.edge[id_from][id_to]["score"]))

    if not nseqs:
        # NOTE(review): the message mentions the FASTA file although this
        # stage writes the edge file; text left unchanged in case callers
        # or logs match on it.
        utils.die("no sequences were written to the FASTA file")

    diagnostics.log("nseqs", nseqs)

    ingest("edge_file")