def prepare_fasta(id, _run_id, load_ids, species, seq_type, molecule_type,
                  genome_type, outdir):
    """
    Write one FASTA file per load, labelled for OrthoFinder.

    For every entry of `load_ids` a file named ``<taxon>.fa`` is created
    inside a ``fastas_<_run_id>`` directory, where ``<taxon>`` is
    ``species[load_id].name``. Each sequence returned by
    ``database.load_seqs`` is written with the identifier
    ``<taxon>@<record.id>``.

    Args:
        id: Not referenced by this function.
        _run_id: Interpolated into the output directory name.
        load_ids: Iterable of load identifiers; each is used as a key
            into `species` and passed to ``database.load_seqs``.
        species: Mapping from load id to an object with a ``name``
            attribute.
        seq_type, molecule_type, genome_type: Forwarded unchanged to
            ``database.load_seqs``.
        outdir: Not referenced by this function.

    Calls ``utils.die`` when no sequence at all was written, then logs the
    ``nloads`` and ``nseqs`` counters through ``diagnostics.log``.
    """
    fasta_dir = utils.safe_mkdir("fastas_%s" % _run_id)

    nloads = 0
    nseqs = 0

    # enumerate() leaves `nloads` at the number of loads processed, and at
    # its initial 0 when `load_ids` is empty.
    for nloads, load_id in enumerate(load_ids, 1):
        taxon = species[load_id].name
        # NOTE(review): the file name depends only on the taxon name, so two
        # loads whose species share a name write to the same path and the
        # later one truncates the earlier one.
        fasta = os.path.join(fasta_dir, taxon + ".fa")
        prefix = taxon + "@"
        with open(fasta, "w") as ffasta:
            records = database.load_seqs(
                load_id, taxon, seq_type, molecule_type, genome_type)
            for record in records:
                utils.write_fasta(ffasta, record.seq, prefix + str(record.id))
                nseqs += 1

    if nseqs == 0:
        utils.die("no sequences were written to the FASTA file")

    diagnostics.log("nloads", nloads)
    diagnostics.log("nseqs", nseqs)

    # ingest() receives variable *names*; presumably it resolves them from
    # this function's locals (confirm), so `fasta_dir` and `fasta` must keep
    # these exact names. `fasta` holds the path of the last file written.
    ingest("fasta_dir", "fasta")
def write_fasta(id, _run_id, load_ids, species, seq_type, molecule_type,
                genome_type, outdir):
    """
    Write sequences from the Agalma database to a FASTA file.

    All sequences of all `load_ids` go into a single ``all.fa`` inside an
    ``allvall_blast_<id>_<_run_id>`` directory. Alongside it, a
    tab-separated ``nodes.txt`` is written to `outdir` with one header line
    and one row per sequence.

    Args:
        id: Interpolated into the BLAST directory name.
        _run_id: Interpolated into the BLAST directory name.
        load_ids: Iterable of load identifiers; each is used as a key
            into `species` and passed to ``database.load_seqs``.
        species: Mapping from load id to an object with a ``name``
            attribute.
        seq_type, molecule_type, genome_type: Forwarded unchanged to
            ``database.load_seqs``.
        outdir: Directory in which ``nodes.txt`` is created.

    Calls ``utils.die`` when no sequence at all was written, then logs the
    nodes path and the ``nloads``, ``nseqs`` and ``nbases`` counters through
    ``diagnostics``.
    """
    blast_dir = utils.safe_mkdir("allvall_blast_%s_%s" % (id, _run_id))
    fasta = os.path.join(blast_dir, "all.fa")

    # The nodes file contains a header, which describes the attributes, as well
    # as a line for every node (transcript) in the analysis.
    nodes = os.path.join(outdir, "nodes.txt")

    nloads = 0
    nseqs = 0
    nbases = 0

    with open(nodes, "w") as fnodes, open(fasta, "w") as ffasta:
        # Explicit write() instead of the Python 2-only `print >> file`
        # chevron form: identical output on Python 2, and it keeps working on
        # Python 3, where the chevron form raises TypeError at runtime.
        fnodes.write("label\tid\tassembly\tassembly_number" + "\n")

        # `nloads` doubles as the 1-based ordinal of the current load, which
        # is written to the `assembly_number` column.
        for nloads, load_id in enumerate(load_ids, 1):
            taxon = species[load_id].name
            records = database.load_seqs(
                load_id, taxon, seq_type, molecule_type, genome_type)
            for record in records:
                nseqs += 1
                nbases += len(record.seq)
                # The id of the node (the second column) is a unique
                # identifier and because we already have one in the table,
                # use that here.
                row = "%s\t%d\t%s\t%d" % (
                    record.header, record.id, load_id, nloads)
                fnodes.write(row + "\n")
                utils.write_fasta(ffasta, record.seq, record.id)

    if not nseqs:
        utils.die("no sequences were written to the FASTA file")

    diagnostics.log_path(nodes, "nodes")
    diagnostics.log("nloads", nloads)
    diagnostics.log("nseqs", nseqs)
    diagnostics.log("nbases", nbases)

    # ingest() receives variable *names*; presumably it resolves them from
    # this function's locals (confirm), so `blast_dir`, `fasta` and `nodes`
    # must keep these exact names.
    ingest("blast_dir", "fasta", "nodes")