Esempio n. 1
0
def prepare_fasta(id, _run_id, load_ids, species, seq_type,
                  molecule_type, genome_type, outdir):
    """
    Write one FASTA file per load, with sequence names adapted for OrthoFinder.

    A directory named ``fastas_<_run_id>`` is created through
    ``utils.safe_mkdir``. For every entry of `load_ids`, the sequences
    returned by ``database.load_seqs`` are written to ``<taxon>.fa`` inside
    that directory, where the taxon is ``species[load_id].name``. Each
    sequence is renamed to ``<taxon>@<record.id>``.

    The run is aborted through ``utils.die`` when no sequence at all was
    written. Otherwise the number of loads and of sequences is logged to
    diagnostics and ``fasta_dir`` / ``fasta`` are handed to ``ingest``.

    `id` and `outdir` are accepted but not used by this function.
    """
    # NOTE(review): `fasta_dir` and `fasta` are passed to ingest() below as
    # *names*, so ingest presumably looks them up among this function's
    # locals -- do not rename them. `fasta` ends up holding the path of the
    # last file written.
    fasta_dir = utils.safe_mkdir('fastas_%s' % _run_id)

    nloads = 0
    nseqs = 0

    # enumerate() keeps the running count of loads; it stays 0 when
    # `load_ids` is empty.
    for nloads, load_id in enumerate(load_ids, 1):
        taxon = species[load_id].name
        fasta_name = taxon + '.fa'
        fasta = os.path.join(fasta_dir, fasta_name)
        with open(fasta, 'w') as ffasta:
            sequences = database.load_seqs(
                load_id, taxon, seq_type, molecule_type, genome_type)
            for rec in sequences:
                # Prefix the record id with the taxon name.
                tagged_id = "@".join((taxon, str(rec.id)))
                nseqs += 1
                utils.write_fasta(ffasta, rec.seq, tagged_id)

    if nseqs == 0:
        utils.die("no sequences were written to the FASTA file")

    diagnostics.log('nloads', nloads)
    diagnostics.log('nseqs', nseqs)

    ingest('fasta_dir', 'fasta')
Esempio n. 2
0
def write_fasta(id, _run_id, load_ids, species, seq_type, molecule_type, genome_type, outdir):
    """
    Write sequences from the Agalma database to a FASTA file.

    Two files are produced:

    * ``allvall_blast_<id>_<_run_id>/all.fa`` -- every sequence returned by
      ``database.load_seqs`` for each entry of `load_ids`, named by its
      ``record.id``.
    * ``<outdir>/nodes.txt`` -- a tab-separated table with a header line and
      one line per sequence: ``label``, ``id``, ``assembly``,
      ``assembly_number`` (the 1-based position of the load in `load_ids`).

    The run is aborted through ``utils.die`` when no sequence was written.
    Otherwise the nodes path and the counts of loads, sequences and bases
    are logged to diagnostics, and ``blast_dir``, ``fasta`` and ``nodes``
    are handed to ``ingest``.
    """
    # NOTE(review): `blast_dir`, `fasta` and `nodes` are passed to ingest()
    # below as *names*, so ingest presumably looks them up among this
    # function's locals -- do not rename them.
    blast_dir = utils.safe_mkdir("allvall_blast_%s_%s" % (id, _run_id))
    fasta = os.path.join(blast_dir, "all.fa")

    # The nodes file contains a header, which describes the attributes, as well
    # as a line for every node (transcript) in the analysis.
    nodes = os.path.join(outdir, "nodes.txt")

    nloads = 0
    nseqs = 0
    nbases = 0

    with open(nodes, "w") as fnodes, open(fasta, "w") as ffasta:
        # Use file.write() rather than the Python 2-only `print >> f` form,
        # which raises TypeError on Python 3. The explicit "\n" reproduces
        # the newline that print appended.
        fnodes.write("label\tid\tassembly\tassembly_number\n")
        for nloads, load_id in enumerate(load_ids, 1):
            taxon = species[load_id].name
            for record in database.load_seqs(load_id, taxon, seq_type, molecule_type, genome_type):
                nseqs += 1
                nbases += len(record.seq)
                # The id of the node (the second column) is a unique identifier
                # and because we already have one in the table, use that here.
                fnodes.write(
                    "%s\t%d\t%s\t%d\n" % (record.header, record.id, load_id, nloads))
                utils.write_fasta(ffasta, record.seq, record.id)

    if not nseqs:
        utils.die("no sequences were written to the FASTA file")

    diagnostics.log_path(nodes, "nodes")

    diagnostics.log("nloads", nloads)
    diagnostics.log("nseqs", nseqs)
    diagnostics.log("nbases", nbases)

    ingest("blast_dir", "fasta", "nodes")