Example #1
def prepare_fasta(id, _run_id, load_ids, species, seq_type,
        molecule_type, genome_type, outdir):
    """ Write fasta file adapted for OrthoFinder """


    fasta_dir = utils.safe_mkdir('fastas_%s' % (_run_id))


    nloads = 0
    nseqs = 0

    for load_id in load_ids:
        nloads += 1
        taxon = species[load_id].name
        fasta = os.path.join(fasta_dir, taxon + '.fa')
        with open(fasta, 'w') as ffasta:
            for record in database.load_seqs(
                    load_id, taxon, seq_type, molecule_type, genome_type):
                newid = taxon + "@" + str(record.id)
                nseqs += 1
                utils.write_fasta(ffasta, record.seq, newid)
    if not nseqs:
        utils.die("no sequences were written to the FASTA file")
    diagnostics.log('nloads', nloads)
    diagnostics.log('nseqs', nseqs)
    ingest('fasta_dir', 'fasta')
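
For reference, a minimal standalone sketch of the taxon + "@" + id naming convention used above. write_record and the sample data are hypothetical stand-ins for agalma's utils.write_fasta and its database records; OrthoFinder expects one FASTA file per species with unique sequence ids.

def write_record(handle, seq, seq_id, width=80):
    """Write one FASTA record, wrapping the sequence at a fixed width."""
    handle.write(">%s\n" % seq_id)
    for i in range(0, len(seq), width):
        handle.write(seq[i:i + width] + "\n")

with open("Nematostella_vectensis.fa", "w") as ffasta:
    # OrthoFinder-friendly id: species name, '@', then the database record id
    write_record(ffasta, "MSTNPKPQRKTKRNTNRRPQ", "Nematostella_vectensis@12345")
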
Example #2
def lookup_species(load_ids):  # Taken directly from homologize.py
    """Lookup the species data for each run"""

    species = {}

    # For each load_id, species maps to a SpeciesData named tuple with the
    # following fields:
    #
    # name          The species name
    # ncbi_id       The NCBI taxon id
    # itis_id       The ITIS taxon id
    # catalog_id    The agalma catalog id

    for load_id in load_ids:
        row = biolite.database.execute("""
            SELECT catalog.species, catalog.ncbi_id, catalog.itis_id, catalog.id
            FROM catalog, runs
            WHERE runs.run_id=? AND catalog.id=runs.id;""",
            (load_id,)).fetchone()
        if not row:
            utils.die("Couldn't find species data for run ID %s" % load_id)
        if row[0] is None:
            utils.die("Species name is empty for catalog ID '%s'" % row[3])
        species[load_id] = SpeciesData(*row)
        diagnostics.log(str(load_id), row)

    diagnostics.log("species", species)
    ingest('species')
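
The SpeciesData named tuple is assumed to be defined elsewhere in the project; a minimal sketch of what lookup_species relies on, with the field order mirroring the SELECT columns (species name, NCBI taxon id, ITIS taxon id, catalog id). The sample row below is a placeholder, not real catalog data.

from collections import namedtuple

SpeciesData = namedtuple("SpeciesData", "name ncbi_id itis_id catalog_id")

row = ("Nematostella vectensis", 45351, None, "NV-1")  # placeholder values
species = {101: SpeciesData(*row)}
print(species[101].name)  # -> Nematostella vectensis
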
def prepare_blast(blast_dir, fasta, seq_type, genesets):
    """Prepare all-by-all BLAST database and command list"""

    db = os.path.join(blast_dir, "db")

    if genesets:
        if seq_type == "nucleotide":
            program = "blastx"
        else:
            program = "blastp"
        for geneset in genesets:
            utils.cat_to_file(geneset, db + ".fa")
            # Also add genesets to the query file, to find overlapping genes
            # between sets.
            utils.cat_to_file(geneset, fasta)
        wrappers.MakeBlastDB(db + ".fa", db, "prot")
    else:
        if (seq_type == "masked_protein") or (seq_type == "protein"):
            dbtype = "prot"
            program = "blastp"
        elif seq_type == "nucleotide":
            dbtype = "nucl"
            program = "blastn"
        else:
            utils.die("unrecognized sequence type '%s'" % seq_type)
        wrappers.MakeBlastDB(fasta, db, dbtype)

    command = program + " -db db -evalue 1e-20" + ' -outfmt "6 qseqid sseqid bitscore qlen length"'
    commands = workflows.blast.split_query(fasta, command, 100000, blast_dir)

    ingest("commands")
def write_fasta(id, _run_id, load_ids, species, seq_type, molecule_type, genome_type, outdir):
    """Write sequences from the Agalma database to a FASTA file"""

    blast_dir = utils.safe_mkdir("allvall_blast_%s_%s" % (id, _run_id))
    fasta = os.path.join(blast_dir, "all.fa")

    # The nodes file contains a header, which describes the attributes, as well
    # as a line for every node (transcript) in the analysis.
    nodes = os.path.join(outdir, "nodes.txt")

    nloads = 0
    nseqs = 0
    nbases = 0

    with open(nodes, "w") as fnodes, open(fasta, "w") as ffasta:
        print >> fnodes, "label\tid\tassembly\tassembly_number"
        for load_id in load_ids:
            nloads += 1
            taxon = species[load_id].name
            for record in database.load_seqs(load_id, taxon, seq_type, molecule_type, genome_type):
                nseqs += 1
                nbases += len(record.seq)
                # The id of the node (the second column) must be a unique
                # identifier; the database table already provides one, so
                # reuse it here.
                print >> fnodes, "%s\t%d\t%s\t%d" % (record.header, record.id, load_id, nloads)
                utils.write_fasta(ffasta, record.seq, record.id)

    if not nseqs:
        utils.die("no sequences were written to the FASTA file")

    diagnostics.log_path(nodes, "nodes")

    diagnostics.log("nloads", nloads)
    diagnostics.log("nseqs", nseqs)
    diagnostics.log("nbases", nbases)

    ingest("blast_dir", "fasta", "nodes")
def parse_edges(_run_id, seq_type, blast_hits, min_overlap, min_bitscore, min_nodes):
    """Parse BLAST hits into edges weighted by bitscore"""

    graph = nx.Graph()
    edge_file = "allvall_edges_%s.abc" % _run_id

    nseqs = 0
    nedges = {"all": 0, "non-self": 0, "passed-overlap": 0, "passed-bitscore": 0, "passed-bitscore-unique": 0}
    max_bitscore = None
    last_query = ""  # For identification of a new query in the table

    for f in glob(blast_hits):
        for line in open(f):
            # fields:
            # qseqid sseqid bitscore qlen length
            id_from, id_to, bitscore, qlen, length = line.rstrip().split()
            # Sometimes blast outputs a bad query id if the query is longer
            # than 10Kb
            if id_from.startswith("Query_"):
                utils.info("discarding bad hit with query id '%s'" % id_from)
                continue
            bitscore = float(bitscore)
            length = float(length) / float(qlen)
            # Correction for nucleotide vs. amino acid length is disabled:
            # 'length' here is already the overlap proportion, not a raw length.
            # if seq_type == 'nucleotide':
            #     length *= 3.0
            # Filter out self hits, low scoring hits, and short hits
            nedges["all"] += 1
            if id_from != last_query:
                # The self-hit is not always at the top, because the file is
                # sorted by e-value and ties happen; however, the self-hit
                # always has the same bitscore as the first hit.
                max_bitscore = bitscore
                last_query = id_from
            if id_from != id_to:
                nedges["non-self"] += 1
                if length > min_overlap:
                    nedges["passed-overlap"] += 1
                    # SRV: BLAST Score Ratio Value. Blom et al. BMC
                    # Bioinformatics 2009, 10:154 doi:10.1186/1471-2105-10-154
                    srv = float(bitscore) / float(max_bitscore)
                    if srv > min_bitscore:
                        nedges["passed-bitscore"] += 1
                        # If an edge already exists between the nodes, update
                        # its score to be the max
                        if graph.has_edge(id_from, id_to):
                            e = graph.edge[id_from][id_to]
                            e["score"] = max(e["score"], bitscore)
                        else:
                            nedges["passed-bitscore-unique"] += 1
                            graph.add_node(id_from)
                            graph.add_node(id_to)
                            graph.add_edge(id_from, id_to, score=bitscore)

    diagnostics.prefix.append("nedges")
    diagnostics.log_dict(nedges)
    diagnostics.prefix.pop()

    with open(edge_file, "w") as f:
        for subgraph in nx.connected_component_subgraphs(graph):
            nnodes = subgraph.number_of_nodes()
            if nnodes >= min_nodes:
                nseqs += nnodes
                for id_from, id_to in subgraph.edges_iter():
                    print >> f, "%s\t%s\t%f" % (id_from, id_to, subgraph.edge[id_from][id_to]["score"])

    if not nseqs:
        utils.die("no sequences were written to the FASTA file")

    diagnostics.log("nseqs", nseqs)

    ingest("edge_file")