Exemple #1
0
    def test_description_parsing(self):
        """Test the parsing of organisms from BLAST/FASTA descriptions.

        Builds the set of expected organism names from the check file (the
        first two space-separated words of each row, lowercased and joined
        as "genus species"), then parses the description of every FASTA
        record in nr.COG1528 with ``BLASTResult.parse_organisms``.

        Only the number of distinct organisms is compared, not the names
        themselves.
        """
        # File with all the microorganisms in nr.COG1528
        fn_check_file = os.path.join(self.datadir, "nr.COG1528.check_file")
        organisms = set()
        # Context manager so the handle is released even if a row fails.
        with open(fn_check_file) as check_file:
            for words in csv.reader(check_file, delimiter=" "):
                # Rows with fewer than two words cannot hold genus + species.
                if len(words) >= 2:
                    genus = words[0].lower()
                    species = words[1].lower()
                    organisms.add(genus + " " + species)
        log.debug("organisms in the check file: %s", organisms)
        # Parse all fasta descriptions
        fn_database = os.path.join(self.datadir, "nr.COG1528")
        parser = SeqIO.parse(fn_database, "fasta")
        organisms_parsed = set()
        p = BLASTUtilities.BLASTResult()
        for seq_record in parser:
            # set.update consumes the iterable eagerly. A bare map() call is
            # lazy on Python 3 and would leave the set empty.
            organisms_parsed.update(
                p.parse_organisms(seq_record.description))
        log.debug("organisms_parsed: %s", organisms_parsed)
        self.assertEqual(len(organisms), len(organisms_parsed),
                         "The number of organisms parsed is not correct")
Exemple #2
0
def assign_genus_to_scaffolds(args):
    """ Assign genus to scaffolds in the database

    The function:
    1) Reads the genes in the database that belong to a given COG
    2) Reads the BLAST results for each of the genes.
    3) Recovers the best hit (genus and bit score) for the gene and
    identifies the scaffold where the gene is located
    4) Assigns the genus found in the hit to the scaffold.

    Various scaffolds can have different assignments. To select one
    assignment:
    1) Sum the bit scores for each of the genera assigned to a scaffold.
    2) Choose the genus with the largest total bit score.

    Finally, store the assignments in the database. An existing table of
    scaffold assignments is dropped and recreated.

    Args:
        args: object with the attributes ``fn_database`` (path of the
            metagenome database) and ``fn_marker_cogs`` (path of a
            space-delimited file whose first column holds the marker
            COG ids).

    Raises:
        ValueError: if the database lacks the genes table or the BLAST
            results table, or if the marker COG file provides no COGs.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    names = db.get_tables_names()
    if db.GenesTable not in names:
        raise ValueError("The database does not have a table of genes")
    if db.BlastResultsTable not in names:
        raise ValueError("The database does not have a table of BLAST results")
    # Read file marker cogs. The context manager closes the handle, and
    # empty rows (blank lines) are skipped so row[0] cannot raise IndexError.
    with open(args.fn_marker_cogs) as fhandle:
        reader = csv.reader(fhandle, delimiter=" ")
        marker_cogs = frozenset(row[0] for row in reader if row)
    if not marker_cogs:
        raise ValueError("No marker COGs provided")

    if db.ScaffoldsAssignmentsTable in names:
        db.drop_table(db.ScaffoldsAssignmentsTable)
    db.create_scaffold_assignments_table()

    blast_result = BLASTUtilities.BLASTResult()
    # Filled by add_to_scaffold_dictionary; read below as
    # scaffold -> {genus: bit score}.
    scaffolds_dict = {}
    for cog_id in marker_cogs:
        # read the genes and scaffolds for the cog
        # NOTE(review): cog_id is interpolated straight into the SQL text.
        # It comes from the marker COG file; consider bound parameters if
        # db.execute supports them.
        sql_command = """SELECT {0}.gene_id,{0}.scaffold, {0}.dna_length,{1}.titles,{1}.bits
                         FROM {0}
                         INNER JOIN {1}
                         WHERE {0}.cog_id="{2}" AND {0}.gene_id={1}.gene_id
                      """.format(db.GenesTable, db.BlastResultsTable, cog_id)
        cursor = db.execute(sql_command)
        r = cursor.fetchone()
        while r:
            sc = r["scaffold"]
            organism, bit_score = blast_result.get_best_hit(
                r["titles"], r["bits"])
            # The genus is the first word of the organism name.
            genus = organism.split(" ")[0]
            add_to_scaffold_dictionary(scaffolds_dict, sc, genus,
                                       float(bit_score))
            r = cursor.fetchone()

    # Assign the genus with the largest bit score
    data = []
    for scaffold in scaffolds_dict:
        # items() works on both Python 2 and 3 (iteritems is Python 2 only).
        genus, bit_score = max(scaffolds_dict[scaffold].items(),
                               key=operator.itemgetter(1))
        data.append((scaffold, genus, bit_score))
    # NOTE(review): the meaning of n_appearances and bit_score_threshold is
    # defined by BiologyBasedRules.filter_genus_assignments -- confirm there.
    data = BiologyBasedRules.filter_genus_assignments(data,
                                                      n_appearances=2,
                                                      bit_score_threshold=30)
    db.store_data(db.ScaffoldsAssignmentsTable, data)
    db.close()