예제 #1
0
 def test_multi_processing_blast(self):
     """ Test that a set of blast runs using multiprocessing run

     The first ``n_seqs`` records of the test FASTA file are queued in a
     ``BLASTMultiProcessing`` object (the same file is passed as the
     database argument). The output files returned by ``run()`` are then
     given to a ``BLASTMultiProcessingParser``. Both stages must return
     exactly ``n_seqs`` results.
     """
     fn_database = os.path.join(self.datadir, "mini_nr", "nr_test2")
     identifier = "temp.{0}"
     n_seqs = 20
     blaster = BLASTUtilities.BLASTMultiProcessing()
     for i, seq_record in enumerate(SeqIO.parse(fn_database, "fasta")):
         if i == n_seqs:
             break
         # str(seq) replaces the deprecated Seq.tostring()
         blaster.add_sequence(str(seq_record.seq),
                              identifier.format(i), fn_database)
     fn_identifier_pairs = blaster.run()
     try:
         self.assertEqual(len(fn_identifier_pairs), n_seqs,
                          "Unexpected number of BLAST results")
         blast_parser = BLASTUtilities.BLASTMultiProcessingParser()
         # NOTE(review): the first element of each pair is passed through
         # the identifier template again, exactly as before; confirm what
         # run() returns in that position.
         for first, fn in fn_identifier_pairs:
             blast_parser.add_file(identifier.format(first), fn)
         parsing_results = blast_parser.run()
         n_parsed = len(parsing_results)
         self.assertEqual(
             n_parsed, n_seqs,
             "Unexpected number of  parsed results {0}".format(n_parsed))
     finally:
         # Remove the BLAST output files even when an assertion or the
         # parser fails, so a failing run does not leave files behind.
         for _, fn in fn_identifier_pairs:
             if os.path.exists(fn):
                 os.remove(fn)
    def test_do_blast(self):
        """ Test that a BLAST subprocess runs

        Runs ``do_blast`` on the first record of the test FASTA file, checks
        that the reported output file exists, parses it and compares the
        first bit score and e-value against the expected values. The output
        file is removed afterwards, whether or not the checks pass.
        """
        fn_sequence = os.path.join(self.datadir, "2061973757.fasta")
        fn_database = os.path.join(self.datadir, "mini_nr", "proteins")

        # The next() builtin works on Python 2.6+ and 3; the .next() method
        # exists only on Python 2 iterators.
        record = next(SeqIO.parse(fn_sequence, "fasta"))
        identifier = "nothing"
        # str(seq) replaces the deprecated Seq.tostring()
        fn_output = BLASTUtilities.do_blast(str(record.seq), identifier,
                                            fn_database)
        try:
            self.assertTrue(os.path.exists(fn_output),
                            "BLAST did not produce the output file")

            results = BLASTUtilities.parse_blast(fn_output)
            self.assertEqual(len(results.titles), 1)
            self.assertAlmostEqual(947.577, results.bits[0], delta=0.001,
                                   msg="Score not correct")
            self.assertAlmostEqual(0, results.evalues[0], delta=1e-5,
                                   msg="E-value not correct")
        finally:
            # Clean up even when an assertion fails; the existence check
            # keeps a missing file from hiding the original failure.
            if os.path.exists(fn_output):
                os.remove(fn_output)
예제 #3
0
    def test_description_parsing(self):
        """ Test the parsing of a blast description

        Builds the set of expected "genus species" names (lowercased) from
        the first two space-separated words of each line in the check file,
        then collects everything ``BLASTResult.parse_organisms`` returns for
        the description of each FASTA record. The two sets must have the
        same size.
        """
        # File with all the microorganisms in nr.COG1528
        fn_check_file = os.path.join(self.datadir, "nr.COG1528.check_file")
        organisms = set()
        # The context manager closes the check file once it has been read
        with open(fn_check_file) as check_file:
            for words in csv.reader(check_file, delimiter=" "):
                if len(words) >= 2:
                    genus = words[0].lower()
                    species = words[1].lower()
                    organisms.add(genus + " " + species)
        log.debug("organisms in the check file: %s", organisms)
        # Parse all fasta descriptions
        fn_database = os.path.join(self.datadir, "nr.COG1528")
        organisms_parsed = set()
        p = BLASTUtilities.BLASTResult()
        for seq_record in SeqIO.parse(fn_database, "fasta"):
            # set.update adds eagerly on every Python version; map() used
            # for its side effects is lazy on Python 3 and would add nothing.
            organisms_parsed.update(
                p.parse_organisms(seq_record.description))
        log.debug("organisms_parsed: %s", organisms_parsed)
        self.assertEqual(len(organisms), len(organisms_parsed),
                         "The number of organisms parsed is not correct")
 def test_parse_blast(self):
     """ Parse a blast result with multiple entries

     ``parse_blast`` is called with 25 as its second argument; each of the
     titles, evalues, scores and bits lists of the result must then hold
     25 entries.
     """
     n_expected = 25
     xml_path = os.path.join(self.datadir, "2061976712.xml")
     parsed = BLASTUtilities.parse_blast(xml_path, n_expected)
     # Same checks, same order: titles, evalues, scores, bits
     for field in ("titles", "evalues", "scores", "bits"):
         self.assertEqual(len(getattr(parsed, field)), n_expected)
예제 #5
0
 def test_parse_blast(self):
     """ Parse a blast result with multiple entries

     The XML file is parsed with 25 passed as the second argument of
     ``parse_blast``, and every per-hit list of the result is expected to
     contain 25 items.
     """
     expected_hits = 25
     blast_output = BLASTUtilities.parse_blast(
         os.path.join(self.datadir, "2061976712.xml"), expected_hits)
     checked_attributes = ["titles", "evalues", "scores", "bits"]
     for attribute in checked_attributes:
         values = getattr(blast_output, attribute)
         self.assertEqual(len(values), expected_hits)
예제 #6
0
def blast(seqs):
    """
        Blast a set of sequences and parse the results. The function calls the
        MultiProcessing versions of the BLAST runner and of the parser.
        @seqs A list of tuples of (sequence, identifier for the sequence, database to use
        for the blast procedure)
        @return Returns a list of BLASTResult objects
        @raise ValueError if no sequences are given
    """
    if not seqs:
        raise ValueError("No sequences provided")
    blaster = BLASTUtilities.BLASTMultiProcessing()
    log.debug("Running blast from %s sequences", len(seqs))
    for seq in seqs:
        blaster.add_sequence(*seq)
    fns_blast_output = blaster.run()
    try:
        parser = BLASTUtilities.BLASTMultiProcessingParser()
        for identifier, fn in fns_blast_output:
            parser.add_file(identifier, fn)
        return parser.run()
    finally:
        # clean xmls after parsing; running this in "finally" also removes
        # them when the parser raises, so failed runs leave no files behind
        for _, fn in fns_blast_output:
            if os.path.exists(fn):
                os.remove(fn)
예제 #7
0
    def test_do_blast(self):
        """ Test that a BLAST subprocess runs

        The first record of the test FASTA file is searched with
        ``do_blast``. The test checks that the returned output file exists,
        that parsing it gives a single title, and that the first bit score
        and e-value match the expected values. The output file is deleted at
        the end, even if one of the checks fails.
        """
        fn_sequence = os.path.join(self.datadir, "2061973757.fasta")
        fn_database = os.path.join(self.datadir, "mini_nr", "proteins")

        parser = SeqIO.parse(fn_sequence, "fasta")
        # next(parser) is valid on Python 2.6+ and Python 3, whereas
        # parser.next() only exists on Python 2.
        S = next(parser)
        identifier = "nothing"
        # str(seq) replaces the deprecated Seq.tostring()
        fn_output = BLASTUtilities.do_blast(str(S.seq), identifier,
                                            fn_database)
        try:
            self.assertTrue(os.path.exists(fn_output),
                            "BLAST did not produce the output file")

            results = BLASTUtilities.parse_blast(fn_output)
            self.assertEqual(len(results.titles), 1)
            self.assertAlmostEqual(947.577,
                                   results.bits[0],
                                   delta=0.001,
                                   msg="Score not correct")
            self.assertAlmostEqual(0,
                                   results.evalues[0],
                                   delta=1e-5,
                                   msg="E-value not correct")
        finally:
            # Always remove the output file; skip the removal when BLAST
            # never created it so the original failure stays visible.
            if os.path.exists(fn_output):
                os.remove(fn_output)
예제 #8
0
def assign_genus_to_scaffolds(args):
    """ Assign genus to scaffolds in the database

    The function:
    1) Reads the genes in the database that belong to a given COG
    2) Reads the BLAST results for each of the genes.
    3) Recovers the best hit (genus and bit score) for the gene and
    identifies the scaffold where the gene is located
    4) Assigns the genus found in the hit to the scaffold.

    Various scaffolds can have different assignments. To select one assignment,
    1) sum the bit scores for each genus assigned to a scaffold.
    2) Choose the genus with the largest total bit score

    Finally, store the assignments in the database

    @args Object with the attributes fn_database (passed to
    MetagenomeDatabase) and fn_marker_cogs (space-delimited text file; the
    first field of every non-empty line is used as a marker COG id)
    @raise ValueError if the genes table or the BLAST results table is
    missing, or if the marker COGs file yields no COG ids
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    names = db.get_tables_names()
    if db.GenesTable not in names:
        raise ValueError("The database does not have a table of genes")
    if db.BlastResultsTable not in names:
        raise ValueError("The database does not have a table of BLAST results")
    # Read file marker cogs. The context manager closes the file as soon as
    # the ids are collected; blank lines are skipped instead of raising
    # IndexError on row[0].
    # NOTE(review): the "U" flag is Python 2 universal-newline mode and is
    # rejected by open() on Python 3.11+; revisit when porting.
    with open(args.fn_marker_cogs, "rU") as fhandle:
        reader = csv.reader(fhandle, delimiter=" ")
        marker_cogs = frozenset(row[0] for row in reader if row)
    if not marker_cogs:
        raise ValueError("No marker COGs provided")

    # Start from an empty assignments table on every run
    if db.ScaffoldsAssignmentsTable in names:
        db.drop_table(db.ScaffoldsAssignmentsTable)
    db.create_scaffold_assignments_table()

    blast_result = BLASTUtilities.BLASTResult()
    scaffolds_dict = {}
    for cog_id in marker_cogs:
        # read the genes and scaffolds for the cog
        # NOTE(review): cog_id is interpolated directly into the SQL text;
        # if db.execute supports bound parameters, prefer them for the value.
        sql_command = """SELECT {0}.gene_id,{0}.scaffold, {0}.dna_length,{1}.titles,{1}.bits
                         FROM {0}
                         INNER JOIN {1}
                         WHERE {0}.cog_id="{2}" AND {0}.gene_id={1}.gene_id
                      """.format(db.GenesTable, db.BlastResultsTable, cog_id)
        cursor = db.execute(sql_command)
        r = cursor.fetchone()
        while r:
            sc = r["scaffold"]
            organism, bit_score = blast_result.get_best_hit(
                r["titles"], r["bits"])
            # The genus is taken to be the first word of the organism name
            genus = organism.split(" ")[0]
            add_to_scaffold_dictionary(scaffolds_dict, sc, genus,
                                       float(bit_score))
            r = cursor.fetchone()

    # Assign the genus with the largest bit score
    data = []
    for scaffold in scaffolds_dict:
        # items() exists on both Python 2 and 3 dicts (iteritems() is
        # Python 2 only); itemgetter(1) ranks the pairs by their score.
        genus, bit_score = max(scaffolds_dict[scaffold].items(),
                               key=operator.itemgetter(1))
        data.append((scaffold, genus, bit_score))
    data = BiologyBasedRules.filter_genus_assignments(data,
                                                      n_appearances=2,
                                                      bit_score_threshold=30)
    db.store_data(db.ScaffoldsAssignmentsTable, data)
    db.close()