Beispiel #1
0
 def test_kmer_generator(self):
     """Check kmer_generator output for trivial and multi-sequence input."""
     # Trivial inputs: empty sequence and a single base.
     expected_empty = {}
     self.assertEqual(kmer_generator("", 0), expected_empty)

     expected_single_base = {0: ['A', 'T']}
     self.assertEqual(kmer_generator("A", 1), expected_single_base)

     # Two sequences, k = 3, default arguments.
     expected_default = {
         0: [
             'ATT', 'TTC', 'TCG', 'CGA', 'GAT',
             'ATC', 'TCG', 'CGA', 'GAA', 'AAT',
         ],
         1: ['GGG', 'GGG', 'GGG', 'GGG', 'CCC', 'CCC', 'CCC', 'CCC'],
     }
     self.assertEqual(kmer_generator(["ATTCG", "GGGG"], 3), expected_default)

     # Same sequences with extension=False yield the shorter lists below.
     expected_without_extension = {
         0: ['ATT', 'TTC', 'TCG', 'CGA', 'GAA', 'AAT'],
         1: ['GGG', 'GGG', 'CCC', 'CCC'],
     }
     self.assertEqual(
         kmer_generator(["ATTCG", "GGGG"], 3, extension=False),
         expected_without_extension)
Beispiel #2
0
def compute_match_score(query, template_obj, kmerLength, primers):
    '''
    Score the reads obtained for `query` against every template.

    Inputs:
    - query: passed unchanged to get_reads_records, which returns the read
      records, their optional gzip handles and the file name
    - template_obj: mapping whose "Kmers" entry is used for coarse
      filtering; the whole object is handed to fine_filtering
    - kmerLength: k-mer length forwarded to kmer_generator,
      coarse_filtering and fine_filtering
    - primers: mapping whose values are collections of primer sequences;
      their k-mers (generated with extension=False) form the flanking set

    Returns:
    - matchScore: list with each fine-filtering score divided by
      (1 + the flanking coverage at the same index)
    - filename: the file name reported by get_reads_records
    '''
    record1, record2, gzip1, gzip2, filename = get_reads_records(query)

    try:
        # Run reads through Coarse Filtering to drastically reduce
        # computation for Fine Filtering.
        reads = combine_records(record1, record2)

        # Flatten every primer set into one list, then collect the distinct
        # k-mers across all of those primer sequences.
        primer_sequences = [
            sequence
            for primer_set in primers.values()
            for sequence in primer_set
        ]
        primer_kmers = kmer_generator(primer_sequences, kmerLength, extension=False)
        flanking_sequences = {
            kmer for kmers in primer_kmers.values() for kmer in kmers
        }

        # The first return value (nucleotides seen) is not needed here.
        _nucleotides_seen, recruitedReads = coarse_filtering(
            reads, kmerLength, template_obj["Kmers"], flanking_sequences)
    finally:
        # Always release the input handles, even when filtering fails.
        # record1 is closed unconditionally; the other handles may be None.
        record1.close()
        for handle in (record2, gzip1, gzip2):
            if handle is not None:
                handle.close()

    # Run reads through Fine Filtering to get score for each template
    matchScore, flanking_coverage = fine_filtering(template_obj, recruitedReads, kmerLength, primers)
    print(matchScore)
    print(flanking_coverage)
    # Normalise each score by its flanking coverage; the +1 avoids a
    # division by zero when the coverage is 0.
    matchScore = [score / float(1 + flanking_coverage[i]) for i, score in enumerate(matchScore)]
    print(matchScore)
    print("\n")
    return matchScore, filename
Beispiel #3
0
    def setUp(self):
        """Load the packaged templates and primers used by the tests."""
        self.PATH_PREFIX = PATH_PREFIX

        # Read every template record from the FASTA resource.
        fasta_path = resource_filename('prince.resources', "templates.fasta")
        records = list(SeqIO.parse(fasta_path, "fasta"))
        names = [record.id for record in records]
        sequences = [str(record.seq) for record in records]

        # Bundle names, raw sequences and their 25-mers together.
        self.template_obj = {
            "Names": names,
            "Sequences": sequences,
            "Kmers": kmer_generator(sequences, 25),
        }

        primers_path = resource_filename('prince.resources', "TB_primers_extended.json")
        with open(primers_path) as primers_file:
            self.primers = json.load(primers_file)