Esempio n. 1
0
    def test_to_fastn(self):
        '''to_fastn() should turn each SAM record into the expected Fasta/Fastq'''
        # One SAM line per case; only the flag and quality columns vary.
        sam_lines = [
            'ID\t0\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII',
            'ID\t16\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII',
            'ID\t65\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII',
            'ID\t129\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII',
            'ID\t0\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\t*',
            'ID\t16\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\t*',
        ]
        records = [sam.SamRecord(line) for line in sam_lines]

        # Expected conversions, in the same order as sam_lines.
        expected = [
            fastn.Fastq('ID', 'ACGTA', 'IIIII'),
            fastn.Fastq('ID', 'TACGT', 'IIIII'),
            fastn.Fastq('ID/1', 'ACGTA', 'IIIII'),
            fastn.Fastq('ID/2', 'ACGTA', 'IIIII'),
            fastn.Fasta('ID', 'ACGTA'),
            fastn.Fasta('ID', 'TACGT'),
        ]

        for want, record in zip(expected, records):
            self.assertEqual(want, record.to_fastn())
Esempio n. 2
0
    def test_get_next_from_file(self):
        '''get_next_from_file() should read every record from fastn_unittest.fa.

        Each record read is expected to have the sequence 'ACGTA' and an id
        equal to its 1-based position in the file.
        '''
        f_in = utils.open_file_read('fastn_unittest.fa')
        fa = fastn.Fasta()
        counter = 1

        # Close the handle even if an assertion fails part-way through, so a
        # failing test does not leak the open file.
        try:
            while fa.get_next_from_file(f_in):
                self.assertEqual(fa, fastn.Fasta(str(counter), 'ACGTA'))
                counter += 1
        finally:
            utils.close(f_in)
Esempio n. 3
0
    def test_translate(self):
        '''translate() on a Fasta should convert nucleotides to amino acids'''
        nucleotides = 'GCAGCCGCGGCTAGAAGGCGACGCCGGCGTAACAATGACGATTGCTGTGAAGAGCAACAGGGAGGCGGGGGTCACCATATAATCATTTTATTGCTACTCCTGCTTAAAAAGATGTTCTTTCCACCCCCGCCTAGCAGTTCATCCTCGTCTACAACCACGACTTGGTACTATGTAGTCGTGGTTTAATAGTGA'
        amino_acids = 'AAAARRRRRRNNDDCCEEQQGGGGHHIIILLLLLLKKMFFPPPPSSSSSSTTTTWYYVVVV***'

        fa = fastn.Fasta('ID', nucleotides)
        expected = fastn.Fasta('ID', amino_acids)

        self.assertEqual(expected, fa.translate())
Esempio n. 4
0
    def test_split_capillary_id(self):
        '''split_capillary_id() should parse a sanger capillary read name'''
        # Each case pairs a read name with the dictionary it should split into.
        cases = [
            ('abcde.p1k',
             {'prefix': 'abcde', 'dir': 'fwd', 'suffix': 'p1k'}),
            ('abcde.x.p1k',
             {'prefix': 'abcde.x', 'dir': 'fwd', 'suffix': 'p1k'}),
            ('abcde.p1ka',
             {'prefix': 'abcde', 'dir': 'fwd', 'suffix': 'p1ka'}),
            ('abcde.q1k',
             {'prefix': 'abcde', 'dir': 'rev', 'suffix': 'q1k'}),
            ('abcde.w2k',
             {'prefix': 'abcde', 'dir': 'unk', 'suffix': 'w2k'}),
        ]

        for read_name, want in cases:
            read = fastn.Fasta(read_name, 'A')
            self.assertEqual(read.split_capillary_id(), want)
def update_perfect_contigs(nucmer_hit, ref_fasta, contigs):
    '''Store the reference region covered by nucmer_hit in contigs.

    The slice ref_fasta[ref_start - 1:ref_end] is wrapped in a fastn.Fasta
    and stored in contigs under the key (ref_name, ref_start, ref_end),
    replacing any entry already stored under that key.
    '''
    start = nucmer_hit.ref_start
    end = nucmer_hit.ref_end

    # NOTE(review): the original built a "ref_name:start-end" string here but
    # never used it, so the stored contig is still named 'x'. Confirm whether
    # that string was meant to be the contig name.
    contig = fastn.Fasta('x', ref_fasta[start - 1:end])
    contigs[(nucmer_hit.ref_name, start, end)] = contig
Esempio n. 6
0
 def test_getitem(self):
     '''Indexing and slicing a Fasta should match the same operation on its string'''
     bases = 'AACGTGTCA'
     fa = fastn.Fasta('x', bases)
     # A single index, a bounded slice and an open-ended slice.
     for key in (1, slice(0, 2), slice(1, None)):
         self.assertEqual(bases[key], fa[key])
Esempio n. 7
0
 def test_file_reader_fasta(self):
     '''file_reader should yield every record of a fasta file in order'''
     records = fastn.file_reader('fastn_unittest.fa')
     # Record ids in the fixture are expected to be 1, 2, 3, ...
     for number, record in enumerate(records, start=1):
         self.assertEqual(record, fastn.Fasta(str(number), 'ACGTA'))
Esempio n. 8
0
    def test_contig_coords(self):
        '''contig_coords() should return the coordinates of every contig in a sequence'''
        Interval = genome_intervals.Interval
        # Each case pairs a sequence with the intervals expected for it.
        cases = [
            ('ACGT', [Interval(0, 3)]),
            ('NACGT', [Interval(1, 4)]),
            ('NNACGT', [Interval(2, 5)]),
            ('ACGTN', [Interval(0, 3)]),
            ('ACGTNN', [Interval(0, 3)]),
            ('NANNCGT', [Interval(1, 1), Interval(4, 6)]),
            ('ANNCGTNNAAAAA',
             [Interval(0, 0), Interval(3, 5), Interval(8, 12)]),
        ]

        for bases, want in cases:
            coords = fastn.Fasta('ID', bases).contig_coords()
            self.assertListEqual(want, coords)
Esempio n. 9
0
 def test_search_string(self):
     '''search() should report every hit together with its strand'''
     fa = fastn.Fasta('X', 'AAA')

     # No hit at all.
     self.assertTrue(len(fa.search('G')) == 0)
     # Single full-length hit, then overlapping hits.
     self.assertListEqual(fa.search('AAA'), [(0, '+')])
     self.assertListEqual(fa.search('AA'), [(0, '+'), (1, '+')])
     # Query that is reported with the '-' strand.
     self.assertListEqual(fa.search('TTT'), [(0, '-')])
Esempio n. 10
0
    def test_file_to_dict(self):
        '''file_to_dict() should fill the dictionary with every record of the file'''
        d_test = {}
        fastn.file_to_dict('fastn_unittest.fa', d_test)
        expected = {str(i): fastn.Fasta(str(i), 'ACGTA') for i in range(1, 5)}

        # dict key views are not sequences, so compare sorted key lists
        # instead of passing the views to assertSequenceEqual.
        self.assertEqual(sorted(d_test), sorted(expected))
        for key, want in expected.items():
            self.assertEqual(d_test[key].id, want.id)
            self.assertEqual(d_test[key].seq, want.seq)
Esempio n. 11
0
    def to_fastn(self):
        '''Return this record as a fastn.Fasta or fastn.Fastq.

        A quality string of '*' gives a Fasta; anything else gives a Fastq.
        The result is reverse complemented when query_strand() is '-', and
        '/1' or '/2' is appended to its id for first/second reads of a pair.
        '''
        has_qualities = self.qual != '*'
        if has_qualities:
            record = fastn.Fastq(self.id, self.seq, self.qual)
        else:
            record = fastn.Fasta(self.id, self.seq)

        on_reverse_strand = self.query_strand() == '-'
        if on_reverse_strand:
            record.revcomp()

        # The first-of-pair check takes priority over second-of-pair.
        if self.is_first_of_pair():
            record.id += '/1'
        elif self.is_second_of_pair():
            record.id += '/2'

        return record
Esempio n. 12
0
    def test_gaps(self):
        '''gaps() should report every gap in a sequence'''
        Interval = genome_intervals.Interval
        # Each case pairs a sequence with the gap intervals expected for it.
        cases = [
            ('ACGT', []),
            ('NACGT', [Interval(0, 0)]),
            ('NACGTN', [Interval(0, 0), Interval(5, 5)]),
            ('ANNCGT', [Interval(1, 2)]),
            ('NANNCGTNN',
             [Interval(0, 0), Interval(2, 3), Interval(7, 8)]),
        ]

        for bases, want in cases:
            found = fastn.Fasta('ID', bases).gaps()
            self.assertListEqual(want, found)
Esempio n. 13
0
    def get_next_from_file(self, f):
        '''Read the next caf record from the open file handle f into self.

        Sets self.id and self.seq (a Fastq built from the DNA and BaseQuality
        sections), plus any of insert_min/insert_max, ligation, clone and
        clip_start/clip_end that appear in the record's trailing section.

        Returns True when a record was read, and None when the end of the
        file is reached before a record starts. Raises Error when the
        expected 'DNA : ' or 'BaseQuality : ' header line is missing.
        '''
        line = f.readline()
        # Skip blank lines before the record. readline() returns '' at EOF,
        # so this loop also stops there.
        while line == '\n':
            line = f.readline()

        # Checked after skipping blanks so that trailing blank lines at the
        # end of the file count as EOF rather than a malformed record.
        if not line:
            return None

        if not line.startswith('DNA : '):
            raise  Error("Error reading caf file. Expected line starting with 'DNA : ...'")

        self.id = line.rstrip().split()[2]

        line = f.readline()
        seq = []

        # The sequence block ends at the first blank line. Also stop at EOF
        # (''), otherwise a truncated file would loop forever.
        while line not in ('', '\n'):
            seq.append(line.rstrip())
            line = f.readline()

        self.seq = fastn.Fasta(self.id, ''.join(seq))

        line = f.readline()
        if not line.startswith('BaseQuality : '):
            raise  Error("Error reading caf file. Expected line starting with 'BaseQuality : ...'")

        # All quality scores are read from the single line after the header.
        quals = [int(x) for x in f.readline().rstrip().split()]
        self.seq = self.seq.to_Fastq(quals)

        line = f.readline()
        # A blank line is expected between the qualities and the tag section.
        assert line == '\n'
        line = f.readline()

        # Tag section: runs until the next blank line or EOF.
        while line not in ['', '\n']:
            a = line.rstrip().split()
            if a[0] == 'Insert_size':
                self.insert_min, self.insert_max = int(a[1]), int(a[2])
            elif a[0] == 'Ligation_no':
                self.ligation = a[1]
            elif a[0] == 'Clone':
                self.clone = a[1]
            elif a[0] == 'Clipping' and a[1] == 'QUAL':
                self.clip_start, self.clip_end = int(a[2]), int(a[3])

            line = f.readline()

        return True
Esempio n. 14
0
    def test_trim_Ns(self):
        '''trim_Ns() should reduce every test sequence to the same trimmed one'''
        expected = fastn.Fasta('ID', 'ANNANA')
        untrimmed = [
            'ANNANA',
            'NANNANA',
            'NANNANAN',
            'ANNANAN',
            'NNNNNNANNANAN',
            'NNANNANANn',
        ]
        candidates = [fastn.Fasta('ID', bases) for bases in untrimmed]

        for candidate in candidates:
            candidate.trim_Ns()
            self.assertEqual(expected, candidate)
Esempio n. 15
0
    def test_get_differences_from_ref(self):
        '''get_differences_from_ref() should report the expected differences'''
        ref = fastn.Fasta('ID', 'ACGTACGTACGT')

        # (cigar, read sequence, expected differences) for each case.
        cases = [
            (cigar.Cigar("12M"), 'ACGTACGTACGT',
             []),
            (cigar.Cigar("12M"), 'AGGTACGTACGT',
             [(1, 'S', 'C/G', 1)]),
            (cigar.Cigar("1S12M"), 'AAGGTACGTACGT',
             [(1, 'S', 'C/G', 1)]),
            (cigar.Cigar("1S12M1S"), 'AAGGTACGTACGTA',
             [(1, 'S', 'C/G', 1)]),
            (cigar.Cigar("1M1I10M"), 'AiCGTACGTACGT',
             [(1, 'I', 'i', 1)]),
            (cigar.Cigar("3M1I3M1D3M"), 'AGGiTACTACGT',
             [(1, 'S', 'C/G', 1), (3, 'I', 'i', 1), (6, 'D', 'G', 1)]),
            (cigar.Cigar("2S3M1I3M1D3M5S"), 'ssAGGiTACTACGTsssss',
             [(1, 'S', 'C/G', 1), (3, 'I', 'i', 1), (6, 'D', 'G', 1)]),
        ]

        for read_cigar, read_seq, want in cases:
            self.assertListEqual(
                read_cigar.get_differences_from_ref(read_seq, ref), want)
Esempio n. 16
0
    def test_strip_illumina_suffix(self):
        '''strip_illumina_suffix() should remove a trailing /1 or /2 from ids'''
        original_names = [
            'name/1', 'name/2', 'name', 'name/1/2', 'name/2/1', 'name/3'
        ]
        stripped_names = [
            'name', 'name', 'name', 'name/1', 'name/2', 'name/3'
        ]
        records = [fastn.Fasta(name, 'A') for name in original_names]

        # Strip every record first, then check the ids in a second pass.
        for record in records:
            record.strip_illumina_suffix()

        for record, want in zip(records, stripped_names):
            self.assertEqual(record.id, want)
    'Used to generate a fake set of contigs from a genome. At regular intervals it puts in a gap and then breaks into contigs',
    usage='%(prog)s <infile> <gap length> <contig length> <outfile>')
parser.add_argument('infile', help='Name of fasta/q file to be read')
parser.add_argument('gap_length', type=int, help='Length of gaps to be added')
parser.add_argument('contig_length', type=int, help='Length of each contig')
parser.add_argument('outfile', help='Name of output fasta file')
options = parser.parse_args()

seq_reader = fastn.file_reader(options.infile)
f_out = utils.open_file_write(options.outfile)

# Shortest sequence that can hold two full contigs separated by one gap.
min_length = 2 * options.contig_length + options.gap_length
# Distance between the start positions of consecutive contigs.
step = options.contig_length + options.gap_length

for seq in seq_reader:
    if len(seq) < min_length:
        print('Sequence',
              seq.id,
              'too short (',
              len(seq),
              'bases). Skipping',
              file=sys.stderr)
        # The message promises a skip, so do not emit contigs for this
        # sequence (previously it fell through and was processed anyway).
        continue

    i = 0

    # NOTE(review): the strict '<' means a contig ending exactly on the last
    # base of the sequence is not written. Confirm that is intended.
    while i + options.contig_length < len(seq):
        # Contig ids use 1-based inclusive coordinates: <id>:<start>-<end>.
        contig = fastn.Fasta(
            seq.id + ':' + str(i + 1) + '-' + str(i + options.contig_length),
            seq[i:i + options.contig_length])
        print(contig, file=f_out)
        i += step

utils.close(f_out)
options = parser.parse_args()

# Gap intervals per sequence id, read from the optional tab-separated gaps
# file (id, start, end). Coordinates are shifted down by one on both ends.
gaps = {}
if options.gaps_file:
    gaps_fh = utils.open_file_read(options.gaps_file)
    for gap_line in gaps_fh:
        seq_id, gap_start, gap_end = gap_line.rstrip().split('\t')
        interval = genome_intervals.Interval(int(gap_start) - 1,
                                             int(gap_end) - 1)
        gaps.setdefault(seq_id, []).append(interval)
    utils.close(gaps_fh)

f_in = utils.open_file_read(options.fai_file)
f_out = utils.open_file_write(options.outfile)

# One output sequence per fai line: the first column is the name and the
# second the length. Every base is 'A' except inside gaps, which become 'N'.
for fai_line in f_in:
    fields = fai_line.rstrip().split()
    fa = fastn.Fasta(fields[0], 'A' * int(fields[1]))

    if fa.id in gaps:
        bases = list(fa.seq)
        for gap in gaps[fa.id]:
            bases[gap.start:gap.end + 1] = ['N'] * len(gap)
        fa.seq = ''.join(bases)

    print(fa, file=f_out)

utils.close(f_in)
utils.close(f_out)
Esempio n. 19
0
 def test_revcomp(self):
     '''revcomp() should reverse complement a sequence in place, keeping case'''
     expected = fastn.Fasta('ID', 'nacgtNACGT')
     record = fastn.Fasta('ID', 'ACGTNacgtn')
     record.revcomp()
     self.assertEqual(record, expected)
    type=int,
    help=
    'Seed for random number generator. Default is to use python\'s default',
    default=None)
parser.add_argument('contigs', type=int, help='Number of contigs to make')
parser.add_argument('length', type=int, help='Length of each contig')
parser.add_argument('outfile', help='Name of output file')
options = parser.parse_args()

# With seed=None the generator is seeded by python's default mechanism.
random.seed(a=options.seed)

fout = utils.open_file_write(options.outfile)
letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
letters_index = 0

for i in range(options.contigs):
    if options.name_by_letters:
        # Cycle through A-Z, wrapping back to A after Z.
        name = letters[letters_index]
        letters_index += 1
        if letters_index == len(letters):
            letters_index = 0
    else:
        name = str(i + options.first_number)

    # One random.choice() call per base, so output is reproducible for a
    # given seed.
    fa = fastn.Fasta(
        options.prefix + name,
        ''.join([random.choice('ACGT') for x in range(options.length)]))
    print(fa, file=fout)

utils.close(fout)
Esempio n. 21
0
    def test_get_next_from_file(self):
        '''get_next_from_file() should read caf records from file correctly.

        Reads two consecutive records from caf_unittest.caf and compares each
        one against a Caf object built by hand from the expected values.
        '''

        f_in = utils.open_file_read('caf_unittest.caf')

        # First record read from the file.
        c = caf.Caf()
        c.get_next_from_file(f_in)

        # Expected contents of the first record.
        e = caf.Caf()
        e.id = 'pknbac5b2Aa01.p1k'
        seq = ''.join(['NGGAGAGACTCGGACTAGTTCTACACCCTCACACCTTTGTCCTAAACCTTGAATCTAAGT'
                       'CCTAACACCCTGACACCTTTGTCCTAAGCCCGGAATCTAACTTCTAGCACCCCTACGACC',
                       'CTTATTCCTAAACCCAGAATCTGACTATTGACACCCCTACAACCCTAATTCCAACACCCT',
                       'TACAACCTTCATTCCAACACCGCAACAACCTTCATTCCAGCACCCCAACAACCTTCATTC',
                       'CAACACCCCAAACAACATCATTCCAACACCCCAAACAACATCATTCCAACACCCCAAACA',
                       'ACATCATTCCAACACCCCAAACAACATCATTCCAACACGGCAACAACATCATTCGAACAC',
                       'CCCTACAACATCATTCCAGCACCCCAACAACCTCCCTGCGAAACCCCGAATCCGAATTTT',
                       'GACACCCCTACAACCTTATTCTGACACCCCCAACAAACTTTCTCTAACACCCCAACAACG',
                       'TGACTACTAATACACCTAAAACCTTACTCCTAAACCCGGAATCCGACTTCTAATACCGCA',
                       'ACAACCTTCATTCCTAAACCCGGAATCTGAACCCTGAACCATTAAAACATAAAACGTGGA',
                       'AAATGAACCCCTGAACCATGAAAACCGTGAAAACCTATAACTTGGACCATGAACCTCTCA',
                       'ACCCCGAAATATGAGAACTTTGGAAACCCTAAATTTTGGGAAAACTCCTTTTTTTTTTTT',
                       'TTATTGTACATCCTGTGCGATGGTATACATTTTGGCGAATGCAAAAGAATTAGCATATAT',
                       'ATATGTGTAGGTCTTTGTGATGGTCAGGGGGGAGATCGACTAGGGTGTAGGTCTTTGTGA',
                       'TGGTCAAGGGAGATGGGCCAAAGGGAAGTCGGACAAGGTGAGATGGGCCAAGGAGATGGG',
                       'CCTAGGGTGGATGGGACAAGGGTGGATGGTCAGAGGTGGATGGTCAAGGGTGGATGGTCA',
                       'AGGATGAATGGGCAAGGGAGATGGGCAAAGTAGATGGGCAAGGGTGGATGGACAAGGTGG',
                       'ATGGCCAAAGTGGATGGCAAGGAGGATGGCCCAGGTAATAGGCAAGGAAATGGCCAGGTG',
                       'GATGGACCAGGTGGTGCCCTAATGGAGGCAGGGTGAAGTCCAGGAGGAGGCCCAGGAAAA',
                       'GGCCCAGAGAAACCCAAGGAAAGGCCCAGGGGGTGGGACAGGGGAAGCGCCAAGGGATGC',
                       'CAAGGTGGGGGCCAGAAAATAGCCCAGAAAAGGCCAAAATAAGCCAAGAAAAGCCCCAGA',
                       'AAACCCAAGAAA'])

        quals = [4, 4, 4, 4, 6, 6, 8, 6, 6, 6, 6, 10, 12, 11, 13, 13, 20, 19, 9, 10, 9, 9, 9, 19, 19, 34, 34, 39, 35, 35, 35, 37, 35, 34, 26, 26, 16, 17, 11, 21, 21, 32, 35, 37, 37, 32, 45, 23, 17, 17, 18, 27, 29, 32, 35, 32, 32, 32, 32, 39, 35, 35, 35, 35, 35, 37, 42, 31, 31, 14, 13, 13, 25, 25, 35, 40, 33, 29, 23, 23, 15, 25, 24, 35, 35, 35, 35, 23, 36, 18, 18, 23, 28, 33, 29, 29, 32, 32, 32, 32, 35, 35, 32, 35, 35, 32, 35, 35, 44, 44, 37, 35, 28, 26, 24, 19, 23, 30, 33, 40, 32, 32, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 50, 50, 50, 37, 30, 30, 27, 27, 21, 21, 21, 29, 26, 29, 29, 23, 23, 28, 37, 37, 50, 50, 40, 35, 35, 32, 32, 32, 35, 44, 37, 35, 35, 35, 35, 35, 35, 32, 32, 35, 35, 35, 35, 44, 42, 42, 41, 41, 41, 41, 41, 42, 41, 41, 41, 41, 41, 41, 44, 44, 42, 42, 42, 42, 42, 35, 37, 35, 35, 33, 37, 37, 44, 44, 44, 41, 42, 50, 42, 42, 42, 44, 44, 50, 50, 44, 44, 44, 44, 44, 44, 50, 50, 44, 44, 44, 44, 44, 41, 42, 44, 42, 42, 42, 44, 44, 42, 42, 41, 41, 41, 42, 44, 50, 50, 50, 44, 44, 44, 44, 44, 37, 37, 37, 37, 39, 41, 41, 44, 44, 44, 44, 47, 47, 44, 44, 44, 43, 43, 42, 42, 37, 37, 37, 41, 41, 42, 44, 44, 44, 44, 44, 42, 42, 42, 41, 41, 41, 44, 44, 44, 46, 42, 41, 37, 37, 37, 37, 37, 41, 42, 35, 35, 35, 35, 35, 35, 35, 42, 41, 42, 44, 50, 42, 42, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 37, 37, 41, 44, 44, 47, 37, 37, 33, 33, 33, 27, 27, 37, 37, 47, 47, 47, 47, 47, 50, 44, 44, 42, 50, 35, 35, 35, 42, 42, 44, 50, 50, 50, 42, 42, 42, 42, 35, 35, 37, 42, 50, 44, 44, 44, 44, 44, 44, 47, 47, 47, 47, 44, 50, 44, 44, 44, 44, 47, 47, 44, 47, 50, 50, 50, 48, 37, 17, 17, 13, 22, 22, 35, 36, 42, 42, 35, 35, 35, 37, 37, 42, 50, 35, 35, 35, 35, 37, 37, 35, 35, 33, 33, 33, 33, 42, 42, 42, 41, 41, 41, 41, 41, 41, 50, 37, 44, 44, 44, 42, 37, 37, 21, 21, 21, 33, 33, 42, 50, 50, 44, 44, 44, 44, 44, 44, 44, 42, 37, 44, 44, 44, 44, 42, 42, 42, 42, 42, 44, 44, 44, 50, 50, 44, 44, 44, 37, 37, 35, 33, 33, 21, 21, 33, 33, 33, 41, 42, 42, 41, 44, 44, 44, 44, 42, 42, 42, 42, 44, 41, 
44, 37, 42, 37, 41, 41, 42, 42, 50, 50, 44, 44, 44, 44, 42, 42, 27, 33, 27, 33, 33, 37, 37, 50, 35, 35, 35, 37, 37, 44, 44, 50, 44, 44, 44, 37, 37, 35, 31, 31, 37, 37, 44, 44, 44, 44, 50, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 37, 37, 28, 28, 23, 28, 26, 33, 33, 33, 29, 29, 29, 33, 35, 46, 33, 23, 23, 26, 33, 33, 50, 44, 37, 37, 30, 37, 37, 42, 50, 30, 30, 30, 37, 37, 23, 28, 15, 15, 11, 15, 27, 37, 33, 37, 26, 26, 28, 37, 42, 48, 48, 37, 23, 23, 23, 31, 31, 33, 23, 23, 24, 31, 31, 31, 33, 24, 25, 21, 21, 21, 28, 31, 33, 42, 42, 42, 42, 44, 44, 44, 44, 30, 23, 16, 10, 10, 16, 24, 33, 24, 24, 24, 30, 33, 36, 42, 42, 44, 44, 42, 39, 39, 33, 46, 27, 28, 28, 33, 33, 37, 37, 37, 22, 22, 17, 19, 19, 33, 31, 33, 27, 27, 18, 18, 24, 29, 32, 33, 35, 33, 40, 40, 37, 34, 27, 27, 14, 14, 13, 13, 18, 12, 20, 25, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 47, 56, 56, 56, 47, 42, 42, 42, 42, 27, 23, 15, 11, 11, 27, 33, 42, 42, 33, 24, 10, 10, 10, 13, 15, 18, 14, 14, 14, 14, 14, 25, 30, 33, 30, 21, 21, 27, 27, 22, 15, 13, 22, 22, 19, 19, 15, 15, 11, 10, 17, 27, 27, 21, 15, 18, 13, 13, 16, 22, 24, 37, 31, 40, 40, 37, 47, 40, 37, 27, 27, 24, 24, 17, 20, 13, 10, 10, 11, 11, 14, 12, 19, 10, 10, 12, 14, 11, 10, 10, 10, 10, 15, 11, 15, 15, 25, 12, 12, 8, 8, 10, 17, 10, 21, 21, 8, 8, 8, 10, 10, 19, 25, 21, 19, 10, 10, 8, 9, 10, 12, 14, 17, 24, 22, 16, 16, 10, 9, 8, 14, 12, 12, 9, 9, 9, 9, 9, 9, 13, 19, 15, 18, 22, 22, 15, 15, 15, 15, 9, 10, 9, 8, 8, 9, 10, 14, 10, 10, 19, 15, 12, 9, 15, 4, 4, 4, 8, 8, 10, 12, 9, 8, 6, 6, 6, 6, 7, 7, 8, 8, 8, 8, 16, 10, 10, 10, 8, 7, 7, 7, 7, 7, 13, 20, 19, 15, 15, 10, 10, 8, 8, 10, 10, 10, 15, 10, 8, 8, 9, 8, 9, 10, 11, 10, 8, 8, 8, 8, 8, 4, 8, 4, 7, 7, 9, 13, 16, 11, 10, 12, 11, 13, 8, 8, 8, 8, 8, 9, 10, 9, 9, 9, 8, 8, 8, 12, 8, 9, 9, 11, 10, 10, 7, 7, 9, 7, 8, 9, 11, 10, 9, 10, 9, 10, 7, 7, 7, 9, 8, 8, 10, 8, 8, 4, 7, 4, 4, 4, 4, 4, 8, 7, 7, 8, 9, 9, 7, 7, 9, 9, 9, 9, 8, 7, 7, 7, 7, 10, 10, 7, 8, 8, 9, 10, 10, 10, 14, 13, 9, 8, 7, 7, 7, 6, 6, 7, 7, 
6, 6, 6, 6, 6, 8, 15, 10, 8, 8, 8, 8, 6, 7, 6, 6, 6, 6, 7, 7, 7, 8, 7, 4, 4, 4, 6, 6, 6, 6, 7, 13, 7, 7, 8, 8, 8, 7, 7, 7, 7, 7, 7, 8, 9, 7, 7, 7, 6, 6, 6, 6, 6, 7, 9, 7, 7, 7, 8, 10, 8, 8, 8, 8, 9, 6, 6, 6, 6, 6, 6, 6, 7, 7, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 7, 6, 6, 7, 9, 7, 7, 11, 6, 6, 7, 6, 6, 8, 7, 7, 8, 8, 10, 8, 8, 8, 6, 6, 7, 6, 6, 6, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 7, 7, 7, 12, 9, 14, 10, 10, 10, 10, 8, 9, 8, 8, 8, 7, 7, 7, 7, 7, 13, 7, 7, 6, 6, 6, 6, 6, 6, 8, 7, 7, 7, 6, 6, 6, 6, 6, 8, 8, 8, 9, 7, 7, 7, 8, 9, 7, 6, 6, 6, 6, 6, 6, 8, 8, 6, 6, 6, 6, 7, 7, 7, 8, 8, 9, 8, 9, 8, 8, 8, 8, 8, 8, 8, 12, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 6, 9, 6, 6, 6, 7, 7, 7, 7, 7, 8, 10, 10, 14, 9, 12, 7, 7, 7, 4, 4]
        # The expected sequence is built as a Fasta and then converted to a
        # Fastq using the expected quality scores.
        e.seq = fastn.Fasta(e.id, seq)
        e.seq = e.seq.to_Fastq(quals)
        e.insert_min = 2000
        e.insert_max = 4000
        e.ligation = '96781'
        e.clone = 'pknbac5b2'
        e.clip_start = 23
        e.clip_end = 789

        self.assertEqual(c, e)

        # Second record: read into the same Caf object c.
        c.get_next_from_file(f_in)

        # Expected contents of the second record.
        e = caf.Caf()
        e.id = 'pknbac5b2Aa02.p1k'

        seq = ''.join(['AAAGACATACGACCTTTTTTTTTTTCGATAACAAAGGGTATCCTTTCACCAGAAAAAAAA',
                       'AAAGAACATTCTTCTTTTTTCTTGAAGAACATACATTCTTTTTTTTATTTTATTTTTTTT',
                       'TTTCGACCCCTCAGTGTTGTGGTAGCATGATGTGTTGGACTTGAATGGTATATGTATTGA',
                       'TTGTTTCGTTCGTTATGTAATTTCCGGTTTTTCCCCGTGGCATCCGGATAGTGTATAGTA',
                       'TCCGGTCCCTGTGTTCAAAAAGTTTTTCCTTTTCCCCTTAAAGCAACTGAAGTTAAACCC',
                       'TGAACCTTACTACTGAACCCGGAATTTGACTTCTAAAACCCTGAAGAATGATTCCTATAA',
                       'CCCTAAAAAATCCAACCTAAAACATCCAAACTGAACCATAGAACCTTCCTCCTAAACCCG',
                       'GAATCTATGTTCTAACACCCTGACATCTTTGTCCTAAACCCTGAATCTAAGTTCTAACAT',
                       'CCTGACAACTCTCCCTCCTAAACCCGGAATCTAAATTCGTACACCCTGACACCTCCCCCC',
                       'TAAACCCGGAATCCGCATTCTAACACCCTGACAATTTCCTCCTGAAAAGCGGAATCTGAC',
                       'TTCTAACACCCTGACACCTTTGTCCTGAACCCGGAATCTAAGTTCTTACACCCGGACACC',
                       'TCCCTCCTAAATCCGGAATCTAAGTTCTAACACCCTCACACCTTTGTCCTAAACCTTGAA',
                       'TCTAAGTCCTAACACCCTGACACCTTTGTCCTAAGCCCGGAATCTAACTTCTAGCACCCC',
                       'TACGACCCTTATTCCTAAACCCAGAATCTGACTATTGACACCCCTACAACCCTAATTCCA',
                       'ACACCCTTACAACCTTCATTCCAACACCGCAACAACCTTCATTCCAGCACCCCTACAACT',
                       'TCATTCCTACACCCCAAACAACATCATCCCTACACCCCAAACAACATCATTCCTACACCC',
                       'CAAACACATCATCCAACACCCCATAACACATCATTCCAACACGGCAACAACATCATTCGA',
                       'AACACCCCTACAAATCATTGCAGCACCCCCACTACCTCCCTGCGTATACCCGTATTCGAA',
                       'ATTTTGACACCCCTACTACCTTTATCTGACACCCCCAAAAAACTCCTCTTAAACCCAACA',
                       'AGGGGACTATAATACCCCTAAAACTTTATCTTAACCGGAATCCGAATTCTATACCGAAAA',
                       'AACTTCTTTCCTAACCGGGATCTGTACCCCGAACTTTTAAAATTAAAGGGGAAATGAACC',
                       'CCTGACCAGATAACGGGAAACCTTTATTGTGACAGGAACTCCTACCGCAATATGAAAATT',
                       'GGACCCCAAATTTGGGAAACCCCTTTT'])


        quals = [9, 9, 6, 4, 4, 4, 4, 7, 6, 6, 8, 6, 6, 6, 7, 7, 14, 8, 8, 8, 10, 17, 21, 12, 9, 10, 10, 9, 11, 8, 9, 11, 11, 21, 12, 15, 15, 21, 24, 33, 32, 35, 29, 29, 22, 22, 15, 29, 25, 26, 18, 18, 18, 31, 31, 47, 56, 56, 56, 42, 36, 44, 28, 28, 28, 39, 33, 35, 30, 36, 33, 35, 35, 36, 35, 37, 42, 35, 35, 31, 29, 26, 26, 20, 33, 15, 22, 22, 29, 29, 32, 35, 35, 36, 35, 35, 42, 42, 37, 37, 42, 47, 47, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 44, 47, 47, 47, 47, 47, 35, 30, 30, 23, 24, 30, 45, 37, 37, 37, 35, 23, 23, 11, 11, 13, 23, 31, 21, 21, 19, 20, 23, 29, 23, 20, 16, 16, 30, 29, 29, 28, 28, 28, 24, 24, 17, 29, 29, 33, 33, 35, 31, 37, 18, 15, 12, 16, 16, 23, 27, 24, 32, 29, 32, 32, 24, 26, 29, 37, 29, 30, 35, 35, 33, 35, 35, 31, 33, 31, 31, 35, 31, 31, 31, 31, 27, 33, 33, 42, 35, 37, 37, 21, 21, 21, 21, 37, 37, 50, 50, 50, 50, 50, 33, 33, 18, 16, 15, 25, 19, 20, 33, 33, 33, 35, 35, 33, 33, 33, 18, 18, 18, 33, 24, 33, 33, 33, 27, 33, 33, 33, 33, 33, 22, 33, 33, 33, 24, 24, 21, 24, 24, 31, 31, 11, 11, 11, 31, 33, 44, 44, 37, 42, 42, 47, 44, 44, 44, 44, 44, 44, 44, 47, 50, 50, 42, 42, 42, 41, 42, 42, 47, 47, 37, 37, 27, 33, 33, 33, 33, 35, 35, 42, 41, 37, 37, 44, 50, 50, 33, 33, 27, 33, 37, 42, 42, 42, 41, 41, 33, 33, 27, 27, 33, 33, 37, 50, 35, 35, 35, 35, 35, 35, 35, 42, 35, 37, 35, 37, 35, 41, 37, 42, 42, 42, 42, 50, 50, 50, 42, 35, 33, 33, 21, 21, 16, 23, 19, 27, 27, 33, 35, 41, 50, 37, 35, 35, 42, 50, 50, 50, 44, 44, 44, 50, 42, 42, 37, 37, 35, 35, 35, 44, 44, 50, 50, 41, 37, 37, 37, 37, 35, 35, 35, 37, 37, 37, 44, 37, 37, 33, 33, 22, 33, 37, 35, 33, 33, 21, 21, 21, 33, 33, 41, 41, 44, 44, 44, 44, 44, 50, 50, 44, 44, 37, 50, 33, 33, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 50, 50, 50, 50, 50, 44, 47, 44, 44, 48, 33, 21, 24, 21, 33, 33, 35, 37, 50, 50, 37, 35, 35, 50, 50, 56, 50, 50, 50, 50, 48, 33, 27, 33, 27, 33, 33, 44, 50, 50, 42, 37, 35, 42, 42, 50, 50, 50, 44, 44, 44, 33, 33, 18, 19, 18, 33, 33, 42, 35, 35, 44, 44, 44, 50, 44, 44, 44, 50, 50, 44, 44, 
37, 37, 33, 33, 35, 35, 35, 35, 35, 37, 50, 37, 27, 27, 24, 37, 33, 35, 35, 37, 35, 37, 37, 46, 33, 24, 24, 21, 33, 33, 39, 42, 42, 44, 50, 50, 56, 50, 50, 37, 35, 35, 33, 37, 33, 33, 35, 35, 35, 35, 33, 33, 33, 33, 27, 27, 27, 37, 37, 44, 37, 41, 41, 41, 50, 46, 33, 24, 24, 16, 31, 19, 27, 31, 37, 37, 44, 44, 44, 37, 50, 23, 23, 22, 29, 31, 33, 23, 23, 23, 23, 23, 28, 25, 33, 26, 26, 22, 28, 37, 42, 44, 42, 42, 44, 44, 44, 44, 46, 33, 16, 19, 14, 27, 31, 42, 50, 50, 50, 44, 44, 44, 50, 50, 26, 26, 21, 28, 31, 29, 29, 26, 26, 26, 30, 30, 39, 27, 37, 26, 30, 30, 42, 42, 42, 36, 33, 29, 33, 33, 33, 20, 21, 23, 17, 23, 31, 36, 42, 43, 56, 56, 47, 47, 42, 42, 33, 33, 29, 29, 23, 31, 25, 26, 26, 26, 30, 30, 36, 27, 33, 28, 31, 33, 35, 44, 33, 33, 28, 33, 35, 44, 48, 48, 48, 42, 47, 42, 42, 42, 48, 44, 44, 37, 34, 34, 44, 48, 42, 37, 34, 42, 48, 33, 33, 34, 30, 30, 33, 33, 40, 30, 37, 28, 28, 26, 27, 27, 25, 19, 16, 25, 29, 40, 31, 27, 15, 18, 13, 25, 27, 40, 40, 33, 40, 33, 33, 33, 40, 37, 23, 12, 12, 17, 11, 10, 15, 15, 13, 13, 13, 18, 27, 23, 28, 28, 28, 28, 37, 28, 32, 26, 23, 26, 26, 19, 29, 25, 24, 25, 24, 15, 15, 15, 12, 17, 24, 24, 21, 21, 21, 25, 22, 29, 25, 22, 21, 24, 25, 17, 17, 14, 14, 12, 14, 19, 24, 18, 18, 14, 21, 11, 15, 10, 15, 18, 22, 27, 25, 25, 29, 29, 29, 25, 26, 25, 21, 22, 25, 22, 22, 18, 15, 15, 15, 25, 19, 25, 25, 16, 24, 24, 20, 20, 22, 20, 15, 10, 10, 10, 12, 13, 20, 20, 12, 14, 14, 12, 12, 12, 15, 15, 15, 18, 18, 11, 10, 11, 11, 10, 10, 14, 15, 18, 18, 19, 17, 12, 11, 10, 10, 20, 15, 19, 24, 24, 24, 23, 15, 13, 7, 6, 6, 6, 6, 6, 12, 13, 12, 9, 8, 10, 10, 9, 6, 6, 6, 6, 10, 10, 13, 15, 15, 15, 15, 17, 9, 9, 9, 9, 9, 11, 11, 9, 7, 7, 7, 6, 4, 4, 6, 9, 9, 8, 8, 8, 10, 9, 8, 7, 7, 7, 7, 7, 9, 13, 10, 10, 10, 15, 12, 9, 9, 9, 15, 19, 15, 15, 11, 7, 7, 7, 7, 7, 7, 8, 8, 19, 10, 10, 10, 12, 12, 19, 11, 15, 18, 11, 14, 9, 9, 6, 6, 6, 6, 6, 6, 6, 8, 11, 20, 13, 17, 14, 14, 9, 9, 10, 17, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 11, 10, 12, 11, 10, 12, 
9, 12, 8, 8, 8, 9, 12, 12, 8, 11, 7, 8, 8, 8, 8, 11, 9, 8, 6, 4, 4, 4, 6, 6, 7, 10, 10, 12, 9, 7, 7, 6, 6, 6, 6, 8, 6, 9, 10, 13, 8, 11, 8, 7, 7, 8, 7, 7, 7, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10, 14, 10, 8, 12, 8, 8, 6, 7, 9, 8, 7, 6, 8, 7, 4, 4, 7, 7, 6, 7, 6, 6, 6, 6, 8, 11, 8, 8, 8, 8, 12, 10, 12, 11, 11, 11, 10, 12, 10, 7, 7, 9, 4, 4, 8, 6, 6, 6, 6, 6, 6, 7, 10, 7, 7, 7, 7, 7, 9, 9, 9, 7, 7, 7, 6, 6, 6, 7, 7, 7, 10, 11, 9, 7, 6, 6, 8, 6, 6, 8, 8, 8, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 9, 12, 10, 15, 15, 16, 7, 7, 6, 6, 6, 7, 6, 6, 6, 6, 6, 8, 7, 7, 8, 7, 8, 7, 7, 9, 8, 7, 7, 8, 8, 9, 7, 6, 7, 6, 9, 6, 7, 11, 7, 7, 11, 8, 8, 7, 10, 8, 9, 8, 6, 6, 6, 6, 7, 7, 7, 6, 6, 6, 8, 8, 7, 7, 6, 9, 7, 6, 6, 6, 6, 6, 6, 6, 8, 7, 7, 7, 7, 7, 7, 7, 10, 12, 19, 13, 13, 10, 9, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 6, 6, 6, 6, 6, 4, 6, 4, 4, 4, 4, 4, 4, 6, 6, 6, 7, 7, 7, 7, 8, 7, 6, 6, 6, 6, 6, 6, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 11, 12, 9]

        e.seq = fastn.Fasta(e.id, seq)
        e.seq = e.seq.to_Fastq(quals)
        e.insert_min = 2000
        e.insert_max = 4000
        e.ligation = '96781'
        e.clone = 'pknbac5b2'
        e.clip_start = 33
        e.clip_end = 848
        self.assertEqual(c, e)

        # NOTE(review): the handle is only closed when both assertions pass.
        utils.close(f_in)
        genome_intervals.merge_overlapping_in_list(hits)

        for hit in hits:
            if hit.end - hit.start + 1 >= options.min_seq_length:
                if ref_name not in contigs_to_print:
                    contigs_to_print[ref_name] = []

                contigs_to_print[ref_name].append(copy.copy(hit))

# remove any contigs that are completely contained in another contig
for intervals in contigs_to_print.values():
    genome_intervals.remove_contained_in_list(intervals)

# print the final perfect contigs, one fasta record per interval, named
# <ref_name>:<n>:<start>:<end> where n restarts at 1 for every reference
f_out = utils.open_file_write(options.outprefix + '.fa')
for ref_name in sorted(contigs_to_print):
    for counter, interval in enumerate(contigs_to_print[ref_name], start=1):
        contig_id = ':'.join(
            str(x) for x in (ref_name, counter, interval.start, interval.end))
        # interval.start is shifted down by one and interval.end used as the
        # exclusive slice bound when cutting the reference sequence.
        print(fastn.Fasta(contig_id,
                          ref_seqs[ref_name][interval.start - 1:interval.end]),
              file=f_out)

utils.close(f_out)
Esempio n. 23
0
 def test_to_Fasta_and_qual(self):
     '''to_Fasta_and_qual() should return the Fasta plus integer quality scores'''
     fastq_record = fastn.Fastq('ID', 'ACGT', '>ADI')
     fasta_record, scores = fastq_record.to_Fasta_and_qual()
     self.assertEqual(fasta_record, fastn.Fasta('ID', 'ACGT'))
     self.assertListEqual(scores, [29, 32, 35, 40])
Esempio n. 24
0
# Histogram of GC values: one bucket per integer 0..100, all starting at zero.
gc_hist = dict.fromkeys(range(101), 0)

f_in = mh12_utils.open_file_read(options.infile)
f_out = mh12_utils.open_file_write(options.txtout)

while True:
    seq = fastn.get_next_seq_from_file(f_in, filetype)

    # A false value from the reader ends the loop.
    if not seq:
        break

    if not options.window:
        # No window given: one GC value for the whole sequence.
        gc = seq.gc()
        gc_hist[floor(gc)] += 1
        print >> f_out, seq.id, gc
    else:
        # Windowed mode: GC of consecutive, non-overlapping windows, each
        # reported with the 1-based start position of the window.
        i = 0

        while i < len(seq):
            window_seq = fastn.Fasta(seq.id, seq.seq[i:i + options.window])
            gc = window_seq.gc()
            gc_hist[floor(gc)] += 1
            print >> f_out, seq.id, str(i + 1), gc
            i += options.window

f_in.close()
f_out.close()

c = not options.noclean

mh12_utils.hist2Rplot(gc_hist,
def get_unique_tags(ref_index,
                    tag_length,
                    unique_tagged_seqs,
                    untagged_seqs,
                    unique_tags,
                    log_fh,
                    second_index=None):
    '''Try to find a uniquely mapping tag for every sequence in untagged_seqs.

    A tag is written to a temporary fasta file for each untagged sequence:
    the whole sequence when it is shorter than tag_length, otherwise the
    tag_length bases around its midpoint. The tags are then passed to
    map_and_parse_sam() against ref_index (and second_index, if given) to
    count hits per tag.

    Every sequence whose tag has exactly one hit in each index used is moved
    from untagged_seqs to unique_tagged_seqs, and its tag details
    [id, start, end, tag sequence] are appended to unique_tags. All three
    containers are modified in place; nothing is returned.

    Does nothing when untagged_seqs is empty. Exits the program with status 1
    if the temporary tags file cannot be deleted.
    '''
    if len(untagged_seqs) == 0:
        return
    tags_fasta_fname = options.outprefix + '.tags.test.fa'
    fout_tags = utils.open_file_write(tags_fasta_fname)
    tags = {}
    tag_info = {}

    # make fasta file of tags
    for seq_id, seq in untagged_seqs.items():
        if len(seq) < tag_length:
            # Sequence is shorter than a tag: use all of it.
            tag = fastn.Fasta(seq.id + ':1-' + str(len(seq)), seq.seq)
            # NOTE(review): coordinates are stored as strings here but as
            # ints in the branch below. Kept as-is, since consumers of
            # unique_tags are not visible from here.
            tag_info[seq_id] = [seq_id, '1', str(len(seq)), tag.seq]
        else:
            # Centre a window of tag_length bases on the sequence midpoint
            # (0-based, inclusive coordinates; reported 1-based).
            left_coord = int(0.5 * len(seq) - 0.5 * tag_length)
            right_coord = left_coord + tag_length - 1
            tag = fastn.Fasta(
                seq.id + ':' + str(left_coord + 1) + '-' +
                str(right_coord + 1), seq[left_coord:right_coord + 1])
            tag_info[seq_id] = [
                seq_id, left_coord + 1, right_coord + 1, tag.seq
            ]

        print(tag, file=fout_tags)
        tags[seq_id] = copy.copy(seq)

    utils.close(fout_tags)

    # get the count of number of hits per tag from the results
    tag_counts = {}
    second_tag_counts = {}
    map_and_parse_sam(ref_index, tags_fasta_fname, tag_counts, log_fh)

    if second_index:
        map_and_parse_sam(second_index, tags_fasta_fname, second_tag_counts,
                          log_fh)
        assert len(tag_counts) == len(second_tag_counts)

    # update the unique/non-unique tagged sequences
    for contig_name, hit_count in tag_counts.items():
        assert contig_name not in unique_tagged_seqs
        if second_index:
            second_hit_count = second_tag_counts[contig_name]
        else:
            # Without a second index only the first count decides uniqueness.
            second_hit_count = 1
        if hit_count == 1 == second_hit_count:
            unique_tagged_seqs[contig_name] = tags[contig_name]
            unique_tags.append(tag_info[contig_name])
            del untagged_seqs[contig_name]

    # Only filesystem errors are handled; a bare except would also swallow
    # KeyboardInterrupt and SystemExit.
    try:
        os.unlink(tags_fasta_fname)
    except OSError:
        print('Error deleting file "' + tags_fasta_fname + '"',
              file=sys.stderr)
        sys.exit(1)
Esempio n. 26
0
 def test_to_Fastq(self):
     '''to_Fastq() should convert scores to a Fastq, including out of range values'''
     # The scores below are expected to encode as '!!I~~'.
     scores = [-1, 0, 40, 93, 94]
     record = fastn.Fasta('X', 'AAAAA')
     expected = fastn.Fastq('X', 'AAAAA', '!!I~~')
     self.assertEqual(expected, record.to_Fastq(scores))
Esempio n. 27
0
 def to_fasta(self, id):
     '''Return a fastn.Fasta named id, built from the currently selected bases.

     For each index i, the character taken is self.bases[i][self.positions[i]].
     '''
     chosen = [
         self.bases[index][position]
         for index, position in enumerate(self.positions)
     ]
     return fastn.Fasta(id, ''.join(chosen))
Esempio n. 28
0
 def test_replace_bases(self):
     '''replace_bases() should swap every occurrence of one base for another'''
     expected = fastn.Fasta('X', 'ATCGTTTACT')
     record = fastn.Fasta('X', 'AUCGTUUACT')
     record.replace_bases('U', 'T')
     self.assertEqual(record, expected)
 def to_fasta(self):
     '''Return a fastn.Fasta built from this object's id and seq'''
     name, bases = self.id, self.seq
     return fastn.Fasta(name, bases)
Esempio n. 30
0
 def setUp(self):
     '''Create the Fasta fixture stored on self.fasta'''
     fixture = fastn.Fasta('ID', 'ACGTA')
     self.fasta = fixture