Example #1
0
    def test_no_real_bases_in_sample(self):
        # Each case contains one sample made up of gap characters only;
        # load_alignments is expected to raise for both of them.
        gap_only_cases = (
            [('s1', 'AA'), ('s2', '--'),],
            [('s1', '--'), ('s2', 'AA'), ('s3', 'TT'),],
        )
        for records in gap_only_cases:
            aligned = makeTempFasta(records)
            mapper = interhost.CoordMapper()
            with self.assertRaises(Exception):
                mapper.load_alignments([aligned])
Example #2
0
 def test_adjacent_gaps(self):
     # Five samples whose gaps sit next to, or overlap, one another.
     aligned = makeTempFasta([
         ('s1', 'ATCTG'),
         ('s2', 'AC--G'),
         ('s3', 'A-TTG'),
         ('s4', 'A-C-G'),
         ('s5', 'A--CG'),
     ])
     mapper = interhost.CoordMapper()
     mapper.load_alignments([aligned])

     # (source sample, target sample, ((source pos, expected target pos), ...))
     expectations = (
         ('s1', 's2', ((1, 1), (2, 2), (3, 2), (4, 2), (5, 3))),
         ('s2', 's1', ((1, 1), (2, [2, 4]), (3, 5))),
         ('s1', 's3', ((1, 1), (2, 1), (3, 2), (4, 3), (5, 4))),
         ('s3', 's1', ((1, [1, 2]), (2, 3), (3, 4), (4, 5))),
         ('s2', 's3', ((1, 1), (2, [2, 3]), (3, 4))),
         ('s3', 's2', ((1, 1), (2, 2), (3, 2), (4, 3))),
     )
     for source, target, pairs in expectations:
         for pos, expected in pairs:
             self.assertEqual(mapper.mapChr(source, target, pos),
                              (target, expected))

     # s2, s4 and s5 each hold three bases; every position is expected to
     # map onto the same position in the other two, in both directions.
     for left, right in itertools.combinations(('s2', 's4', 's5'), 2):
         for pos in (1, 2, 3):
             self.assertEqual(mapper.mapChr(left, right, pos), (right, pos))
             self.assertEqual(mapper.mapChr(right, left, pos), (left, pos))
Example #3
0
 def test_one_real_base(self):
     # 'AC-' and '-CA' have a base in both samples in the middle column only.
     aligned = makeTempFasta([('s1', 'AC-'), ('s2', '-CA'),])
     mapper = interhost.CoordMapper()
     mapper.load_alignments([aligned])

     # (source, target, source position, expected target position)
     checks = (
         ('s1', 's2', 1, None),
         ('s1', 's2', 2, 1),
         ('s2', 's1', 1, 2),
         ('s2', 's1', 2, None),
     )
     for source, target, pos, expected in checks:
         self.assertEqual(mapper.mapChr(source, target, pos),
                          (target, expected))
Example #4
0
 def test_multiple_input_genomes(self):
     # Three single-chromosome genomes, each naming its chromosome differently.
     records = (
         ('chr1', 'ATGCACGTACGTATGCAAATCGG'),
         ('first_chr', 'ATGCACTACGTATGCAAATCGG'),
         ('chr_one', 'ATGCACGTACGTATGCAATCGG'),
     )
     genomes = [makeTempFasta([record]) for record in records]
     mapper = interhost.CoordMapper()
     mapper.align_and_load_sequences(genomes)
     # without a position, mapChr is expected to return just the target name
     self.assertEqual(mapper.mapChr('chr1', 'chr_one'), 'chr_one')
Example #5
0
 def test_unequal_len(self):
     # The two rows differ in length (2 vs 1 characters); loading such an
     # alignment is expected to raise.
     ragged_records = [('s1', 'AA'), ('s2', 'A')]
     ragged = makeTempFasta(ragged_records)
     mapper = interhost.CoordMapper()
     with self.assertRaises(Exception):
         mapper.load_alignments([ragged])
Example #6
0
 def test_basic_alignment(self):
     # Smoke test: it makes no assertions and passes as long as loading a
     # three-sample alignment does not raise.
     records = [('s1', 'ATCG'), ('s2', 'ACCG'), ('s3', 'AG-T')]
     aligned = makeTempFasta(records)
     mapper = interhost.CoordMapper()
     mapper.load_alignments([aligned])
Example #7
0
 def test_single_chr_error(self):
     # genomeA holds one chromosome while genomeB is a FASTA with no records;
     # aligning and loading the pair is expected to raise.
     genomeA = makeTempFasta([
         ('chr1', 'ATGCACGTACGTATGCAAATCGG'),
     ])
     genomeB = makeTempFasta([])
     # Construct the mapper outside the assertRaises block so that only the
     # call under test can satisfy the expected exception.
     cm = interhost.CoordMapper()
     with self.assertRaises(Exception):
         cm.align_and_load_sequences([genomeA, genomeB])
Example #8
0
def tbl_transfer(ref_fasta, ref_tbl, alt_fasta, out_tbl, oob_clip=False):
    ''' Transfer the features (genes, etc) described by an NCBI TBL file
        from one genome onto a new genome.

        ref_fasta and alt_fasta are aligned and loaded into a CoordMapper;
        that mapper, the chromosome lengths of alt_fasta, ref_tbl, out_tbl
        and oob_clip are then handed to tbl_transfer_common.
    '''
    mapper = interhost.CoordMapper()
    mapper.align_and_load_sequences([ref_fasta, alt_fasta])

    tbl_transfer_common(mapper, ref_tbl, out_tbl, fasta_chrlens(alt_fasta),
                        oob_clip)
Example #9
0
 def test_unequal_genomes_error(self):
     # genomeA has two chromosomes but genomeB has only one; aligning and
     # loading the pair is expected to raise.
     genomeA = makeTempFasta([
         ('chr1', 'ATGCACGTACGTATGCAAATCGG'),
         ('chr2', 'AGTCGGTTTTCAG'),
     ])
     genomeB = makeTempFasta([('first_chrom', 'GCACGTACGTATTTGCAAATC')])
     # Construct the mapper outside the assertRaises block so that only the
     # call under test can satisfy the expected exception.
     cm = interhost.CoordMapper()
     with self.assertRaises(Exception):
         cm.align_and_load_sequences([genomeA, genomeB])
Example #10
0
 def test_exactly_two_pairs(self):
     # s1 ('A--T') has a two-column gap relative to s2 ('AGGT').
     aligned = makeTempFasta([('s1', 'A--T'), ('s2', 'AGGT'),])
     mapper = interhost.CoordMapper()
     mapper.load_alignments([aligned])

     # s1 -> s2: position 1 is expected to map to the list [1, 3]
     for pos, expected in ((1, [1, 3]), (2, 4)):
         self.assertEqual(mapper.mapChr('s1', 's2', pos), ('s2', expected))

     # s2 -> s1: positions 1 through 3 are all expected to map to position 1
     for pos, expected in ((1, 1), (2, 1), (3, 1), (4, 2)):
         self.assertEqual(mapper.mapChr('s2', 's1', pos), ('s1', expected))
Example #11
0
 def test_no_real_bases_at_position(self):
     # The third alignment column is a gap in every sample, so each sample
     # has three bases and every position is expected to map to itself.
     aligned = makeTempFasta([('s1', 'AT-G'), ('s2', 'AC-G'), ('s3', 'AG-T'),])
     mapper = interhost.CoordMapper()
     mapper.load_alignments([aligned])

     sample_pairs = (('s1', 's2'), ('s1', 's3'), ('s2', 's3'))
     for pos in (1, 2, 3):
         for left, right in sample_pairs:
             self.assertEqual(mapper.mapChr(left, right, pos), (right, pos))
             self.assertEqual(mapper.mapChr(right, left, pos), (left, pos))
Example #12
0
 def setUp(self):
     # Fixture: two genomes of two chromosomes each, aligned and loaded into
     # a CoordMapper that is exposed to the tests as self.cm.
     super(TestCoordMapper, self).setUp()

     chroms_a = [
         ('chr1', 'ATGCACGTACGTATGCAAATCGG'),
         ('chr2', 'AGTCGGTTTTCAG'),
     ]
     chroms_b = [
         ('first_chrom', 'GCACGTACGTATTTGCAAATC'),
         ('second_chr', 'AGTCGGTTTCCAC'),
     ]
     self.genomeA = makeTempFasta(chroms_a)
     self.genomeB = makeTempFasta(chroms_b)

     self.cm = interhost.CoordMapper()
     genome_files = [self.genomeA, self.genomeB]
     self.cm.align_and_load_sequences(genome_files)
Example #13
0
 def test_aligned_gaps(self):
     # s2 and s3 have a gap in the same column; s1 has a base there.
     aligned = makeTempFasta([('s1', 'ATCG'), ('s2', 'AC-G'), ('s3', 'AG-T'),])
     mapper = interhost.CoordMapper()
     mapper.load_alignments([aligned])

     # the two gapped samples are expected to map onto each other one-to-one
     for pos in (1, 2, 3):
         self.assertEqual(mapper.mapChr('s2', 's3', pos), ('s3', pos))
         self.assertEqual(mapper.mapChr('s3', 's2', pos), ('s2', pos))

     # from s1 into either gapped sample
     for pos, expected in ((1, 1), (2, 2), (3, 2), (4, 3)):
         for target in ('s2', 's3'):
             self.assertEqual(mapper.mapChr('s1', target, pos),
                              (target, expected))

     # from either gapped sample back into s1
     for pos, expected in ((1, 1), (2, [2, 3]), (3, 4)):
         for source in ('s2', 's3'):
             self.assertEqual(mapper.mapChr(source, 's1', pos),
                              ('s1', expected))
Example #14
0
def tbl_transfer_prealigned(inputFasta,
                            refFasta,
                            refAnnotTblFiles,
                            outputDir,
                            oob_clip=False):
    """
        Transfer a reference feature table onto every non-reference sequence
        of a pre-computed multialignment.

        inputFasta is expected to contain a multialignment of a single
        segment/chromosome, including the reference sequence for that
        segment/chromosome. refFasta contains all reference
        segments/chromosomes and is used only to recognise, by sequence ID,
        which record of inputFasta is the reference (the first record whose ID
        appears in refFasta). refAnnotTblFiles is a list of reference feature
        table files; the first one whose ID, as returned by
        util.genbank.get_feature_table_id, is a substring of the reference
        sequence ID is used as the annotation source.

        outputDir is created if it does not exist. For each non-reference
        sequence in inputFasta, the aligned reference and that sequence are
        written to a temporary two-record fasta, which is loaded into a
        CoordMapper and passed to tbl_transfer_common together with the output
        path <outputDir>/<sequence ID>.tbl and oob_clip.

        Raises KeyError if no record of inputFasta matches an ID in refFasta,
        or if none of refAnnotTblFiles matches the reference sequence.
    """

    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    # Read the reference IDs once, rather than re-opening and re-parsing
    # refFasta for every record of the multialignment.
    with util.file.open_or_gzopen(refFasta, 'r') as reff:
        ref_ids = set(refSeq.id for refSeq in Bio.SeqIO.parse(reff, 'fasta'))

    # identify which of the sequences in the multialignment is the reference,
    # matching by ID to one of the sequences in the refFasta
    matchingRefSeq = None
    with util.file.open_or_gzopen(inputFasta, 'r') as inf:
        for seq in Bio.SeqIO.parse(inf, 'fasta'):
            if seq.id in ref_ids:
                matchingRefSeq = seq
                break

    if matchingRefSeq is None:
        raise KeyError("No reference was found in the input file %s" %
                       (inputFasta))

    # identify the correct feature table source based on its filename,
    # which should correspond to a unique component of the ref sequence ID
    # (i.e. the genbank accession)
    ref_tbl = ""  # must be identified in list of tables
    for tblFilename in refAnnotTblFiles:
        fileAccession = util.genbank.get_feature_table_id(tblFilename)
        if fileAccession in matchingRefSeq.id:
            ref_tbl = tblFilename
            break
    if ref_tbl == "":
        raise KeyError("No reference table was found for the reference %s" %
                       (matchingRefSeq.id))

    # The reference is the same for every pairing, so measure its gap-free
    # length once. Gaps are stripped from the plain string rather than via
    # Seq.ungap, which (NOTE(review): confirm) is deprecated in newer
    # Biopython releases; the resulting length is identical.
    ref_ungapped_len = len(str(matchingRefSeq.seq).replace("-", ""))

    # write out the desired sequences to separate fasta files
    with util.file.open_or_gzopen(inputFasta, 'r') as inf:
        for seq in Bio.SeqIO.parse(inf, 'fasta'):
            # the reference itself gets no transferred table
            if seq.id == matchingRefSeq.id:
                continue

            # write ref and alt sequences to a combined fasta file, sourced
            # from the alignment so gaps are present for the CoordMapper
            # instance, cmap
            combined_fasta_filename = util.file.mkstempfname('.fasta')
            with open(combined_fasta_filename, 'wt') as outf:
                Bio.SeqIO.write([matchingRefSeq, seq], outf, "fasta")

            # create a filepath for the output table
            out_tbl = os.path.join(outputDir, seq.id + ".tbl")

            cmap = interhost.CoordMapper()
            cmap.load_alignments([combined_fasta_filename])

            # lengths here must NOT include gaps, since alt_chrlens is only
            # used in the case where features would extend beyond the genome
            # (for reporting >{seq.len})
            alt_chrlens = {
                seq.id: len(str(seq.seq).replace("-", "")),
                matchingRefSeq.id: ref_ungapped_len,
            }

            tbl_transfer_common(cmap, ref_tbl, out_tbl, alt_chrlens, oob_clip)