Beispiel #1
0
def test_mutate_indel():
    """Exercise mutate_indel with small insertions and a deletion.

    Every case starts from the reference 'CCTGGTGCTC' and a Boundary
    covering chr1:0-10, and compares the mutated sequence (as a string)
    with a hard-coded expectation.
    """

    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import generic_dna
    from common_object import Boundary, Variant

    def check(variant, expected):
        # Each case gets its own sequence record and its own region.
        record = SeqRecord(Seq('CCTGGTGCTC', generic_dna))
        region = Boundary('chr1', 0, 10)
        region.variants.append(variant)

        mutated = mutate_indel(region, variant, record)

        assert str(mutated) == expected

    # Insertion at 2-3: ref 'T', alt 'TAC'.
    check(Variant('chr1', 2, 3, 'indel', 'ins', 'T', 'TAC'),
          'CCTACGGTGCTC')

    # Variant coordinates 12-13 with a 0-10 region: the expected output
    # equals the input sequence.
    check(Variant('chr1', 12, 13, 'indel', 'ins', 'T', 'TAC'),
          'CCTGGTGCTC')

    # Alt given as a list of alleles: the expected output matches the
    # single-allele 'TAC' insertion above.
    check(Variant('chr1', 2, 2, 'indel', 'ins', 'T', ['TAC', 'A']),
          'CCTACGGTGCTC')

    # Deletion: ref 'TG', alt 'T'.
    check(Variant('chr1', 2, 3, 'indel', 'del', 'TG', 'T'),
          'CCTGTGCTC')
Beispiel #2
0
def test_mutate_sv():
    """Exercise mutate_sv with DEL, INV, DUP and CNV structural variants.

    Every case starts from the reference 'CCTGGTGCTC' and a Boundary
    covering chr1:0-10, and compares the mutated sequence (as a string)
    with a hard-coded expectation.
    """
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import generic_dna
    from common_object import Boundary, Variant

    def check(variant, expected):
        # Each case gets its own sequence record and its own region.
        record = SeqRecord(Seq('CCTGGTGCTC', generic_dna))
        region = Boundary('chr1', 0, 10)
        region.variants.append(variant)

        mutated = mutate_sv(region, variant, record)

        assert str(mutated) == expected

    # Deletion of 1-5.
    # seq = CCTGGTGCTC, NNCTGCTCNN
    check(Variant('chr1', 1, 5, 'sv', 'DEL', '', ''), 'CTGCTC')

    # Inversion of 0-6: the first six bases come back in reverse order.
    check(Variant('chr1', 0, 6, 'sv', 'INV', '', ''), 'TGGTCCGCTC')

    # Duplication of 0-3: 'CCT' appears twice in the expected output.
    check(Variant('chr1', 0, 3, 'sv', 'DUP', '', ''), 'CCTCCTGGTGCTC')

    # Copy-number variant of 0-3 with alt '[3,3]': 'CCT' appears three
    # times in the expected output.
    check(Variant('chr1', 0, 3, 'sv', 'CNV', '', alt='[3,3]'),
          'CCTCCTCCTGGTGCTC')
Beispiel #3
0
def get_ssm_bedfile(ssm_file: str, cohort: str = 'Eric_CLL'):
    """Obtains and parses the simple somatic mutation bed file

    Each line is split on runs of tabs/newlines and read positionally:
    chromosome, start, end, sample id, ref allele, alt allele, cohort.
    Only records whose cohort column equals ``cohort`` (with the
    ``'Eric_'`` prefix removed) are kept. Every kept record becomes a
    ``Variant`` of type ``'snp'`` with genotype ``'1|1'``.

    Parameters
    ----------
    ssm_file : the simple somatic mutation file in a bed file format
    cohort : the cohort label to keep; a leading 'Eric_' is stripped
        before it is compared with the cohort column

    Returns
    -------
    variants: a dictionary mapping sample id to a list of Variant objects;
        empty when no record matches the cohort

    """
    cohort = cohort.replace('Eric_', '')

    variants = {}
    count = 0
    with open(ssm_file) as fi:
        # Stream the file instead of loading every line into memory.
        for ln in fi:
            # Blank lines (e.g. a trailing newline at end of file) carry
            # no record and would otherwise fail on the column lookup.
            if not ln.strip():
                continue

            # NOTE: runs of consecutive tabs collapse into one separator,
            # so an empty column shifts the following ones to the left.
            st = re.split('[\t\n]+', ln)

            if st[6] != cohort:
                continue

            chrom = st[0]
            start = int(st[1])
            end = int(st[2])
            sample_id = st[3]
            ref = st[4]
            alt = st[5]

            var = Variant(chrom, start, end, 'snp', None, ref, alt, '1|1')
            variants.setdefault(sample_id, []).append(var)
            count += 1

    # Guard the average: no matching record means no samples at all.
    per_sample = float(count) / len(variants) if variants else 0.0
    print('number of samples:{}, number of mutation:{}, mutations/sample:{}'.
          format(len(variants), count, per_sample))

    return variants
Beispiel #4
0
def test_mutate_snp():
    """Exercise mutate_snp with a one-base and a two-base replacement.

    Both cases pass the same SeqRecord for 'CCTGGTGCTC'; each builds its
    own Boundary covering chr1:0-10.
    """

    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import generic_dna
    from common_object import Boundary, Variant

    # One record is shared by both cases below.
    seq = SeqRecord(Seq('CCTGGTGCTC', generic_dna))

    def check(variant, expected):
        region = Boundary('chr1', 0, 10)
        region.variants.append(variant)

        mutated = mutate_snp(region, variant, seq)

        assert str(mutated) == expected

    # Position 2: 'T' replaced by 'A'.
    check(Variant('chr1', 2, 2, 'snp', '', 'T', 'A'), 'CCAGGTGCTC')

    # Position 1: 'C' replaced by the two-base alt 'AT'; the expected
    # output is one base longer than the input.
    check(Variant('chr1', 1, 1, 'snp', '', 'C', 'AT'), 'CATTGGTGCTC')
Beispiel #5
0
def getSSM_bedfile(ssm_file, cohort='Eric_CLL'):
    """Parse a simple-somatic-mutation BED file into per-sample variants.

    Each line is split on runs of tabs/newlines and read positionally:
    chromosome, start, end, sample id, ref allele, alt allele, cohort.
    Only records whose cohort column equals ``cohort`` (with the
    ``'Eric_'`` prefix removed) are kept. Every kept record becomes a
    ``Variant`` of type ``'snp'`` with genotype ``'1|1'``.

    :param ssm_file: path to the BED-like mutation file
    :param cohort: cohort label to keep; a leading 'Eric_' is stripped
        before it is compared with the cohort column
    :return: dict mapping sample id to a list of Variant objects; empty
        when no record matches the cohort
    """
    cohort = cohort.replace('Eric_', '')

    variants = {}
    count = 0
    with open(ssm_file) as fi:
        # Stream the file instead of loading every line into memory.
        for ln in fi:
            # Blank lines (e.g. a trailing newline at end of file) carry
            # no record and would otherwise fail on the column lookup.
            if not ln.strip():
                continue

            # NOTE: runs of consecutive tabs collapse into one separator,
            # so an empty column shifts the following ones to the left.
            st = re.split('[\t\n]+', ln)

            if st[6] != cohort:
                continue

            chrom = st[0]
            start = int(st[1])
            end = int(st[2])
            sample_id = st[3]
            ref = st[4]
            alt = st[5]

            var = Variant(chrom, start, end, 'snp', None, ref, alt, '1|1')
            variants.setdefault(sample_id, []).append(var)
            count += 1

    # Guard the average: no matching record means no samples at all.
    per_sample = float(count) / len(variants) if variants else 0.0
    print('number of samples:{}, number of mutation:{}, mutations/sample:{}'.format(len(variants), count,
                                                                                    per_sample))
    return variants
Beispiel #6
0
def getSV(stvm_file):
    """Parse a tab-separated structural-variant file into per-sample variants.

    The first line is a header; the columns 'sv_id', 'variant_type',
    'chr_from', 'chr_to', 'icgc_sample_id', 'chr_from_bkpt' and
    'chr_to_bkpt' are located by name. Each 'sv_id' is processed once.
    Records whose two chromosomes differ, or whose chromosome does not
    start with a digit, X or Y, are skipped.

    :param stvm_file: path to the tab-separated structural-variant file
    :return: dict mapping sample id to a list of Variant objects
    :raises KeyError: if a required column is missing from the header
    """
    variants = {}
    processed_sv = set()

    # Long variant_type labels -> short SV codes stored on the Variant.
    sv_codes = {
        'deletion': 'DEL',
        'inversion': 'INV',
        'tandem duplication': 'DUP',
    }

    with open(stvm_file, 'r') as fin:
        # Strip the line ending first so the last column name stays usable.
        header = fin.readline().rstrip('\r\n').split('\t')
        field2Id = {name: idx for idx, name in enumerate(header)}

        sv_header_id = field2Id['sv_id']
        variant_type_id = field2Id['variant_type']
        chr_from_id = field2Id['chr_from']
        chr_to_id = field2Id['chr_to']
        icgc_sample_id = field2Id['icgc_sample_id']
        chr_from_bkpt_id = field2Id['chr_from_bkpt']
        chr_to_bkpt_id = field2Id['chr_to_bkpt']

        for ln in fin:
            # Blank lines (e.g. at end of file) carry no record.
            if not ln.strip():
                continue

            st = ln.rstrip('\r\n').split('\t')

            sv_id = st[sv_header_id]
            if sv_id in processed_sv:
                continue
            processed_sv.add(sv_id)

            svtype = st[variant_type_id]

            chrid_from = st[chr_from_id].upper()
            chrid_to = st[chr_to_id].upper()

            # ignore inter-chromosome SV and non-standard chromosome names
            if (chrid_from != chrid_to) or (not re.search('^[0-9XY]+', chrid_from)) \
                    or (not re.search('^[0-9XY]+', chrid_to)):
                continue

            sample_id = st[icgc_sample_id]

            chrom = 'chr' + chrid_from
            # The 'from' breakpoint is shifted down by one; the 'to'
            # breakpoint is used as given.
            start = int(st[chr_from_bkpt_id]) - 1
            end = int(st[chr_to_bkpt_id])

            if svtype in sv_codes:
                svtype = sv_codes[svtype]
            else:
                # Unknown types are reported but still recorded with
                # their original label.
                sys.stderr.write('wrong svtype:{}\n'.format(svtype))

            var = Variant(chrom, start, end, 'sv', svtype, '', '', '1|1')
            variants.setdefault(sample_id, []).append(var)

    # Report consecutive records of one sample sharing the same coordinates.
    for sample, sample_vars in variants.items():
        for cur, nxt in zip(sample_vars, sample_vars[1:]):
            if cur.chrid == nxt.chrid and cur.start == nxt.start \
                    and cur.end == nxt.end:
                print('duplicate variant, sample:{}, variants:{}'.format(sample, str(cur)))

    print('Number of SV samples:', len(variants))
    return variants
Beispiel #7
0
def getSSM(ssm_file):
    """Parse a tab-separated simple-somatic-mutation file into variants.

    The first line is a header; columns are located by name. The alternate
    alleles come from 'tumour_genotype' when that column exists, otherwise
    from 'mutated_to_allele'. Each 'icgc_mutation_id' is processed once.
    Records whose chromosome does not start with a digit, X or Y are
    printed and skipped.

    :param ssm_file: path to the tab-separated mutation file
    :return: dict mapping sample id to a list of Variant objects
    :raises KeyError: if a required column is missing from the header
    """
    variants = {}  # icgc_sample_id: variants
    processed_mut = set()  # processed mutations, to handle duplicate records

    with open(ssm_file, 'r') as fin:
        # Strip the line ending first so the last column name stays usable.
        fields = fin.readline().rstrip('\r\n').split('\t')
        field2Id = {name: idx for idx, name in enumerate(fields)}

        icgc_mutation_id = field2Id['icgc_mutation_id']
        chromosome_id = field2Id['chromosome']
        chromosome_start_id = field2Id['chromosome_start']
        chromosome_end_id = field2Id['chromosome_end']
        mutation_type_id = field2Id['mutation_type']
        reference_genome_allele_id = field2Id['reference_genome_allele']

        # Resolve the allele column once instead of on every row.
        if 'tumour_genotype' in field2Id:
            allele_id = field2Id['tumour_genotype']
        else:
            allele_id = field2Id['mutated_to_allele']

        icgc_sample_id = field2Id['icgc_sample_id']

        for ln in fin:
            # Blank lines (e.g. at end of file) carry no record.
            if not ln.strip():
                continue

            st = ln.rstrip('\r\n').split('\t')

            mut_id = st[icgc_mutation_id]
            if mut_id in processed_mut:
                continue
            processed_mut.add(mut_id)

            chrid = str(st[chromosome_id]).upper()
            if not re.search('^[0-9XY]+', chrid):
                print(chrid)
                continue

            # The start coordinate is shifted down by one; the end
            # coordinate is used as given.
            start = int(st[chromosome_start_id]) - 1
            end = int(st[chromosome_end_id])
            vt = st[mutation_type_id]
            ref = st[reference_genome_allele_id]
            tumor_gt = st[allele_id]
            sample_id = st[icgc_sample_id]

            # Mutation is applied by replacing ref with alt:
            #   alt == '' in an insertion means nothing is inserted
            #   (ref is also ''); in a deletion it means ref is removed.
            # For insertions ref is given as '-', which is converted to ''.
            ref = '' if ref == '-' else ref

            # Reset per row so an unrecognized type can neither reuse the
            # previous row's subtype nor hit an unbound name.
            svtype = ''
            if 'substitution' in vt:
                vt = 'snp'
            elif 'deletion' in vt:
                vt = 'indel'
                svtype = 'del'
            elif 'insertion' in vt:
                vt = 'indel'
                svtype = 'ins'
            else:
                print('warning, unrecognized mutation type:{}'.format(vt))

            # Alternate alleles: split the genotype on '|' or '/' and
            # convert '-' to the empty string.
            alt = ['' if x == '-' else x for x in re.split('[|/]', tumor_gt)]

            if len(alt) == 1:  # only one allele: make another from it
                alt.append(alt[0])

            # Genotype indices are 1-based positions in alt; index 0 is
            # reserved for the reference sequence.
            gt = '|'.join(str(alt.index(x) + 1) for x in alt)

            if vt == 'snp' and ref == '':
                print('error, snp but ref. is not available')

            var = Variant('chr' + chrid, start, end, vt, svtype, ref, alt, gt)
            variants.setdefault(sample_id, []).append(var)

    print('Number of sample:{}, number of variants{}'.format(len(variants), len(processed_mut)))
    return variants
Beispiel #8
0
def get_sv(stvm_file: str) -> Dict:
    """Obtains and parses the structural variant file from ICGC

    The first line is a header; the columns 'sv_id', 'variant_type',
    'chr_from', 'chr_to', 'icgc_sample_id', 'chr_from_bkpt' and
    'chr_to_bkpt' are located by name. Each 'sv_id' is processed once.
    Records whose two chromosomes differ, or whose chromosome does not
    start with a digit, X or Y, are skipped.

    Parameters
    ----------
    stvm_file : the structural variant file from ICGC in a tsv file format

    Returns
    -------
    variants: a dictionary of list of Variant objects, keyed by sample id

    Raises
    ------
    KeyError: if a required column is missing from the header

    """

    variants = {}
    processed_sv = set()

    # Long variant_type labels -> short SV codes stored on the Variant.
    sv_codes = {
        'deletion': 'DEL',
        'inversion': 'INV',
        'tandem duplication': 'DUP',
    }

    with open(stvm_file, 'r') as fin:
        # Strip the line ending first so the last column name stays usable.
        fields = fin.readline().rstrip('\r\n').split('\t')
        field2_id = {field: i for i, field in enumerate(fields)}

        sv_header_id = field2_id['sv_id']
        variant_type_id = field2_id['variant_type']
        chr_from_id = field2_id['chr_from']
        chr_to_id = field2_id['chr_to']
        icgc_sample_id = field2_id['icgc_sample_id']
        chr_from_bkpt_id = field2_id['chr_from_bkpt']
        chr_to_bkpt_id = field2_id['chr_to_bkpt']

        for ln in fin:
            # Blank lines (e.g. at end of file) carry no record.
            if not ln.strip():
                continue

            st = ln.rstrip('\r\n').split('\t')

            sv_id = st[sv_header_id]
            if sv_id in processed_sv:
                continue
            processed_sv.add(sv_id)

            svtype = st[variant_type_id]

            chrid_from = st[chr_from_id].upper()
            chrid_to = st[chr_to_id].upper()

            # ignore inter-chromosome SV and non-standard chromosome names
            if (chrid_from != chrid_to) or (
                    not re.search('^[0-9XY]+', chrid_from)) or \
                    (not re.search('^[0-9XY]+', chrid_to)):
                continue

            sample_id = st[icgc_sample_id]

            chrom = 'chr' + chrid_from
            # The 'from' breakpoint is shifted down by one; the 'to'
            # breakpoint is used as given.
            start = int(st[chr_from_bkpt_id]) - 1
            end = int(st[chr_to_bkpt_id])

            if svtype in sv_codes:
                svtype = sv_codes[svtype]
            else:
                # Unknown types are reported but still recorded with
                # their original label.
                sys.stderr.write('wrong svtype:{}\n'.format(svtype))

            var = Variant(chrom, start, end, 'sv', svtype, '', '', '1|1')
            variants.setdefault(sample_id, []).append(var)

    # Report consecutive records of one sample sharing the same coordinates.
    for sample, varts in variants.items():
        for cur, nxt in zip(varts, varts[1:]):
            if cur.chrid == nxt.chrid and \
                    cur.start == nxt.start and \
                    cur.end == nxt.end:
                print('duplicate variant, sample:{}, variants:{}'.format(
                    sample, str(cur)))

    print('Number of SV samples:', len(variants))
    return variants