def test_mutate_indel():
    """Check ``mutate_indel`` on three insertions and one deletion.

    Every scenario starts from a fresh ``CCTGGTGCTC`` record and a fresh
    ``Boundary('chr1', 0, 10)`` region, registers the variant on the region,
    applies it and compares the resulting sequence text.
    """
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import generic_dna
    from common_object import Boundary, Variant

    def run_case(variant_args, expected):
        # Build the inputs from scratch so no state leaks between scenarios.
        record = SeqRecord(Seq('CCTGGTGCTC', generic_dna))
        region = Boundary('chr1', 0, 10)
        variant = Variant(*variant_args)
        region.variants.append(variant)
        mutated = mutate_indel(region, variant, record)
        assert (str(mutated) == expected)

    # Insertion inside the region: 'T' -> 'TAC'.
    run_case(('chr1', 2, 3, 'indel', 'ins', 'T', 'TAC'), 'CCTACGGTGCTC')

    # Insertion located at 12-13, beyond the 0-10 region: the expected
    # sequence is identical to the input.
    run_case(('chr1', 12, 13, 'indel', 'ins', 'T', 'TAC'), 'CCTGGTGCTC')

    # Alternative given as a list of alleles: the expected sequence matches
    # the first allele ('TAC').
    run_case(('chr1', 2, 2, 'indel', 'ins', 'T', ['TAC', 'A']),
             'CCTACGGTGCTC')

    # Deletion: 'TG' -> 'T'.
    run_case(('chr1', 2, 3, 'indel', 'del', 'TG', 'T'), 'CCTGTGCTC')
def test_mutate_sv():
    """Check ``mutate_sv`` for DEL, INV, DUP and CNV structural variants.

    Every scenario starts from a fresh ``CCTGGTGCTC`` record and a fresh
    ``Boundary('chr1', 0, 10)`` region, registers the variant on the region,
    applies it and compares the resulting sequence text.
    """
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import generic_dna
    from common_object import Boundary, Variant

    def run_case(expected, *variant_args, **variant_kwargs):
        # Build the inputs from scratch so no state leaks between scenarios.
        record = SeqRecord(Seq('CCTGGTGCTC', generic_dna))
        region = Boundary('chr1', 0, 10)
        variant = Variant(*variant_args, **variant_kwargs)
        region.variants.append(variant)
        mutated = mutate_sv(region, variant, record)
        assert (str(mutated) == expected)

    # Deletion of 1-5: CCTGGTGCTC -> CTGCTC.
    run_case('CTGCTC', 'chr1', 1, 5, 'sv', 'DEL', '', '')

    # Inversion of 0-6: the expected sequence has the first six bases in
    # reverse order.
    run_case('TGGTCCGCTC', 'chr1', 0, 6, 'sv', 'INV', '', '')

    # Duplication of 0-3: the expected sequence carries 'CCT' twice.
    run_case('CCTCCTGGTGCTC', 'chr1', 0, 3, 'sv', 'DUP', '', '')

    # Copy-number variant of 0-3 with alt '[3,3]': the expected sequence
    # carries 'CCT' three times.
    run_case('CCTCCTCCTGGTGCTC', 'chr1', 0, 3, 'sv', 'CNV', '', alt='[3,3]')
def get_ssm_bedfile(ssm_file: str, cohort: str = 'Eric_CLL'):
    """Obtains and parses the simple somatic mutation file bed file

    Each non-blank line is split on tabs/newlines and read positionally:
    column 0 chromosome, 1 start, 2 end, 3 sample id, 4 reference allele,
    5 alternative allele, 6 cohort. Only rows whose cohort column equals
    ``cohort`` (after removing the ``'Eric_'`` prefix) are kept. Every kept
    row becomes a ``Variant`` of type ``'snp'`` with genotype ``'1|1'``.

    A one-line summary (samples, mutations, mutations per sample) is
    printed before returning.

    Parameters
    ----------
    ssm_file : the simple somatic mutation file in a bed file format
    cohort : a field in the field labeled as cohort

    Returns
    -------
    variants: a dictionary of list of Variant objects, keyed by sample id;
        empty when no row matches the cohort
    """
    cohort = cohort.replace('Eric_', '')
    variants = {}
    count = 0

    with open(ssm_file) as fi:
        for ln in fi:
            # A blank (e.g. trailing) line has no cohort column to test.
            if not ln.strip():
                continue

            st = re.split('[\t\n]+', ln)
            if st[6] != cohort:
                continue

            chrom = st[0]
            start = int(st[1])
            end = int(st[2])
            sample_id = st[3]
            ref = st[4]
            alt = st[5]

            # Every record in this file is recorded as an SNP with
            # genotype '1|1'.
            var = Variant(chrom, start, end, 'snp', None, ref, alt, '1|1')
            variants.setdefault(sample_id, []).append(var)
            count += 1

    # Avoid dividing by zero when no row matched the requested cohort.
    per_sample = float(count) / len(variants) if variants else 0.0
    print('number of samples:{}, number of mutation:{}, mutations/sample:{}'.
          format(len(variants), count, per_sample))

    return variants
def test_mutate_snp():
    """Check ``mutate_snp`` on a one-base and a two-base substitution.

    A single ``CCTGGTGCTC`` record is shared by both scenarios; each
    scenario gets its own ``Boundary('chr1', 0, 10)`` region.
    """
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import generic_dna
    from common_object import Boundary, Variant

    record = SeqRecord(Seq('CCTGGTGCTC', generic_dna))

    scenarios = [
        # 'T' at position 2 replaced by 'A'.
        (('chr1', 2, 2, 'snp', '', 'T', 'A'), 'CCAGGTGCTC'),
        # 'C' at position 1 replaced by 'AT'; the expected text is derived
        # from the unmutated input even though ``record`` is reused.
        (('chr1', 1, 1, 'snp', '', 'C', 'AT'), 'CATTGGTGCTC'),
    ]

    for variant_args, expected in scenarios:
        region = Boundary('chr1', 0, 10)
        snp = Variant(*variant_args)
        region.variants.append(snp)
        mutated = mutate_snp(region, snp, record)
        assert str(mutated) == expected
def getSSM_bedfile(ssm_file, cohort='Eric_CLL'):
    """Parse a simple-somatic-mutation BED file into per-sample variants.

    Columns are read positionally after splitting each line on tabs and
    newlines: 0 chromosome, 1 start, 2 end, 3 sample id, 4 reference
    allele, 5 alternative allele, 6 cohort.

    Parameters
    ----------
    ssm_file : path of the BED-formatted mutation file
    cohort : cohort label to keep; a ``'Eric_'`` prefix is removed before
        it is compared with column 6

    Returns
    -------
    variants : dict mapping sample id to a list of ``Variant`` objects
        (type ``'snp'``, genotype ``'1|1'``); empty when nothing matches
    """
    wanted = cohort.replace('Eric_', '')
    variants = {}
    count = 0

    with open(ssm_file) as fi:
        for ln in fi:
            if not ln.strip():
                # Blank lines carry no columns; skip them instead of
                # failing on the cohort lookup below.
                continue

            st = re.split('[\t\n]+', ln)
            if st[6] != wanted:
                continue

            sample_id = st[3]
            if sample_id not in variants:
                variants[sample_id] = []

            var = Variant(st[0], int(st[1]), int(st[2]),
                          'snp', None, st[4], st[5], '1|1')
            variants[sample_id].append(var)
            count += 1

    # With no matching rows there are no samples to average over.
    if variants:
        ratio = float(count) / len(variants)
    else:
        ratio = 0.0
    print('number of samples:{}, number of mutation:{}, mutations/sample:{}'.format(len(variants), count, ratio))

    return variants
def getSV(stvm_file):
    """Parse a tab-separated structural-variant file into per-sample variants.

    The first line is a header; the columns ``sv_id``, ``variant_type``,
    ``chr_from``, ``chr_to``, ``icgc_sample_id``, ``chr_from_bkpt`` and
    ``chr_to_bkpt`` are located by name. For each data row:

    * rows whose ``sv_id`` was already seen are skipped;
    * rows whose two chromosomes differ, or whose chromosome does not start
      with a digit, ``X`` or ``Y``, are skipped;
    * ``start`` is ``chr_from_bkpt - 1`` and ``end`` is ``chr_to_bkpt``;
    * ``deletion`` / ``inversion`` / ``tandem duplication`` are mapped to
      ``DEL`` / ``INV`` / ``DUP``; any other type is reported on stderr and
      the variant is still recorded with the original type text.

    Adjacent variants of a sample with identical chromosome, start and end
    are reported on stdout, and the number of samples is printed.

    Parameters
    ----------
    stvm_file : path of the structural-variant TSV file

    Returns
    -------
    variants : dict mapping sample id to a list of ``Variant`` objects

    Raises
    ------
    KeyError : if a required column is missing from the header
    """
    sv_codes = {
        'deletion': 'DEL',
        'inversion': 'INV',
        'tandem duplication': 'DUP',
    }
    variants = {}
    processed_sv = set()

    with open(stvm_file, 'r') as fin:
        # Drop the line terminator first; otherwise the last column name
        # would be stored as e.g. 'chr_to_bkpt\n' and never be found.
        fields = re.split('\t', fin.readline().rstrip('\r\n'))
        field2Id = {name: idx for idx, name in enumerate(fields)}

        sv_header_id = field2Id['sv_id']
        variant_type_id = field2Id['variant_type']
        chr_from_id = field2Id['chr_from']
        chr_to_id = field2Id['chr_to']
        icgc_sample_id = field2Id['icgc_sample_id']
        chr_from_bkpt_id = field2Id['chr_from_bkpt']
        chr_to_bkpt_id = field2Id['chr_to_bkpt']

        for ln in fin:
            if not ln.strip():
                continue
            # Same reason as for the header: keep the newline out of the
            # last value of the row.
            st = re.split('\t', ln.rstrip('\r\n'))

            sv_id = st[sv_header_id]
            if sv_id in processed_sv:
                continue
            processed_sv.add(sv_id)

            svtype = st[variant_type_id]
            chrid_from = st[chr_from_id].upper()
            chrid_to = st[chr_to_id].upper()

            # ignore inter-chromosome SV
            if (chrid_from != chrid_to) or (not re.search('^[0-9XY]+', chrid_from)) \
                    or (not re.search('^[0-9XY]+', chrid_to)):
                continue

            sample_id = st[icgc_sample_id]
            chrom = 'chr' + chrid_from
            start = int(st[chr_from_bkpt_id]) - 1
            end = int(st[chr_to_bkpt_id])

            if svtype in sv_codes:
                svtype = sv_codes[svtype]
            else:
                # Unknown types are reported but the row is still kept.
                sys.stderr.write('wrong svtype:{}\n'.format(svtype))

            var = Variant(chrom, start, end, 'sv', svtype, '', '', '1|1')
            variants.setdefault(sample_id, []).append(var)

    # Report neighbouring records that describe the same interval.
    for sample, sample_vars in variants.items():
        for cur, nxt in zip(sample_vars, sample_vars[1:]):
            if cur.chrid == nxt.chrid and cur.start == nxt.start \
                    and cur.end == nxt.end:
                print('duplicate variant, sample:{}, variants:{}'.format(sample, str(cur)))

    print('Number of SV samples:', len(variants))
    return variants
def getSSM(ssm_file):
    """Parse a tab-separated simple-somatic-mutation file into variants.

    The first line is a header; the columns ``icgc_mutation_id``,
    ``chromosome``, ``chromosome_start``, ``chromosome_end``,
    ``mutation_type``, ``reference_genome_allele`` and ``icgc_sample_id``
    are located by name. The alternative allele(s) come from
    ``tumour_genotype`` when that column exists, otherwise from
    ``mutated_to_allele``.

    For each data row:

    * rows whose ``icgc_mutation_id`` was already seen are skipped;
    * rows whose chromosome does not start with a digit, ``X`` or ``Y`` are
      printed and skipped;
    * ``start`` is ``chromosome_start - 1`` and ``end`` is
      ``chromosome_end``;
    * a mutation type containing ``substitution`` becomes ``snp``;
      ``deletion`` / ``insertion`` become ``indel`` with sub-type ``del`` /
      ``ins``;
    * ``'-'`` in the reference or an alternative allele is converted to
      ``''``.

    Parameters
    ----------
    ssm_file : path of the mutation TSV file

    Returns
    -------
    variants : dict mapping sample id to a list of ``Variant`` objects

    Raises
    ------
    KeyError : if a required column is missing from the header
    """
    variants = {}  # icgc_sample_id: variants
    # NOTE(review): de-duplication is keyed on icgc_mutation_id alone, so a
    # mutation id seen once is skipped on every later row, including rows
    # that belong to a different sample -- confirm this is intended.
    processed_mut = set()

    with open(ssm_file, 'r') as fin:
        # Drop the line terminator first; otherwise the last column name
        # would be stored with a trailing '\n' and never be found.
        fields = re.split('\t', fin.readline().rstrip('\r\n'))
        field2Id = {name: idx for idx, name in enumerate(fields)}

        icgc_mutation_id = field2Id['icgc_mutation_id']
        chromosome_id = field2Id['chromosome']
        chromosome_start_id = field2Id['chromosome_start']
        chromosome_end_id = field2Id['chromosome_end']
        mutation_type_id = field2Id['mutation_type']
        reference_genome_allele_id = field2Id['reference_genome_allele']
        # Prefer the genotype column; fall back to the mutated allele.
        if 'tumour_genotype' in field2Id:
            allele_id = field2Id['tumour_genotype']
        else:
            allele_id = field2Id['mutated_to_allele']
        icgc_sample_id = field2Id['icgc_sample_id']

        for ln in fin:
            if not ln.strip():
                continue
            # Keep the newline out of the last value of the row.
            st = re.split('\t', ln.rstrip('\r\n'))

            mut_id = st[icgc_mutation_id]
            if mut_id in processed_mut:
                continue
            processed_mut.add(mut_id)

            chrid = str(st[chromosome_id]).upper()
            if not re.search('^[0-9XY]+', chrid):
                print(chrid)
                continue

            start = int(st[chromosome_start_id]) - 1
            end = int(st[chromosome_end_id])
            vt = st[mutation_type_id]
            ref = st[reference_genome_allele_id]
            tumor_gt = st[allele_id]
            sample_id = st[icgc_sample_id]

            # Mutating works by replacing ref. with alt.
            #   alt == '' in an insertion (ref == '') means no insertion;
            #   alt == '' in a deletion means deleting ref;
            #   in an insertion ref is always '-', which must be
            #   converted to ''.
            ref = '' if ref == '-' else ref

            # Default sub-type, so an unrecognised mutation type neither
            # raises NameError nor reuses the value of the previous row.
            svtype = ''
            if re.search('substitution', vt):
                vt = 'snp'
            elif re.search('deletion', vt):
                vt = 'indel'
                svtype = 'del'
            elif re.search('insertion', vt):
                vt = 'indel'
                svtype = 'ins'

            # Alternative alleles; '-' stands for an empty allele.
            alt = re.split('[|/]', tumor_gt)
            alt = [x if x != '-' else '' for x in alt]
            if len(alt) == 1:
                # Only one allele given: use it for both haplotypes.
                alt.append(alt[0])

            # Index 0 is reserved for the reference sequence, hence + 1.
            gt = '|'.join(str(alt.index(x) + 1) for x in alt)

            if vt == 'snp' and ref == '':
                print('error, snp but ref. is not available')

            var = Variant('chr' + chrid, start, end, vt, svtype, ref, alt, gt)
            variants.setdefault(sample_id, []).append(var)

    print('Number of sample:{}, number of variants{}'.format(len(variants), len(processed_mut)))
    return variants
def get_sv(stvm_file: str) -> Dict:
    """Obtains and parses the structural variant file from ICGC

    The first line is a header; the columns ``sv_id``, ``variant_type``,
    ``chr_from``, ``chr_to``, ``icgc_sample_id``, ``chr_from_bkpt`` and
    ``chr_to_bkpt`` are located by name. Rows with an already-seen
    ``sv_id``, with two different chromosomes, or with a chromosome that
    does not start with a digit, ``X`` or ``Y`` are skipped. ``start`` is
    ``chr_from_bkpt - 1`` and ``end`` is ``chr_to_bkpt``.

    ``deletion`` / ``inversion`` / ``tandem duplication`` are mapped to
    ``DEL`` / ``INV`` / ``DUP``; any other type is reported on stderr and
    the variant is still recorded with the original type text.

    Parameters
    ----------
    stvm_file : the structural variant file from ICGC in a tsv file format

    Returns
    -------
    variants: a dictionary of list of Variant objects, keyed by sample id

    Raises
    ------
    KeyError : if a required column is missing from the header
    """
    variants = {}
    processed_sv = set()

    with open(stvm_file, 'r') as fin:
        # Remove the line terminator before splitting; otherwise the last
        # column name keeps a trailing '\n' and its lookup fails.
        header = fin.readline().rstrip('\r\n')
        field2_id = {
            field: i for i, field in enumerate(re.split('\t', header))
        }

        sv_header_id = field2_id['sv_id']
        variant_type_id = field2_id['variant_type']
        chr_from_id = field2_id['chr_from']
        chr_to_id = field2_id['chr_to']
        icgc_sample_id = field2_id['icgc_sample_id']
        chr_from_bkpt_id = field2_id['chr_from_bkpt']
        chr_to_bkpt_id = field2_id['chr_to_bkpt']

        for ln in fin:
            if not ln.strip():
                continue
            # Keep the newline out of the last value of the row.
            st = re.split('\t', ln.rstrip('\r\n'))

            sv_id = st[sv_header_id]
            if sv_id in processed_sv:
                continue
            processed_sv.add(sv_id)

            svtype = st[variant_type_id]
            chrid_from = st[chr_from_id].upper()
            chrid_to = st[chr_to_id].upper()

            # ignore inter-chromosome SV
            if (chrid_from != chrid_to) or (
                    not re.search('^[0-9XY]+', chrid_from)) or \
                    (not re.search('^[0-9XY]+', chrid_to)):
                continue

            sample_id = st[icgc_sample_id]
            if sample_id not in variants:
                variants[sample_id] = []

            chrom = 'chr' + chrid_from
            start = int(st[chr_from_bkpt_id]) - 1
            end = int(st[chr_to_bkpt_id])

            if svtype == 'deletion':
                svtype = 'DEL'
            elif svtype == 'inversion':
                svtype = 'INV'
            elif svtype == 'tandem duplication':
                svtype = 'DUP'
            else:
                # Unknown types are reported but the row is still kept.
                sys.stderr.write('wrong svtype:{}\n'.format(svtype))

            ref = ''
            alt = ''
            gt = '1|1'
            vt = 'sv'

            var = Variant(chrom, start, end, vt, svtype, ref, alt, gt)
            variants[sample_id].append(var)

    # Report neighbouring records that describe the same interval.
    for sample, varts in variants.items():
        for current, following in zip(varts, varts[1:]):
            if current.chrid == following.chrid and \
                    current.start == following.start and \
                    current.end == following.end:
                print('duplicate variant, sample:{}, variants:{}'.format(
                    sample, str(current)))

    print('Number of SV samples:', len(variants))
    return variants