def MatchVcfPed(vcfs, peds):
    """Pair every variant table with the pedigree file sharing its base name.

    Base names are computed with ``utils.GetBaseName``. Each entry of *vcfs*
    is paired with the first entry of *peds* that has the same base name;
    entries without a partner are dropped. The matches are echoed to stdout.

    Args:
        vcfs: iterable of variant-table paths.
        peds: iterable of pedigree-file paths.

    Returns:
        A list of ``(vcf, ped)`` tuples, in the order of *vcfs*.
    """
    # Index the pedigrees once instead of rescanning them (and recomputing
    # their base names) for every vcf. setdefault keeps the FIRST pedigree
    # seen for a base name, which is what the first-match-wins scan produced.
    # Assumes GetBaseName returns a hashable value (a string) -- TODO confirm.
    ped_by_base = {}
    for ped in peds:
        ped_by_base.setdefault(utils.GetBaseName(ped), ped)

    res = []
    for vcf in vcfs:
        base = utils.GetBaseName(vcf)
        if base in ped_by_base:
            # Both sides of a match have the same base name by construction.
            print("%s %s" % (base, base))
            res.append((vcf, ped_by_base[base]))

    print("Finded vcf-ped pairs:")
    for v, p in res:
        print("%s %s" % (v, p))
    print('\n')
    return res
def OutputSHMsForVGenes(shm_df, output_config):
    """Plot and dump the SHMs of every sufficiently covered gene.

    Keys of *shm_df* are grouped by their ``segment`` attribute and by the
    base name of their ``gene_name`` (via ``utils.GetBaseName``). For each
    gene the values ``shm_df[key]`` are pooled, the largest ``gene_len`` is
    remembered and the number of contributing keys is counted. Genes with
    fewer than 10 contributing keys are skipped; for the others a plot is
    produced with ``OutputGeneSHMPlot``, the plot file is registered through
    ``output_config.AddSHMFileForSegment`` and the positions returned by the
    plot routine are written with ``OutputGeneSHMsToTxt`` to
    ``<segment dir>/<gene>.txt``.

    Args:
        shm_df: mapping-like object; iterating it yields keys exposing
            ``segment``, ``gene_name`` and ``gene_len``, and ``shm_df[key]``
            is an iterable of SHM entries.
        output_config: object providing ``GetSHMDirBySegment``,
            ``AddSHMFileForSegment`` and ``Log``.
    """
    shms_by_segment = dict()
    longest_gene = dict()
    aligned_count = dict()

    for record in shm_df:
        genes_of_segment = shms_by_segment.setdefault(record.segment, dict())
        gene = utils.GetBaseName(record.gene_name)
        if gene not in genes_of_segment:
            genes_of_segment[gene] = []
            # NOTE(review): these two tables are keyed by gene name only, so
            # a gene name showing up under a second segment resets the
            # length/count gathered under the first one -- confirm gene
            # names are unique across segments.
            longest_gene[gene] = 0
            aligned_count[gene] = 0
        genes_of_segment[gene].extend(shm_df[record])
        longest_gene[gene] = max(longest_gene[gene], record.gene_len)
        aligned_count[gene] += 1

    for segment, genes_of_segment in shms_by_segment.items():
        for gene, pooled_shms in genes_of_segment.items():
            if aligned_count[gene] >= 10:
                plot_path = os.path.join(
                    output_config.GetSHMDirBySegment(segment), gene)
                positions = OutputGeneSHMPlot(
                    pooled_shms, gene, longest_gene[gene],
                    aligned_count[gene], plot_path, output_config.Log())
                output_config.AddSHMFileForSegment(segment, plot_path)
                txt_path = os.path.join(
                    output_config.GetSHMDirBySegment(segment), gene) + '.txt'
                OutputGeneSHMsToTxt(positions, aligned_count[gene], txt_path)
def _CreateVJDicts(self):
    """Count V genes, J genes and V/J pairs found in ``self.vj_df``.

    For every row index ``0 .. len(self.vj_df) - 1`` the ``'V_hit'`` and
    ``'J_hit'`` entries are reduced to base names with
    ``utils.GetBaseName``. The method fills:

    * ``self.vj_dict`` -- ``(base_v, base_j)`` -> number of rows,
    * ``self.v_dict`` -- ``base_v`` -> number of rows,
    * ``self.j_dict`` -- ``base_j`` -> number of rows,
    * ``self.sorted_vs`` / ``self.sorted_js`` -- sorted key lists of the
      V and J tables.

    ``self.vj_df`` is presumably a pandas DataFrame; rows are addressed as
    ``self.vj_df[column][i]`` -- verify the index is 0..n-1 at the caller.
    """
    pair_counts = self.vj_dict = dict()
    v_counts = self.v_dict = dict()
    j_counts = self.j_dict = dict()

    for row in range(len(self.vj_df)):
        v_gene = utils.GetBaseName(self.vj_df['V_hit'][row])
        j_gene = utils.GetBaseName(self.vj_df['J_hit'][row])
        vj_pair = (v_gene, j_gene)
        pair_counts[vj_pair] = pair_counts.get(vj_pair, 0) + 1
        v_counts[v_gene] = v_counts.get(v_gene, 0) + 1
        j_counts[j_gene] = j_counts.get(j_gene, 0) + 1

    self.sorted_vs = sorted(v_counts)
    self.sorted_js = sorted(j_counts)
def OutputVJGenesMutability(shm_df, output_config):
    """Collect per-gene mutability values and hand them to the V and J writers.

    For every key of *shm_df* the mutability is ``len(shm_df[key])`` divided
    by the key's ``gene_len`` (as a float). Values are grouped by the base
    name of ``gene_name`` (``utils.GetBaseName``); keys whose
    ``is_variable()`` is true go to the V table, all others to the J table.
    Both tables are then passed to ``OutputGeneMutability`` together with
    ``output_config.v_mutability`` / ``output_config.j_mutability``, the
    labels ``'V'`` / ``'J'`` and ``output_config.Log()``.

    Args:
        shm_df: mapping-like object; iterating it yields keys exposing
            ``is_variable()``, ``gene_name`` and ``gene_len``.
        output_config: object providing ``v_mutability``, ``j_mutability``
            and ``Log``.
    """
    mutability_of_v = dict()
    mutability_of_j = dict()

    for record in shm_df:
        table = mutability_of_v if record.is_variable() else mutability_of_j
        gene = utils.GetBaseName(record.gene_name)
        shm_fraction = float(len(shm_df[record])) / record.gene_len
        table.setdefault(gene, []).append(shm_fraction)

    OutputGeneMutability(mutability_of_v, output_config.v_mutability, 'V',
                         output_config.Log())
    OutputGeneMutability(mutability_of_j, output_config.j_mutability, 'J',
                         output_config.Log())
def _SplitVariantsByModel(vcf, Ped, out_names):
    """Stream one variant table into the four per-model output files.

    Args:
        vcf: path of the tab-separated variant table; its first line is the
            header and must contain the columns ``Annotation``,
            ``GeneticModels`` and ``FORMAT`` (``list.index`` raises
            ``ValueError`` otherwise).
        Ped: pedigree object forwarded to ``selectAR``, ``selectAD_DN`` and
            ``LookCompoundHeter``.
        out_names: the four output paths, in the order AR, AD, XL, CH.

    Every handle opened here is closed on exit, including when parsing or
    one of the selection helpers raises.
    """
    titles = ('Autsomal Recessive Variants\n',
              'Autsomal Dominant Variants\n',
              'X-linked Variants\n',
              'Compound Heterozygote\n')
    outputs = []
    try:
        # Open the outputs one by one so that a failure half-way through
        # still closes the ones already opened.
        for out_name, title in zip(out_names, titles):
            handle = open(out_name, 'wb')
            outputs.append(handle)
            handle.write(title)
        fout_AR, fout_AD, fout_XL, fout_CH = outputs

        with open(vcf, 'rb') as fin:
            header = fin.readline()
            for handle in outputs:
                handle.write(header)
            headerList = header.strip().split('\t')
            RecordLen = len(headerList)
            Gene_idx = headerList.index('Annotation')
            Model_idx = headerList.index('GeneticModels')
            Format_idx = headerList.index('FORMAT')

            CH_buffer = []
            LastGene = None
            for l in fin:
                llist = l.strip().split('\t')
                Gene = llist[Gene_idx]
                # Only the part after the last ':' is inspected for models.
                Model = llist[Model_idx].split(':')[-1]
                if 'AR' in Model and selectAR(llist, headerList, Ped, Model,
                                              Format_idx):
                    fout_AR.write(l)
                if 'AD' in Model and selectAD_DN(llist, headerList, Ped,
                                                 Model, Format_idx):
                    fout_AD.write(l)
                if 'X' in Model:
                    fout_XL.write(l)
                # Consecutive rows with the same gene are buffered and
                # examined together once the gene changes.
                if Gene != LastGene:
                    if LastGene is not None:
                        LookCompoundHeter(CH_buffer, fout_CH, RecordLen,
                                          Format_idx, headerList, Ped)
                    CH_buffer = [l]
                    LastGene = Gene
                else:
                    CH_buffer.append(l)
            # Flush the rows of the last gene.
            LookCompoundHeter(CH_buffer, fout_CH, RecordLen, Format_idx,
                              headerList, Ped)
    finally:
        for handle in outputs:
            handle.close()


def SplitInheritencePattern(InpDir, VCF, PED):
    """Split every matched variant table in *InpDir* by inheritance model.

    ``.tsv`` and ``.ped`` files are listed with ``utils.get_files`` and
    paired with ``MatchVcfPed``. For each pair four files
    ``<base>_AR.tsv``, ``<base>_AD.tsv``, ``<base>_XL.tsv`` and
    ``<base>_CH.tsv`` are written in the current working directory, where
    ``<base>`` is ``utils.GetBaseName(vcf)``. They are then moved, together
    with copies of the variant table and the pedigree, into
    ``<InpDir>/<base>`` (created if missing; older result files of the same
    name in that directory are replaced).

    Args:
        InpDir: directory searched for the input files and used as parent
            of the per-sample result directories.
        VCF: not used by the current implementation.
        PED: not used by the current implementation.
    """
    vcfs = utils.get_files(InpDir, '.tsv')
    peds = utils.get_files(InpDir, '.ped')
    for vcf, ped in MatchVcfPed(vcfs, peds):
        Ped = Pedigree(ped)
        print("Processing vcf: %s\tpedigree: %s" % (vcf, ped))

        base_name = utils.GetBaseName(vcf)
        out_names = [base_name + suffix
                     for suffix in ('_AR.tsv', '_AD.tsv', '_XL.tsv', '_CH.tsv')]
        _SplitVariantsByModel(vcf, Ped, out_names)

        new_dir = os.path.join(InpDir, base_name)
        print(new_dir)
        if not os.path.exists(new_dir):
            os.mkdir(new_dir)
        shutil.copy(vcf, new_dir)
        shutil.copy(ped, new_dir)
        for dest in out_names:
            # Drop a result left over from an earlier run so the move below
            # does not collide with it.
            stale = os.path.join(new_dir, dest)
            if os.path.exists(stale):
                os.remove(stale)
            shutil.move(dest, new_dir)