def munge_transcript(data, RefSeqs):
    """
    Return HGVS correct transcript annotations
    Filtered with a preferred transcript list
    NM_006772.1:c.1713G>A

    data: mapping with an optional 'Transcripts' entry holding the raw
        annotation string.
    RefSeqs: mapping from transcript id to the preferred (versioned)
        transcript name; transcripts that are missing are skipped.

    Returns a (coding, protein) tuple of space separated strings. Both
    are empty strings when there is nothing to report.
    """
    transcripts = data.get('Transcripts')
    if transcripts is None:
        return '', ''

    coding, protein = [], []
    # Split incoming trans; '(' is turned into ':' so every field of an
    # entry is colon separated (multi_split is defined elsewhere)
    for entry in multi_split(transcripts.replace('(', ':'), ',('):
        fields = entry.split(':')
        if len(fields) == 5:
            _gene, txpt, _exon, codon, prot = fields
        elif len(fields) == 4:
            _gene, txpt, _exon, codon = fields
            # No protein change on this entry. Use the blank placeholder
            # instead of inheriting the value of the previous entry.
            prot = ' '
        else:
            # Only 4 and 5 field entries are understood here
            continue
        pref_trans = RefSeqs.get(txpt)
        # Skip anything that is not a preferred transcript
        if not pref_trans:
            continue
        coding.append(pref_trans + ':' + codon)
        protein.append(prot)
    return ' '.join(coding).strip(), ' '.join(protein).strip()
def parse_sv_alt(data):
    '''
    Split the ALT field from breakend format
    ]7:55248960]A to chr7:55248960
    A[7:55249011[ to chr7:55249011

    Sets data['Event2'] and data['Seq'] depending on the suffix of
    data['ID'] and returns the record as a pandas Series. Exits the
    program when the ID suffix is not one of 'o', 'h' or 'b'.
    '''
    event_id = data['ID']
    if event_id.endswith(('o', 'h')):
        first, second = multi_split(data['ALT'], '[]')
        # The position has a ':' in it while the sequence does not
        if ':' in first:
            locus, seq = first, second
        else:
            locus, seq = second, first
        # Only chromosomes known to the `chromosomes` lookup are processed;
        # anything else leaves the record untouched.
        # NOTE(review): only the first character of the locus is used as the
        # lookup key and the remainder is appended verbatim - confirm this
        # is intended for multi-character chromosome names.
        try:
            data['Event2'] = 'chr' + str(chromosomes[locus[0]]) + str(locus[1:])
            data['Seq'] = seq
        except KeyError:
            pass
    elif event_id.endswith('b'):
        data['Event2'] = 'SingleBreakEnd'
        data['Seq'] = data['ALT']
    else:
        print("some weird data was encountered", data)
        sys.exit()
    return pd.Series(data)
def process_files(infiles, tab, filetype):
    """
    Rename the analysis files for workbook

    Walks infiles, keeps the ones whose file name matches filetype, maps
    the name to a workbook sheet name and returns (sheet_name, fname) for
    the first file whose sheet name equals tab. Returns None implicitly
    when no file matches.
    """
    # Checked first, keyed on the second-to-last token
    # e.g. 48_A03_BROv7_HA0186_NA12878_QC_Analysis
    by_kind_first = {
        'QC': '0_QC',
        'Quality': '1_QC_Metrics',
    }
    # Checked next, keyed on the third-to-last token, then second-to-last
    # e.g. 48_A03_BROv7_HA0186_NA12878_CNV_QC_[Exon/Gene]_Analysis
    by_group = {
        'QC': {'Gene': '2_QC_by_Gene', 'Exon': '3_QC_by_Exon'},
        'CNV': {'Gene': '7_CNV_Gene', 'Exon': '8_CNV_Exon'},
    }
    # Checked last, keyed on the second-to-last token
    # e.g. 48_A03_BROv7_HA0186_NA12878_SV_Analysis
    by_kind_last = {
        'SV': '4_SV_Gridss',
        'Breakdancer': '5_SV_Breakdancer',
        'Pindel': '6_SV_Pindel',
        'Genotype': '9_Clinically_Flagged',
        'MSI': '12_MSI',
        'Amplicon': '13_Amplicons',
        'PolyHunter': '14_PolyHunter',
        'SNP': '10_SNP',
        'INDEL': '11_INDEL',
    }
    pattern = str(filetype)
    for fname in infiles:
        base_name = os.path.split(fname)[1]
        if not re.search(pattern, base_name):
            continue
        tokens = multi_split(os.path.splitext(base_name)[0], '._')
        try:
            kind = tokens[-2]
            if kind in by_kind_first:
                sheet_name = by_kind_first[kind]
            else:
                group = tokens[-3]
                if group in by_group:
                    # Unknown kinds inside a group leave the token list as is
                    sheet_name = by_group[group].get(kind, tokens)
                else:
                    sheet_name = by_kind_last.get(kind, tokens)
            if sheet_name == tab:
                return sheet_name, fname
        except IndexError:
            # Name has too few tokens to classify
            continue
def testSplitString(self):
    """
    Tests splitting a string given a string of split characters
    Used to split path names in crawlers
    """
    result = multi_split('/home/genetics/data/run_info', '/_')
    # assertEqual is the supported spelling; assertEquals is a deprecated
    # alias that newer unittest releases no longer provide
    self.assertEqual(result, ['home', 'genetics', 'data', 'run', 'info'])
def testSplitString(self):
    """
    Tests splitting a string given a string of split characters
    Used to split path names in crawlers
    """
    result = ann.multi_split('/home/genetics/data/run_info', '/_')
    # assertEqual is the supported spelling; assertEquals is a deprecated
    # alias that newer unittest releases no longer provide
    self.assertEqual(result, ['home', 'genetics', 'data', 'run', 'info'])
def munge_pfx(pfx):
    """
    Change the pfx output in files to a shorter version

    Splits pfx on '/', '_' and '.', labels the leading pieces and adds two
    derived entries: 'mini-pfx' (well + control) and 'pfx'
    (well + control + '_' + library-version). Returns the resulting dict.
    """
    field_names = ('run', 'well', 'library-version', 'machine-run', 'control')
    pieces = multi_split(pfx, '/_.')
    pfx_info = dict(zip(field_names, pieces))
    # Normalise the control label through check_control (defined elsewhere)
    pfx_info['control'] = check_control(pfx_info['control'])
    short_name = '{well}{control}'.format(**pfx_info)
    pfx_info['mini-pfx'] = short_name
    long_name = '{well}{control}_{library-version}'.format(**pfx_info)
    pfx_info['pfx'] = long_name
    return pfx_info
def munge_path(pth): """ Get date, run, project, machine, assay, prep-type from path """ output = multi_split(pth, '/_') #Assuming we want YYMMDD_RUN_PROJECT if output[-1] == 'output': output = output[-4:-1] keys = ['date', 'run', 'project'] #If old version of data that isn't in a 'output' subfolder elif len(output) == 5: keys = ['date', 'machine', 'assay', 'run', 'version', 'project'] #check that the third item is the assay if not output[2].lower() in ASSAY_CODES.values(): #is the second item is the assay? if not output[1].lower() in ASSAY_CODES.values(): print "not a good path", output project = output[1] + output[3].strip('run') output = [ output[0], output[2], output[1], output[3], output[4], project ] else: project = output[2] + output[3].strip('run') output = [ output[0], output[1], output[2], output[3], output[4], project ] else: output = output[-3:] keys = ['date', 'run', 'project'] pathinfo = dict(zip(keys, output)) pathinfo['date'] = munge_date(pathinfo['date']) pathinfo['project'] = pathinfo['project'].lower() #Set Machine if re.search('[%s][A-Z]\d+' % ''.join(MACHINE_CODES.keys()), pathinfo['run']): pathinfo['machine'] = MACHINE_CODES[pathinfo['run'][0]] #Set assay for a in ASSAY_CODES.keys(): if re.search(a, pathinfo['project'], re.IGNORECASE): pathinfo['assay'] = ASSAY_CODES[a] elif re.search('MSI', pathinfo['project'], re.IGNORECASE): pathinfo['assay'] = 'msi-plus' #Set prep type if re.search('kapa', pathinfo['project']): pathinfo['prep_type'] = 'kapa' elif re.search('hotspot', pathinfo['assay']): pathinfo['prep_type'] = 'truseq' else: pathinfo['prep_type'] = 'sure_select' return pathinfo
def munge_path(pth): """ Get date, run, project, machine, assay, prep-type from path """ output=multi_split(pth, '/_') #Assuming we want YYMMDD_RUN_PROJECT if output[-1]=='output': output=output[-4:-1] keys=['date','run', 'project'] #If old version of data that isn't in a 'output' subfolder elif len(output)==5: keys=['date','machine','assay','run','version','project'] #check that the third item is the assay if not output[2].lower() in ASSAY_CODES.values(): #is the second item is the assay? if not output[1].lower() in ASSAY_CODES.values(): print "not a good path", output project=output[1]+output[3].strip('run') output=[output[0],output[2],output[1],output[3], output[4],project] else: project=output[2]+output[3].strip('run') output=[output[0],output[1],output[2],output[3], output[4],project] else: output=output[-3:] keys=['date','run', 'project'] pathinfo = dict(zip(keys,output)) pathinfo['date']=munge_date(pathinfo['date']) pathinfo['project']=pathinfo['project'].lower() #Set Machine if re.search('[%s][A-Z]\d+' % ''.join(MACHINE_CODES.keys()), pathinfo['run']): pathinfo['machine']= MACHINE_CODES[pathinfo['run'][0]] #Set assay for a in ASSAY_CODES.keys(): if re.search(a, pathinfo['project'], re.IGNORECASE): pathinfo['assay'] = ASSAY_CODES[a] elif re.search('MSI', pathinfo['project'], re.IGNORECASE): pathinfo['assay'] = 'msi-plus' #Set prep type if re.search('kapa', pathinfo['project']): pathinfo['prep_type']='kapa' elif re.search('hotspot', pathinfo['assay']): pathinfo['prep_type']='truseq' else: pathinfo['prep_type']='sure_select' return pathinfo
def munge_transcript(data, RefSeqs):
    """
    Return HGVS correct transcript annotations
    Filtered with a preferred transcript list
    NM_006772.1:c.1713G>A

    Returns a (coding, protein) tuple of space joined strings built from
    the entries whose transcript is present in RefSeqs.
    """
    raw = data.get('Transcripts')
    if raw is None:
        return '', ''

    coding_hits, protein_hits = [], []
    # Split incoming trans; the set removes duplicate transcript entries
    for entry in set(multi_split(raw.replace('(', ':'), ',(')):
        # Defaults for the pieces an entry may not carry
        txpt, codon, prot = None, ' ', ' '
        # The actual transcript info is colon separated
        fields = entry.split(':')
        n_fields = len(fields)
        if n_fields == 5:
            # ['PRSS1', 'NM_002769', 'exon4', 'c.567T>C', 'p.L189L']
            txpt, codon, prot = fields[1], fields[3], fields[4]
        elif n_fields == 4:
            # ['POLE', 'NM_006231', 'exon25', 'c.2865-4T>-']
            txpt, codon = fields[1], fields[3]
        elif n_fields == 3:
            if 'NM' in fields[1]:
                # ['RAD50', 'NM_005732', 'c.-38G>A']
                txpt, codon = fields[1], fields[2]
            elif 'NM' in fields[0]:
                # ['NM_005590', 'exon5', 'c.315-4T>-']
                txpt, codon = fields[0], fields[2]
        elif n_fields == 2:
            # ['NM_001290310', 'c.*513_*514insATC']
            txpt, codon = fields
        elif n_fields == 1:
            continue
        else:
            sys.exit("don't know how to parse %s" % entry)

        preferred = RefSeqs.get(txpt)
        # Entries without a preferred transcript contribute nothing
        if not preferred:
            continue
        coding_hits.append(preferred + ':' + codon)
        protein_hits.append(prot)
    return ' '.join(coding_hits), ' '.join(protein_hits)
def munge_transcript(data, RefSeqs):
    """
    Return HGVS correct transcript annotations
    Filtered with a preferred transcript list
    NM_006772.1:c.1713G>A

    Returns a (coding, protein) tuple of space joined strings built from
    the entries whose transcript is present in RefSeqs.
    """
    annotation = data.get('Transcripts')
    if annotation is None:
        return '', ''

    codes, proteins = [], []
    # Split incoming trans, then drop duplicate transcript entries
    unique_entries = set(multi_split(annotation.replace('(', ':'), ',('))
    for item in unique_entries:
        # Defaults for the pieces an entry may not carry
        txpt, codon, prot = None, ' ', ' '
        # The actual transcript info is colon separated
        parts = item.split(':')
        count = len(parts)
        if count == 1:
            continue
        if count == 2:
            # ['NM_001290310', 'c.*513_*514insATC']
            txpt, codon = parts
        elif count == 3:
            if 'NM' in parts[1]:
                # ['RAD50', 'NM_005732', 'c.-38G>A']
                txpt, codon = parts[1], parts[2]
            elif 'NM' in parts[0]:
                # ['NM_005590', 'exon5', 'c.315-4T>-']
                txpt, codon = parts[0], parts[2]
        elif count == 4:
            # ['POLE', 'NM_006231', 'exon25', 'c.2865-4T>-']
            txpt, codon = parts[1], parts[3]
        elif count == 5:
            # ['PRSS1', 'NM_002769', 'exon4', 'c.567T>C', 'p.L189L']
            txpt, codon, prot = parts[1], parts[3], parts[4]
        else:
            sys.exit("don't know how to parse %s" % item)

        pref_trans = RefSeqs.get(txpt)
        # Entries without a preferred transcript contribute nothing
        if pref_trans:
            codes.append(pref_trans + ':' + codon)
            proteins.append(prot)
    return ' '.join(codes), ' '.join(proteins)
def match(control, run_info):
    """
    Make a list and keep count of variants found in both the qc file and
    the run output for LMG/OPX-240 sample
    Matches if chr, start are the same.

    control: iterable of control rows; row[2] is compared with the
        chromosome and row[3] with the start of each run line.
    run_info: iterable of run rows; row[0] holds the position text. Rows
        whose position text starts with 'Position' (header) are skipped.

    Returns (matchedlist, concount): the control rows that matched at
    least one run line (each listed once, in control order) and the
    number of control rows seen.
    """
    # Parse the run positions once up front. This avoids re-splitting
    # every run line for each control line, and keeps the function
    # correct when run_info is a one-shot iterator (which would otherwise
    # be exhausted after the first control line).
    run_positions = [
        multi_split(runline[0], 'chr:-,')
        for runline in run_info
        if not runline[0].startswith('Position')
    ]

    matchedlist = []
    concount = 0
    for conline in control:
        concount += 1
        for position in run_positions:
            same_chrom = conline[2] == position[0]
            if same_chrom and int(conline[3]) == int(position[1]):
                if conline not in matchedlist:
                    matchedlist.append(conline)
    return matchedlist, concount
def munge_path(pth):
    """
    Get run, machine, assay and capture from path

    The path is split on '/' and '_' and each lower-cased piece is
    classified by its prefix. Raises ValueError when the path yields
    fewer than four pieces. Returns a dict with whichever of 'run',
    'machine', 'assay' and 'capture' were found.
    """
    tokens = multi_split(pth, '/_')
    if len(tokens) < 4:
        raise ValueError('Incorrect path given. Must be in the format of YYYY-MM-DD_Machine_Assay_Run#_version')

    pathinfo = {}
    # Most recent piece starting with '20'; prefixed onto the run label
    date_part = ""
    for token in (piece.lower() for piece in tokens):
        if token.startswith('20'):
            date_part = token
        elif token.startswith('run'):
            pathinfo['run'] = date_part + '_' + token
        elif token.startswith(('hi', 'mi')):
            pathinfo['machine'] = token
        elif token.startswith(('onco', 'colo')):
            pathinfo['assay'] = token
        elif token.startswith('v'):
            pathinfo['capture'] = token
    return pathinfo
def process_files(infiles, tab, filetype):
    """
    Rename the analysis files for workbook

    Walks infiles, keeps the ones whose file name matches filetype, maps
    the name to a workbook sheet name and returns (sheet_name, fname) for
    the first file whose sheet name equals tab. Returns None implicitly
    when no file matches.
    """
    # Checked first, keyed on the second-to-last token
    # e.g. 48_A03_BROv7_HA0186_NA12878_QC_Analysis
    by_kind_first = {
        'QC': '0_QC',
        'Quality': '1_QC_Metrics',
    }
    # Checked next, keyed on the third-to-last token, then second-to-last
    # e.g. NA12878-HP998-HHv2.Coverage_Gene
    #      48_A03_BROv7_HA0186_NA12878_CNV_QC_[Exon/Gene]_Analysis
    by_group = {
        'Coverage': {'Gene': '15_Gene_Coverage', 'Exon': '16_Exon_Coverage'},
        'QC': {'Gene': '2_QC_by_Gene', 'Exon': '3_QC_by_Exon'},
        'CNV': {'Gene': '7_CNV_Gene', 'Exon': '8_CNV_Exon'},
    }
    # Checked last, keyed on the second-to-last token
    # e.g. 48_A03_BROv7_HA0186_NA12878_SV_Analysis
    by_kind_last = {
        'SV': '4_SV_Gridss',
        'Breakdancer': '5_SV_Breakdancer',
        'Pindel': '6_SV_Pindel',
        'Genotype': '9_Clinically_Flagged',
        'MSI': '12_MSI',
        'Amplicon': '13_Amplicons',
        'PolyHunter': '14_PolyHunter',
        'SNP': '10_SNP',
        'INDEL': '11_INDEL',
    }
    pattern = str(filetype)
    for fname in infiles:
        base_name = os.path.split(fname)[1]
        if not re.search(pattern, base_name):
            continue
        tokens = multi_split(os.path.splitext(base_name)[0], '._')
        try:
            kind = tokens[-2]
            if kind in by_kind_first:
                sheet_name = by_kind_first[kind]
            else:
                group = tokens[-3]
                if group in by_group:
                    # Unknown kinds inside a group leave the token list as is
                    sheet_name = by_group[group].get(kind, tokens)
                else:
                    sheet_name = by_kind_last.get(kind, tokens)
            if sheet_name == tab:
                return sheet_name, fname
        except IndexError:
            # Name has too few tokens to classify
            continue