Beispiel #1
0
def munge_transcript(data, RefSeqs):
    """
    Return HGVS correct transcript annotations
    Filtered with a preferred transcript list
    NM_006772.1:c.1713G>A

    data is a mapping; its optional 'Transcripts' entry holds the raw
    annotation string.  RefSeqs maps a transcript name to the preferred
    name used in the output.

    Returns a (coding, protein) tuple of space separated strings.  Both
    are empty when 'Transcripts' is missing/None or nothing is preferred.
    """
    transcripts = data.get('Transcripts')
    if transcripts is None:
        return '', ''
    coding, protein = [], []
    # Turn "(" into ":" so every field is colon separated, then split the
    # string into one entry per transcript
    for entry in multi_split(transcripts.replace('(', ':'), ',('):
        fields = entry.split(':')
        if len(fields) == 5:
            _gene, txpt, _exon, codon, prot = fields
        elif len(fields) == 4:
            _gene, txpt, _exon, codon = fields
            # No protein change on this entry.  Reset explicitly so the
            # annotation of a previous entry is not reported again.
            prot = ' '
        else:
            # Anything else is not a transcript annotation we understand
            continue
        pref_trans = RefSeqs.get(txpt)
        # Only transcripts on the preferred list are reported
        if not pref_trans:
            continue
        coding.append(pref_trans + ':' + codon)
        protein.append(prot)
    return ' '.join(coding).strip(), ' '.join(protein).strip()
Beispiel #2
0
def parse_sv_alt(data):
    ''' Split the ALT field from breakend format
    ]7:55248960]A  to chr7:55248960
    A[7:55249011[  to chr7:55249011

    data is a mapping with at least 'ID' and 'ALT'.  It is updated in
    place with 'Event2' and 'Seq' and returned wrapped in a pandas Series.

    - ID ending in 'o' or 'h': ALT is split on the brackets; the part
      holding the position becomes 'Event2', the other part 'Seq'.
      If the contig is not a key of `chromosomes`, neither is set.
    - ID ending in 'b': 'Event2' is 'SingleBreakEnd' and 'Seq' is ALT.
    - any other ID: a message is printed and the program exits.
    '''
    record_id = data['ID']
    if record_id.endswith(('o', 'h')):
        a, b = multi_split(data['ALT'], '[]')
        #the position has a : in it while the sequence does not
        if ':' in a:
            position, sequence = a, b
        else:
            position, sequence = b, a
        # Look up the whole contig name rather than only the first
        # character of the position string, so that e.g. "1_random:5" is
        # not mistaken for chromosome 1.
        contig, colon, coordinate = position.partition(':')
        #Only process chr1-23, X, Y
        try:
            mapped = chromosomes[contig]
        except KeyError:
            # Contig is not in the lookup table: leave Event2/Seq unset
            pass
        else:
            data['Event2'] = 'chr' + str(mapped) + colon + coordinate
            data['Seq'] = sequence
    elif record_id.endswith('b'):
        data['Event2'] = 'SingleBreakEnd'
        data['Seq'] = data['ALT']
    else:
        print("some weird data was encountered", data)
        sys.exit()
    return pd.Series(data)
Beispiel #3
0
def process_files(infiles, tab, filetype):
    """
    Rename the analysis files for workbook

    Walks infiles, keeps the ones whose file name matches the filetype
    pattern, derives a workbook sheet name from the name tokens and
    returns (sheet_name, fname) for the first file whose sheet name
    equals tab.  Returns None when no file qualifies.
    """
    # Sheet names decided by the third-from-last and second-from-last tokens
    #48_A03_BROv7_HA0186_NA12878_CNV_QC_[Exon/Gene]_Analysis
    #48_A03_BROv7_HA0186_NA12878_CNV_[Exon/Gene]_Analysis
    paired = {
        'QC': {'Gene': '2_QC_by_Gene', 'Exon': '3_QC_by_Exon'},
        'CNV': {'Gene': '7_CNV_Gene', 'Exon': '8_CNV_Exon'},
    }
    # Sheet names decided by the second-from-last token alone,
    # e.g. 48_A03_BROv7_HA0186_NA12878_SV_Analysis
    single = {
        'SV': '4_SV_Gridss',
        'Breakdancer': '5_SV_Breakdancer',
        'Pindel': '6_SV_Pindel',
        'Genotype': '9_Clinically_Flagged',
        'MSI': '12_MSI',
        'Amplicon': '13_Amplicons',
        'PolyHunter': '14_PolyHunter',
        'SNP': '10_SNP',
        'INDEL': '11_INDEL',
    }
    for fname in infiles:
        base = os.path.split(fname)[1]
        if not re.search(str(filetype), base):
            continue
        stem = os.path.splitext(base)[0]
        tokens = multi_split(stem, '._')
        try:
            kind = tokens[-2]
            #48_A03_BROv7_HA0186_NA12878_QC_Analysis
            if kind == 'QC':
                sheet = '0_QC'
            #48_A03_BROv7_HA0186_NA12878_Quality_Analysis
            elif kind == 'Quality':
                sheet = '1_QC_Metrics'
            else:
                group = tokens[-3]
                if group in paired:
                    # Unknown second token: the token list itself is kept
                    sheet = paired[group].get(kind, tokens)
                else:
                    sheet = single.get(kind, tokens)
        except IndexError:
            # Too few tokens in the name to classify this file
            continue
        if sheet == tab:
            return sheet, fname
Beispiel #4
0
 def testSplitString(self):
     """
     Tests splitting a string given a string of split characters
     Used to split path names in crawlers
     """
     result = multi_split('/home/genetics/data/run_info', '/_')
     # assertEqual is the supported name; the assertEquals alias is
     # deprecated and no longer exists in Python 3.12+
     self.assertEqual(result, ['home', 'genetics', 'data', 'run', 'info'])
Beispiel #5
0
 def testSplitString(self):
     """
     Tests splitting a string given a string of split characters
     Used to split path names in crawlers
     """
     result = ann.multi_split('/home/genetics/data/run_info', '/_')
     # assertEqual is the supported name; the assertEquals alias is
     # deprecated and no longer exists in Python 3.12+
     self.assertEqual(result, ['home', 'genetics', 'data', 'run', 'info'])
Beispiel #6
0
def munge_pfx(pfx):
    """
    Change the pfx output in files to a shorter version

    The pfx is split on '/', '_' and '.' and the pieces are labelled, in
    order, run, well, library-version, machine-run and control.  The
    control piece is passed through check_control, then the short labels
    'mini-pfx' and 'pfx' are added.  Returns the resulting dict.
    """
    field_names = ['run', 'well', 'library-version', 'machine-run', 'control']
    pieces = multi_split(pfx, '/_.')
    pfx_info = {}
    for name, piece in zip(field_names, pieces):
        pfx_info[name] = piece
    raw_control = pfx_info['control']
    pfx_info['control'] = check_control(raw_control)
    mini = '{well}{control}'.format(**pfx_info)
    pfx_info['mini-pfx'] = mini
    full = '{well}{control}_{library-version}'.format(**pfx_info)
    pfx_info['pfx'] = full
    return pfx_info
Beispiel #7
0
def munge_path(pth):
    """
    Get date, run, project, machine, assay, prep-type from path

    The path is split on '/' and '_'.  Three layouts are handled:
    a path ending in an 'output' folder (the three tokens before it are
    date, run, project), an old-style path of exactly five tokens
    (date, machine, assay, run, version in either machine/assay order),
    and anything else (the last three tokens are date, run, project).

    Returns a dict with 'date', 'run', 'project' and 'prep_type', plus
    'machine', 'assay' and 'version' where they could be determined.
    """
    output = multi_split(pth, '/_')
    #Assuming we want YYMMDD_RUN_PROJECT
    if output[-1] == 'output':
        output = output[-4:-1]
        keys = ['date', 'run', 'project']
    #If old version of data that isn't in a 'output' subfolder
    elif len(output) == 5:
        keys = ['date', 'machine', 'assay', 'run', 'version', 'project']
        #check that the third item is the assay
        if output[2].lower() not in ASSAY_CODES.values():
            #is the second item is the assay?
            if output[1].lower() not in ASSAY_CODES.values():
                # A single pre-formatted argument prints the same text with
                # the Python 2 print statement and the Python 3 function
                print("not a good path %s" % (output,))
            # NOTE(review): strip('run') removes the characters r/u/n from
            # both ends, not the literal prefix 'run' - confirm intended
            project = output[1] + output[3].strip('run')
            output = [
                output[0], output[2], output[1], output[3], output[4], project
            ]
        else:
            project = output[2] + output[3].strip('run')
            output = [
                output[0], output[1], output[2], output[3], output[4], project
            ]
    else:
        output = output[-3:]
        keys = ['date', 'run', 'project']
    pathinfo = dict(zip(keys, output))
    pathinfo['date'] = munge_date(pathinfo['date'])
    pathinfo['project'] = pathinfo['project'].lower()
    #Set Machine
    # Raw string so that \d is a regex digit class, not a string escape
    machine_pattern = r'[%s][A-Z]\d+' % ''.join(MACHINE_CODES.keys())
    if re.search(machine_pattern, pathinfo['run']):
        pathinfo['machine'] = MACHINE_CODES[pathinfo['run'][0]]
    #Set assay
    for a in ASSAY_CODES.keys():
        if re.search(a, pathinfo['project'], re.IGNORECASE):
            pathinfo['assay'] = ASSAY_CODES[a]
        elif re.search('MSI', pathinfo['project'], re.IGNORECASE):
            pathinfo['assay'] = 'msi-plus'
    #Set prep type
    if re.search('kapa', pathinfo['project']):
        pathinfo['prep_type'] = 'kapa'
    # NOTE(review): raises KeyError when no assay was set above and the
    # project does not contain 'kapa' - confirm this is acceptable
    elif re.search('hotspot', pathinfo['assay']):
        pathinfo['prep_type'] = 'truseq'
    else:
        pathinfo['prep_type'] = 'sure_select'

    return pathinfo
Beispiel #8
0
def munge_path(pth):
    """
    Get date, run, project, machine, assay, prep-type from path

    pth is split on '/' and '_' and interpreted as one of:
      * .../<date>_<run>_<project>/output
      * exactly five tokens: date, machine, assay, run, version
        (machine and assay may be swapped)
      * anything else: the last three tokens are date, run, project

    Returns a dict holding 'date', 'run', 'project', 'prep_type' and,
    where they could be determined, 'machine', 'assay' and 'version'.
    """
    output = multi_split(pth, '/_')
    #Assuming we want YYMMDD_RUN_PROJECT
    if output[-1] == 'output':
        output = output[-4:-1]
        keys = ['date', 'run', 'project']
    #If old version of data that isn't in a 'output' subfolder
    elif len(output) == 5:
        keys = ['date', 'machine', 'assay', 'run', 'version', 'project']
        #check that the third item is the assay
        if output[2].lower() not in ASSAY_CODES.values():
            #is the second item is the assay?
            if output[1].lower() not in ASSAY_CODES.values():
                # Formatted into one string so the text is identical under
                # the Python 2 print statement and the Python 3 function
                print("not a good path %s" % (output,))
            # NOTE(review): strip('run') strips the characters r, u, n from
            # both ends rather than the prefix 'run' - confirm intended
            project = output[1] + output[3].strip('run')
            output = [output[0], output[2], output[1], output[3], output[4], project]
        else:
            project = output[2] + output[3].strip('run')
            output = [output[0], output[1], output[2], output[3], output[4], project]
    else:
        output = output[-3:]
        keys = ['date', 'run', 'project']
    pathinfo = dict(zip(keys, output))
    pathinfo['date'] = munge_date(pathinfo['date'])
    pathinfo['project'] = pathinfo['project'].lower()
    #Set Machine
    # Raw string keeps \d as a regex digit class instead of a string escape
    if re.search(r'[%s][A-Z]\d+' % ''.join(MACHINE_CODES.keys()), pathinfo['run']):
        pathinfo['machine'] = MACHINE_CODES[pathinfo['run'][0]]
    #Set assay
    for a in ASSAY_CODES.keys():
        if re.search(a, pathinfo['project'], re.IGNORECASE):
            pathinfo['assay'] = ASSAY_CODES[a]
        elif re.search('MSI', pathinfo['project'], re.IGNORECASE):
            pathinfo['assay'] = 'msi-plus'
    #Set prep type
    if re.search('kapa', pathinfo['project']):
        pathinfo['prep_type'] = 'kapa'
    # NOTE(review): KeyError here when no assay was recognised and the
    # project does not contain 'kapa' - confirm callers expect that
    elif re.search('hotspot', pathinfo['assay']):
        pathinfo['prep_type'] = 'truseq'
    else:
        pathinfo['prep_type'] = 'sure_select'

    return pathinfo
Beispiel #9
0
def munge_transcript(data, RefSeqs):
    """
    Return HGVS correct transcript annotations
    Filtered with a preferred transcript list
    NM_006772.1:c.1713G>A

    data is a mapping; its optional 'Transcripts' entry holds the raw
    annotation string.  RefSeqs maps a transcript name to the preferred
    name used in the output.

    Returns a (coding, protein) tuple of space separated strings, in the
    order the transcripts first appear in the input.  Exits the program
    when an entry has more than five colon separated fields.
    """
    CODING, PROTEIN = [], []
    transcripts = data.get('Transcripts')
    if transcripts is not None:
        # Turn "(" into ":" so every field is colon separated, then split
        # the string into one entry per transcript
        entries = multi_split(transcripts.replace('(', ':'), ',(')
        # Remove duplicate transcription entries.  A seen-set keeps the
        # first occurrence in input order; list(set(...)) made the output
        # order depend on hash ordering.
        seen = set()
        for d in entries:
            if d in seen:
                continue
            seen.add(d)
            codon, prot, txpt = ' ', ' ', None
            # Split the actual transcript info which is colon separated
            x = d.split(':')
            #5: ['PRSS1', 'NM_002769', 'exon4', 'c.567T>C', 'p.L189L']
            if len(x) == 5:
                _gene, txpt, _exon, codon, prot = x
            #4: ['POLE', 'NM_006231', 'exon25', 'c.2865-4T>-']
            elif len(x) == 4:
                _gene, txpt, _exon, codon = x
            elif len(x) == 3:
                #3: ['RAD50', 'NM_005732', 'c.-38G>A']
                if 'NM' in x[1]:
                    _gene, txpt, codon = x
                #3: ['NM_005590','exon5','c.315-4T>-']
                elif 'NM' in x[0]:
                    txpt, _exon, codon = x
            #2: ['NM_001290310', 'c.*513_*514insATC']
            elif len(x) == 2:
                txpt, codon = x
            elif len(x) == 1:
                continue
            else:
                sys.exit("don't know how to parse %s" % d)
            pref_trans = RefSeqs.get(txpt)
            # Only transcripts on the preferred list are reported
            if not pref_trans:
                continue
            CODING.append(pref_trans + ':' + codon)
            PROTEIN.append(prot)
    return ' '.join(CODING), ' '.join(PROTEIN)
Beispiel #10
0
def munge_transcript(data, RefSeqs):
    """
    Return HGVS correct transcript annotations
    Filtered with a preferred transcript list
    NM_006772.1:c.1713G>A

    data is a mapping whose optional 'Transcripts' value is the raw
    annotation string; RefSeqs maps a transcript name to the preferred
    name to report.

    Returns (coding, protein): two space separated strings listing the
    preferred transcripts in first-seen input order.  The program exits
    if an entry has more than five colon separated fields.
    """
    CODING, PROTEIN = [], []
    transcripts = data.get('Transcripts')
    if transcripts is None:
        return ' '.join(CODING), ' '.join(PROTEIN)
    # Turn "(" into ":" so every field is colon separated, then split the
    # string into one entry per transcript
    raw_entries = multi_split(transcripts.replace('(', ':'), ',(')
    # Remove duplicate transcription entries while keeping input order;
    # going through set() alone made the output order unpredictable
    unique_entries = []
    already = set()
    for entry in raw_entries:
        if entry not in already:
            already.add(entry)
            unique_entries.append(entry)
    for d in unique_entries:
        codon, prot, txpt = ' ', ' ', None
        # Split the actual transcript info which is colon separated
        x = d.split(':')
        count = len(x)
        if count == 5:
            #5: ['PRSS1', 'NM_002769', 'exon4', 'c.567T>C', 'p.L189L']
            _gene, txpt, _exon, codon, prot = x
        elif count == 4:
            #4: ['POLE', 'NM_006231', 'exon25', 'c.2865-4T>-']
            _gene, txpt, _exon, codon = x
        elif count == 3:
            if 'NM' in x[1]:
                #3: ['RAD50', 'NM_005732', 'c.-38G>A']
                _gene, txpt, codon = x
            elif 'NM' in x[0]:
                #3: ['NM_005590','exon5','c.315-4T>-']
                txpt, _exon, codon = x
        elif count == 2:
            #2: ['NM_001290310', 'c.*513_*514insATC']
            txpt, codon = x
        elif count == 1:
            continue
        else:
            sys.exit("don't know how to parse %s" % d)
        pref_trans = RefSeqs.get(txpt)
        # Only transcripts on the preferred list are reported
        if not pref_trans:
            continue
        CODING.append(pref_trans + ':' + codon)
        PROTEIN.append(prot)
    return ' '.join(CODING), ' '.join(PROTEIN)
Beispiel #11
0
def match(control, run_info):
    """
    Collect the control variants that also appear in the run output.

    A control line matches a run line when the control's third field
    equals the first token of the run line's position column and the
    control's fourth field equals the second token, compared as integers.
    Run lines whose first column starts with 'Position' are skipped.

    Returns (matchedlist, concount): the matched control lines, each
    listed once, and the number of control lines examined.
    """
    matchedlist = []
    concount = 0
    for conline in control:
        concount += 1
        for row in run_info:
            position = row[0]
            if position.startswith('Position'):
                # header row
                continue
            tokens = multi_split(position, 'chr:-,')
            if conline[2] == tokens[0]:
                if int(conline[3]) == int(tokens[1]):
                    if conline not in matchedlist:
                        matchedlist.append(conline)
    return matchedlist, concount
Beispiel #12
0
def munge_path(pth):
    """
    Get run, machine, assay and capture from path

    The path is split on '/' and '_' and every token is lower-cased and
    classified by its prefix.  Raises ValueError when the path yields
    fewer than four tokens.  Returns a dict holding whichever of 'run',
    'machine', 'assay' and 'capture' were recognised.
    """
    output = multi_split(pth, '/_')
    if len(output) < 4:
        raise ValueError('Incorrect path given. Must be in the format of YYYY-MM-DD_Machine_Assay_Run#_version')
    # (prefixes, key) pairs checked in order after the date/run tokens
    prefix_rules = (
        (('hi', 'mi'), 'machine'),
        (('onco', 'colo'), 'assay'),
        (('v',), 'capture'),
    )
    pathinfo = {}
    run = ""
    for part in output:
        token = part.lower()
        if token.startswith('20'):
            # date token; remembered and prepended to the run token
            run = token
            continue
        if token.startswith('run'):
            pathinfo['run'] = run + '_' + token
            continue
        for prefixes, key in prefix_rules:
            if token.startswith(prefixes):
                pathinfo[key] = token
                break

    return pathinfo
Beispiel #13
0
def process_files(infiles, tab, filetype):
    """
    Rename the analysis files for workbook

    For each file in infiles whose name matches the filetype pattern, a
    workbook sheet name is derived from the tokens of the file name.
    The first file whose sheet name equals tab is returned as
    (sheet_name, fname); None is returned when there is no such file.
    """
    # Sheet names that depend on two tokens: third-from-last selects the
    # family, second-from-last selects Gene or Exon
    #NA12878-HP998-HHv2.Coverage_Gene
    #48_A03_BROv7_HA0186_NA12878_CNV_QC_[Exon/Gene]_Analysis
    #48_A03_BROv7_HA0186_NA12878_CNV_[Exon/Gene]_Analysis
    by_family = {
        'Coverage': {'Gene': '15_Gene_Coverage', 'Exon': '16_Exon_Coverage'},
        'QC': {'Gene': '2_QC_by_Gene', 'Exon': '3_QC_by_Exon'},
        'CNV': {'Gene': '7_CNV_Gene', 'Exon': '8_CNV_Exon'},
    }
    # Sheet names that depend on the second-from-last token only,
    # e.g. 48_A03_BROv7_HA0186_NA12878_Pindel_Analysis
    by_marker = {
        'SV': '4_SV_Gridss',
        'Breakdancer': '5_SV_Breakdancer',
        'Pindel': '6_SV_Pindel',
        'Genotype': '9_Clinically_Flagged',
        'MSI': '12_MSI',
        'Amplicon': '13_Amplicons',
        'PolyHunter': '14_PolyHunter',
        'SNP': '10_SNP',
        'INDEL': '11_INDEL',
    }
    for fname in infiles:
        f_name = os.path.split(fname)[1]
        if re.search(str(filetype), f_name) is None:
            continue
        f_short_name = os.path.splitext(f_name)[0]
        parts = multi_split(f_short_name, '._')
        try:
            marker = parts[-2]
            if marker == 'QC':
                #48_A03_BROv7_HA0186_NA12878_QC_Analysis
                sheet_name = '0_QC'
            elif marker == 'Quality':
                #48_A03_BROv7_HA0186_NA12878_Quality_Analysis
                sheet_name = '1_QC_Metrics'
            else:
                family = parts[-3]
                if family in by_family:
                    # Neither Gene nor Exon: the token list is kept as is
                    sheet_name = by_family[family].get(marker, parts)
                else:
                    sheet_name = by_marker.get(marker, parts)
        except IndexError:
            # Not enough tokens to classify this file name
            continue
        if sheet_name == tab:
            return sheet_name, fname