Esempio n. 1
0
import sys
import os.path
import os
import numpy as np
import csv
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from ngs_scripts.Aggregate import returnSampleDict

# Count, for every FLUB sample, the HA variants that pass the quality filter
# and write one CSV row per sample to 'flub_varcount.txt'.
alldict = returnSampleDict()

# Maps smartid -> (sample key, variant count rendered as a string).
varnumberntdict = {}
for key in alldict:
    sample = alldict[key]
    if sample.strain.upper() != 'FLUB':
        continue
    # A variant is counted when it lies on the HA segment, has a total count
    # of at least 200 and its binocheck flag reads TRUE (case-insensitive).
    varcounter = sum(
        1
        for var in sample.varlist
        if var.segment == 'HA'
        and var.totalcount >= 200
        and var.binocheck.upper() == 'TRUE'
    )
    varnumberntdict[sample.smartid] = (key, str(varcounter))

# The context manager guarantees the file is flushed and closed even if a
# row fails to format; the original left the handle open. write() is used
# instead of the Python-2-only ``print >>`` form and emits the same bytes.
with open('flub_varcount.txt', 'w') as thefile:
    thefile.write('smartid,sampleid,varcount\n')
    for smartid in varnumberntdict:
        sampleid, varcount = varnumberntdict[smartid]
        thefile.write(smartid + ',' + sampleid + ',' + varcount + '\n')
Esempio n. 2
0
        else:
            seq.append(line)
    if name:
        yield (name, "".join(seq))


# Index the consensus FASTA records by identifier: the record name with its
# first character dropped (presumably the '>' marker -- confirm against
# read_fasta), cut at the first space.
condict = {}
with open(consensuspath) as fp:
    for name, seq in read_fasta(fp):
        record_id = name[1:].split(" ")[0]
        condict[record_id] = seq

somedict = returnSampleDict()

# Gather the nucleotide positions of HA variants from H3N2 samples whose
# binocheck flag is exactly "TRUE" and whose total count is at least 100.
ultvarlist = []
for key in somedict:
    sample = somedict[key]
    if sample.strain != "H3N2":
        continue
    for eachvar in sample.varlist:
        keep = (
            eachvar.segment == "HA"
            and eachvar.binocheck == "TRUE"
            and eachvar.totalcount >= 100
        )
        if keep:
            ultvarlist.append(eachvar.ntpos)

# Collapse repeated positions; the resulting order is whatever the set yields.
ultvarlist = list(set(ultvarlist))
Esempio n. 3
0
# Nucleotide order used for every frequency tuple in the helpers below.
_NUCLEOTIDES = ('A', 'C', 'G', 'T')

# Reference FASTA file to load for each supported (upper-cased) strain.
_REFERENCE_FILES = {
    'FLUB': 'flub_reference.fa',
    'H3N2': 'flua_reference.fa',
}


def _variant_freqs(var):
    """Return the (A, C, G, T) frequencies stored on a variant record."""
    return (var.afreq, var.cfreq, var.gfreq, var.tfreq)


def _reference_freqs(ref_nt):
    """Return (A, C, G, T) frequencies for a sample assumed to match the reference.

    The reference base gets 1.0 and the other three 0.0. A base outside
    A/C/G/T prints 'ERROR_ERROR' and yields all zeros.
    """
    ref_nt = ref_nt.upper()
    if ref_nt not in _NUCLEOTIDES:
        print('ERROR_ERROR')
    return tuple(1.0 if nt == ref_nt else 0.0 for nt in _NUCLEOTIDES)


def _quality_variants(varlist, segment, cutoff, covercutoff):
    """Map str(ntpos) -> variant for the variants of `segment` that pass the filter.

    A variant passes when totalcount > covercutoff, binocheck is TRUE
    (case-insensitive), minorfreq > cutoff and it lies on `segment`.
    """
    selected = {}
    for var in varlist:
        if (var.totalcount > covercutoff
                and var.binocheck.upper() == 'TRUE'
                and var.minorfreq > cutoff
                and var.segment == segment):
            # Every kept variant is on the same segment, so the position alone
            # is a unique key; no 'SEGMENT_pos' string has to be split later.
            selected[str(var.ntpos)] = var
    return selected


def _pair_distance(avars, bvars, refseq):
    """Sum the per-position L1 distances between two samples' variant sets.

    For a position present in only one sample, the other sample is given
    the reference base at that position (1-based) with frequency 1.0.
    """
    per_position = []
    # Sorting the string keys keeps the summation order of the original code.
    for pos in sorted(set(avars).union(bvars)):
        if pos in avars:
            afreqs = _variant_freqs(avars[pos])
        else:
            afreqs = _reference_freqs(refseq[int(pos) - 1])
        if pos in bvars:
            bfreqs = _variant_freqs(bvars[pos])
        else:
            bfreqs = _reference_freqs(refseq[int(pos) - 1])
        per_position.append(sum(abs(x - y) for x, y in zip(afreqs, bfreqs)))
    return sum(per_position)


def l1_norm(somestrain, CUTOFF=0.03, COVERCUTOFF=200):
    """Write one pairwise L1-distance matrix per reference segment as CSV.

    Samples returned by returnSampleDict() whose strain matches `somestrain`
    (case-insensitive) are compared pairwise. For each segment of the
    reference, the distance between two samples is the sum, over every
    position where either sample has a quality-filtered variant, of the
    absolute differences of the A/C/G/T frequencies. The matrix is written to
    '<cwd>/../FILES/output/<somestrain>.<SEGMENT>.dissim_all.csv' with the
    samples' smartid values as row and column labels.

    NOTE(review): relies on `np`, `pd`, `snplists` and `returnSampleDict`
    being available at module level; they are not defined in this block.

    Args:
        somestrain: Strain name; 'FLUB' or 'H3N2' in any letter case.
        CUTOFF: A variant is kept only if its minorfreq exceeds this value.
        COVERCUTOFF: A variant is kept only if its totalcount exceeds this value.

    Raises:
        ValueError: If `somestrain` is not a supported strain. Previously this
            surfaced later as an unbound `refdict` error.
    """
    strain = somestrain.upper()
    if strain not in _REFERENCE_FILES:
        raise ValueError(
            'unsupported strain %r; expected one of %s'
            % (somestrain, ', '.join(sorted(_REFERENCE_FILES)))
        )

    # Obtain reference sequences.
    refdict = snplists.Reference.open_fasta(_REFERENCE_FILES[strain])

    # Retrieve variant information and keep only samples of the requested strain.
    alldict = returnSampleDict()
    sampdict = {}
    for key in alldict:
        if alldict[key].strain.upper() == strain:
            sampdict[key] = alldict[key]
    keylist = sorted(sampdict)
    officialnamelist = [sampdict[key].smartid for key in keylist]

    # The output directory does not depend on the segment, so build it once.
    filepath = os.getcwd() + '/../FILES/output/'

    # Perform the L1 norm for each segment.
    for SEGMENT in refdict:
        refseq = refdict[SEGMENT]

        # Filter each sample's variants once per segment instead of once per
        # sample pair; the filter result does not depend on the partner.
        filtered = [
            _quality_variants(sampdict[key].varlist, SEGMENT, CUTOFF, COVERCUTOFF)
            for key in keylist
        ]

        # The diagonal stays 0 from np.zeros; the matrix is symmetric, so each
        # pair is computed once and mirrored.
        dismatrix = np.zeros((len(keylist), len(keylist)))
        for aidx, avars in enumerate(filtered):
            for bidx, bvars in enumerate(filtered[aidx + 1:], aidx + 1):
                distance = _pair_distance(avars, bvars, refseq)
                dismatrix[aidx, bidx] = distance
                dismatrix[bidx, aidx] = distance

        df = pd.DataFrame(dismatrix, index=officialnamelist, columns=officialnamelist)
        df.to_csv(filepath+somestrain+'.'+SEGMENT+'.dissim_all.csv', index=True, header=True, sep=',')