Esempio n. 1
0
 def __init__(self, graph, verbose=0):
     """Create an `Imputation` bound to a graph.

     Builds a fresh `CypherQuery` and keeps the given graph handle
     (described as a py2neo graph by the original author).

     :param graph: graph handle stored on the instance as ``self.graph``.
     :param verbose: accepted for callers' convenience; this constructor
         does not read it.
     """
     self.cypher = CypherQuery()
     self.graph = graph
Esempio n. 2
0
 def __init__(self, graph, populations):
     """Create an `Imputation` bound to a graph and a population list.

     Builds a fresh `CypherQuery` and keeps the two arguments on the
     instance.

     :param graph: graph handle (described as a py2neo graph by the
         original author), stored as ``self.graph``.
     :param populations: the list of populations, stored as
         ``self.populations``.
     """
     self.cypher = CypherQuery()
     self.populations = populations
     self.graph = graph
Esempio n. 3
0
def test_build_query():
    """Check `CypherQuery.buildQuery` against every expected fixture.

    Each key of ``expected["buildQuery"]`` is passed as a one-element list
    and the result must equal the stored value; the key is used as the
    failure message.
    """
    builder = CypherQuery()
    for haplo, wanted in expected["buildQuery"].items():
        assert_equal(builder.buildQuery([haplo]), wanted, haplo)
Esempio n. 4
0
def test_get_path():
    """Check `CypherQuery.getPath` against every expected fixture.

    Each key of ``expected["getPath"]`` is passed as a one-element list
    and the result must equal the stored value; the key is used as the
    failure message.
    """
    builder = CypherQuery()
    for haplo, wanted in expected["getPath"].items():
        assert_equal(builder.getPath([haplo]), wanted, haplo)
Esempio n. 5
0
def test_get_locus():
    """Check `CypherQuery.getLocus` against every expected fixture.

    Each key of ``expected["getLocus"]`` is passed directly and the result
    must equal the stored value; the key is used as the failure message.
    """
    builder = CypherQuery()
    for allele, wanted in expected["getLocus"].items():
        assert_equal(builder.getLocus(allele), wanted, allele)
Esempio n. 6
0
def test_constructer():
    """Check that `CypherQuery` is a class whose instances report its name.

    Also pins the fully qualified type string, so the test fails if the
    class is moved to a different module.
    """
    instance = CypherQuery()
    assert_true(inspect.isclass(CypherQuery))
    assert_equal(type(instance).__name__, "CypherQuery")
    assert_equal(str(instance.__class__),
                 "<class \'hf_cypher.cypherQuery.CypherQuery\'>")
Esempio n. 7
0
class Imputation(object):
    """Impute phased haplotype pairs for genotypes written as GL strings.

    Candidate haplotypes are enumerated from the genotype and their
    per-population frequencies are fetched from a py2neo graph using
    queries built by a `CypherQuery` object.
    """

    def __init__(self, graph, populations):
        """Store the graph handle, a query builder and the populations.

        :param graph: graph handle; the methods of this class only call
            ``graph.data(query)`` on it.
        :param populations: sequence of population names. Position ``i``
            names the population of the ``i``-th frequency of a haplotype.
        """
        self.graph = graph
        self.cypher = CypherQuery()
        self.populations = populations

    def power_find(self, n):
        """Return the positions of the set bits of ``n``, lowest first.

        For example ``power_find(5) == [0, 2]`` because 5 = 2**0 + 2**2.

        :param n: non-negative integer.
        """
        # bin(n)[:1:-1] is the binary representation reversed, without the
        # leading "0b", so index 0 is the least significant bit.
        return [pos for pos, bit in enumerate(bin(n)[:1:-1]) if bit == '1']

    def gl2haps(self, GL_String):
        """Split a GL string into its two per-locus allele lists.

        Loci are separated by ``^`` and the two typings of a locus by ``+``.

        :param GL_String: genotype, e.g. ``"A*01:01+A*02:01^B*07:02+B*08:01"``.
        :returns: ``{'Genotype': [first, second], 'N_Loc': n_loci}``, or an
            empty list when some locus carries no ``+``.
        """
        loci = GL_String.split('^')
        first = []
        second = []
        for locus in loci:
            typings = locus.split('+')
            if len(typings) == 1:
                return []
            first.append(typings[0])
            second.append(typings[1])
        return {'Genotype': [first, second], 'N_Loc': len(loci)}

    def gen_phases(self, gen, n_loci):
        """Enumerate the distinct phasings of a genotype.

        Locus ambiguities (``/``) are not expanded here.

        :param gen: ``[first, second]`` allele lists as built by `gl2haps`.
        :param n_loci: number of loci in each list.
        :returns: ``{'Phases': [[hap1, hap2], ...], 'N_Phases': 2**(n_loci-1)}``
            where every haplotype is a sorted list of alleles and
            ``N_Phases`` counts the phasings tried, before de-duplication.
        """
        N_Phases = 2 ** (n_loci - 1)  # Total number of phases
        Phases = []
        seen = set()
        for i in range(N_Phases):
            # The set bits of i are the loci whose two alleles are swapped
            # between the haplotypes for this phasing.
            swapped = set(self.power_find(i))
            H1 = sorted(gen[1 if k in swapped else 0][k]
                        for k in range(n_loci))
            H2 = sorted(gen[0 if k in swapped else 1][k]
                        for k in range(n_loci))
            geno = "^".join(["~".join(H1), "~".join(H2)])
            if geno not in seen:
                seen.add(geno)
                Phases.append([H1, H2])

        return {'Phases': Phases, 'N_Phases': N_Phases}

    def open_ambiguities(self, hap, loc):
        """Expand the ``/``-separated allele ambiguity at one locus.

        The input haplotypes are left untouched; every returned haplotype
        is a new list.

        :param hap: list of haplotypes, each a list of allele strings.
        :param loc: index of the locus to expand.
        :returns: list with one haplotype per alternative allele at ``loc``.
        """
        hap_new = []
        for haplotype in hap:
            for allele in haplotype[loc].split('/'):
                expanded = list(haplotype)
                expanded[loc] = allele
                hap_new.append(expanded)
        return hap_new

    def comp_hap_prob(self, Hap, N_Loc, epsilon):
        """Look up the frequencies of a group of candidate haplotypes.

        :param Hap: haplotype candidates, passed on to `get_haplo_freqs`.
        :param N_Loc: unused.
        :param epsilon: passed on to `get_haplo_freqs`.
        :returns: ``{'Haps': names, 'Probs': frequencies}`` as parallel
            lists, or empty strings for both when nothing was found.
        """
        haplo_probs = self.get_haplo_freqs(Hap, epsilon)
        if not haplo_probs:
            return {'Haps': '', 'Probs': ''}
        return {'Haps': list(haplo_probs.keys()),
                'Probs': list(haplo_probs.values())}

    def _query_haplo_freqs(self, haplos, name_col, freq_col):
        """Query the graph and map the ``name_col`` values to ``freq_col``."""
        if not haplos:
            return {}
        # NOTE(review): as before, only the last candidate group is queried.
        # Every call site in this class passes exactly one group - confirm
        # before relying on multi-group input.
        haplos_joined = ["~".join(sorted(hap)) for hap in haplos[-1]]
        query = self.cypher.buildQuery(haplos_joined)
        fq = pa.DataFrame(self.graph.data(query))
        if fq.empty:
            return {}
        return fq.set_index(name_col)[freq_col].to_dict()

    def get_haplo_freqs(self, haplos, epsilon):
        """Return ``{haplotype name: frequency}`` read from the graph.

        Reads the ``abcqr.name`` and ``abcqr.frequency`` result columns.

        :param haplos: list of candidate groups; each group is a list of
            haplotypes and each haplotype a list of allele strings.
        :param epsilon: unused.
        :returns: dict, empty when ``haplos`` is empty or nothing matched.
        """
        return self._query_haplo_freqs(haplos, 'abcqr.name',
                                       'abcqr.frequency')

    def get_haplo_freqs_miss(self, haplos, epsilon):
        """Return ``{haplotype name: frequency}`` read from the graph.

        Same as `get_haplo_freqs` but reads the ``abc.name`` and
        ``abc.frequency`` result columns.

        :param haplos: list of candidate groups; each group is a list of
            haplotypes and each haplotype a list of allele strings.
        :param epsilon: unused.
        :returns: dict, empty when ``haplos`` is empty or nothing matched.
        """
        return self._query_haplo_freqs(haplos, 'abc.name', 'abc.frequency')

    def cal_prob(self, probs1, probs2, epsilon):
        """Find the population pairs whose joint frequency is large enough.

        :param probs1: frequencies of the first haplotype, one per population.
        :param probs2: frequencies of the second haplotype, one per population.
        :param epsilon: minimum product to keep; zero products are always
            dropped.
        :returns: list of ``[i, j]`` index pairs into the two inputs.
        """
        places = []
        for i, p1 in enumerate(probs1):
            for j, p2 in enumerate(probs2):
                joint = p1 * p2
                if joint >= epsilon and joint != 0:
                    places.append([i, j])
        return places

    def comp_phase_prob(self, phases, N_Loc, epsilon):
        """Collect the distinct haplotype pairs supported by the phases.

        :param phases: output of `open_phases`.
        :param N_Loc: number of loci, passed on to `comp_hap_prob`.
        :param epsilon: threshold passed on to `cal_prob`.
        :returns: ``{'Haps': ..., 'Probs': ..., 'Pops': ...}``, three
            parallel lists holding, per result, the two haplotype names,
            their two frequencies and their two population names.
        """
        geno_seen = set()
        hap_total = []
        p_total = []
        pop_res = []
        for phase in phases:
            P1 = self.comp_hap_prob(phase[0], N_Loc, epsilon)
            Haps1 = P1['Haps']
            Prob1 = P1['Probs']
            if len(Prob1) == 0:
                # No frequency for the first haplotype: skip the second query.
                continue
            P2 = self.comp_hap_prob(phase[1], N_Loc, epsilon)
            Haps2 = P2['Haps']
            Prob2 = P2['Probs']
            for hap1, freqs1 in zip(Haps1, Prob1):
                for hap2, freqs2 in zip(Haps2, Prob2):
                    for pop1, pop2 in self.cal_prob(freqs1, freqs2, epsilon):
                        # Avoid reporting the same haplotype pair twice.
                        h1_id = (hap1, self.populations[pop1])
                        h2_id = (hap2, self.populations[pop2])
                        geno_id = tuple(sorted([h1_id, h2_id]))
                        if geno_id in geno_seen:
                            continue
                        geno_seen.add(geno_id)
                        hap_total.append([geno_id[0][0], geno_id[1][0]])
                        pop_res.append([geno_id[0][1], geno_id[1][1]])
                        # Sorting may have swapped the pair, so the
                        # frequencies are recorded in the same order as the
                        # haplotype and population they belong to.
                        if geno_id[0] == h1_id:
                            p_total.append([freqs1[pop1], freqs2[pop2]])
                        else:
                            p_total.append([freqs2[pop2], freqs1[pop1]])
        return {'Haps': hap_total, 'Probs': p_total, 'Pops': pop_res}

    def open_phases(self, haps, N_Loc):
        """Expand the allele ambiguities of every haplotype of every phase.

        :param haps: list of ``[hap1, hap2]`` pairs from `gen_phases`.
        :param N_Loc: number of loci in each haplotype.
        :returns: one ``[[expanded1], [expanded2]]`` entry per input pair,
            where ``expandedN`` lists the unambiguous haplotypes.
        """
        phases = []
        for pair in haps:
            opened = []
            for k in range(2):
                hap_list = [pair[k]]
                for loc in range(N_Loc):
                    hap_list = self.open_ambiguities(hap_list, loc)
                opened.append([hap_list])
            phases.append(opened)
        return phases

    def comp_cand(self, gl_string, epsilon=0.0001):
        """Impute the haplotype pairs of one GL string.

        :param gl_string: genotype in GL string form.
        :param epsilon: starting threshold; it is divided by ten on every
            round and drops to zero once below ``1e-3``.
        :returns: result of `comp_phase_prob`, or ``None`` when the GL
            string has a locus without ``+``. When ``epsilon`` is not
            positive nothing is computed and the placeholder
            ``{'Haps': 'NaN', 'Probs': 0}`` is returned.
        """
        geno = self.gl2haps(gl_string)
        if not geno:
            # gl2haps signals a malformed genotype with an empty list.
            return None
        n_loc = geno['N_Loc']
        phased = self.gen_phases(geno['Genotype'], n_loc)
        phases = self.open_phases(phased['Phases'], n_loc)

        n_res = 0
        min_res = 10
        min_epsilon = 1.e-3
        res = {'Haps': 'NaN', 'Probs': 0}
        while epsilon > 0 and n_res < min_res:
            epsilon /= 10
            if epsilon < min_epsilon:
                epsilon = 0.0
            res = self.comp_phase_prob(phases, n_loc, epsilon)
            n_res = len(res['Haps'])

        return res

    def impute_file(self, fname):
        """Impute every subject of a file and write three report files.

        Input lines have the form ``ID%GL-string``; other lines are skipped.
        Files written under ``output/`` (created if missing), all named
        after the base name of ``fname``:

        * ``_out``: one line per result, without header:
          ID, Haplotype1, Probability1, Race1, Haplotype2, Probability2, Race2
        * ``_val``: ``count,ID`` for every subject
        * ``_miss``: the input line of every subject without any result

        :param fname: path of the input file.
        """
        print("Starting Imputation!\n")
        # TODO: do the right thing if its a gzip
        with open(fname, 'r') as f:
            filename = os.path.basename(fname)
            fout_name = 'output/' + filename + '_out'
            print("Output Imputation file: " + fout_name)
            os.makedirs('output', exist_ok=True)
            with open(fout_name, 'w') as fout, \
                    open('output/' + filename + '_val', 'w') as fout1, \
                    open('output/' + filename + '_miss', 'w') as fout2:
                for i, raw in enumerate(f):
                    line = raw.strip('\n')
                    name_gl = line.split('%')
                    if len(name_gl) != 2:
                        continue
                    res = self.comp_cand(name_gl[1], 0.0001)
                    if res is None:
                        # Malformed genotype: report it as a subject
                        # without results.
                        haps, probs, pops = [], [], []
                    else:
                        haps = res['Haps']
                        probs = res['Probs']
                        pops = res['Pops']
                    print(i, "Subject:", name_gl[0], len(haps))
                    if len(haps) == 0:
                        fout2.write(line + '\n')
                    fout1.write(str(len(haps)) + ',' + str(name_gl[0]) + '\n')
                    for j in range(len(haps)):
                        fout.write(
                            str(name_gl[0]) + ',' + str(haps[j][0]) + ',' +
                            str(probs[j][0]) + ',' + str(pops[j][0]) + ',' +
                            str(haps[j][1]) + ',' + str(probs[j][1]) + ',' +
                            str(pops[j][1]) + '\n')
Esempio n. 8
0
class Imputation(object):
    """Impute phased haplotype pairs for genotypes written as GL strings.

    Candidate haplotypes are enumerated from the genotype and their
    frequencies are fetched from a py2neo graph using queries built by a
    `CypherQuery` object.
    """

    def __init__(self, graph, verbose=0):
        """Store the graph handle and create a query builder.

        :param graph: graph handle; the methods of this class only call
            ``graph.data(query)`` on it.
        :param verbose: accepted but not read by any method of this class.
        """
        self.graph = graph
        self.cypher = CypherQuery()

    def power_find(self, n):
        """Return the positions of the set bits of ``n``, lowest first.

        For example ``power_find(5) == [0, 2]`` because 5 = 2**0 + 2**2.

        :param n: non-negative integer.
        """
        # bin(n)[:1:-1] is the binary representation reversed, without the
        # leading "0b", so index 0 is the least significant bit.
        return [pos for pos, bit in enumerate(bin(n)[:1:-1]) if bit == '1']

    def gl2haps(self, GL_String):
        """Split a GL string into its two per-locus allele lists.

        Loci are separated by ``^`` and the two typings of a locus by ``+``.

        :param GL_String: genotype, e.g. ``"A*01:01+A*02:01^B*07:02+B*08:01"``.
        :returns: ``{'Genotype': [first, second], 'N_Loc': n_loci}``, or an
            empty list when some locus carries no ``+``.
        """
        loci = GL_String.split('^')
        first = []
        second = []
        for locus in loci:
            typings = locus.split('+')
            if len(typings) == 1:
                # A locus without '+' has no second typing to read.
                return []
            first.append(typings[0])
            second.append(typings[1])
        return {'Genotype': [first, second], 'N_Loc': len(loci)}

    def gen_phases(self, gen, n_loci):
        """Enumerate the distinct phasings of a genotype.

        Locus ambiguities (``/``) are not expanded here.

        :param gen: ``[first, second]`` allele lists as built by `gl2haps`.
        :param n_loci: number of loci in each list.
        :returns: ``{'Phases': [[hap1, hap2], ...], 'N_Phases': 2**(n_loci-1)}``
            where every haplotype is a sorted list of alleles and
            ``N_Phases`` counts the phasings tried, before de-duplication.
        """
        N_Phases = 2 ** (n_loci - 1)  # Total number of phases
        Phases = []
        seen = set()
        for i in range(N_Phases):
            # The set bits of i are the loci whose two alleles are swapped
            # between the haplotypes for this phasing.
            swapped = set(self.power_find(i))
            H1 = sorted(gen[1 if k in swapped else 0][k]
                        for k in range(n_loci))
            H2 = sorted(gen[0 if k in swapped else 1][k]
                        for k in range(n_loci))
            geno = "^".join(["~".join(H1), "~".join(H2)])
            if geno not in seen:
                seen.add(geno)
                Phases.append([H1, H2])

        return {'Phases': Phases, 'N_Phases': N_Phases}

    def open_ambiguities(self, hap, loc):
        """Expand the ``/``-separated allele ambiguity at one locus.

        The input haplotypes are left untouched; every returned haplotype
        is a new list.

        :param hap: list of haplotypes, each a list of allele strings.
        :param loc: index of the locus to expand.
        :returns: list with one haplotype per alternative allele at ``loc``.
        """
        hap_new = []
        for haplotype in hap:
            for allele in haplotype[loc].split('/'):
                expanded = list(haplotype)
                expanded[loc] = allele
                hap_new.append(expanded)
        return hap_new

    def comp_hap_prob(self, Hap, N_Loc, epsilon):
        """Look up the frequencies of a group of candidate haplotypes.

        :param Hap: haplotype candidates, passed on to `get_haplo_freqs`.
        :param N_Loc: unused.
        :param epsilon: passed on to `get_haplo_freqs`.
        :returns: ``{'Haps': names, 'Probs': frequencies}`` as parallel
            lists, or empty strings for both when nothing was found.
        """
        haplo_probs = self.get_haplo_freqs(Hap, epsilon)
        if not haplo_probs:
            return {'Haps': '', 'Probs': ''}
        return {'Haps': list(haplo_probs.keys()),
                'Probs': list(haplo_probs.values())}

    def get_haplo_freqs(self, haplos, epsilon):
        """Return ``{haplotype name: frequency}`` read from the graph.

        Reads the ``abcqr.name`` and ``abcqr.frequency`` result columns.

        :param haplos: list of candidate groups; each group is a list of
            haplotypes and each haplotype a list of allele strings.
        :param epsilon: unused.
        :returns: dict, empty when ``haplos`` is empty or nothing matched.
        """
        if not haplos:
            return {}
        # NOTE(review): as before, only the last candidate group is queried.
        # Every call site in this class passes exactly one group - confirm
        # before relying on multi-group input.
        haplos_joined = ["~".join(sorted(hap)) for hap in haplos[-1]]
        haplo_query1 = self.cypher.buildQuery(haplos_joined)
        fq = pa.DataFrame(self.graph.data(haplo_query1))
        if fq.empty:
            return {}
        return fq.set_index('abcqr.name')['abcqr.frequency'].to_dict()

    def comp_phase_prob(self, phases, N_Loc, epsilon):
        """Collect the haplotype pairs whose joint frequency beats epsilon.

        :param phases: output of `open_phases`.
        :param N_Loc: number of loci, passed on to `comp_hap_prob`.
        :param epsilon: a pair is kept when the product of its two
            frequencies is strictly greater than this value.
        :returns: ``{'Haps': [[name1, name2], ...], 'Probs': [product, ...]}``
            as parallel lists.
        """
        hap_total = []
        p_total = []
        for phase in phases:
            P1 = self.comp_hap_prob(phase[0], N_Loc, epsilon)
            if len(P1['Probs']) == 0:
                # No frequency for the first haplotype: skip the second query.
                continue
            P2 = self.comp_hap_prob(phase[1], N_Loc, epsilon)
            for hap1, prob1 in zip(P1['Haps'], P1['Probs']):
                for hap2, prob2 in zip(P2['Haps'], P2['Probs']):
                    p_gen = prob1 * prob2
                    if p_gen > epsilon:
                        hap_total.append([hap1, hap2])
                        p_total.append(p_gen)
        return {'Haps': hap_total, 'Probs': p_total}

    def open_phases(self, haps, N_Loc):
        """Expand the allele ambiguities of every haplotype of every phase.

        :param haps: list of ``[hap1, hap2]`` pairs from `gen_phases`.
        :param N_Loc: number of loci in each haplotype.
        :returns: one ``[[expanded1], [expanded2]]`` entry per input pair,
            where ``expandedN`` lists the unambiguous haplotypes.
        """
        phases = []
        for pair in haps:
            opened = []
            for k in range(2):
                hap_list = [pair[k]]
                for loc in range(N_Loc):
                    hap_list = self.open_ambiguities(hap_list, loc)
                # Each side is wrapped in a one-element list, so the
                # sorted() the previous code applied to it was a no-op.
                opened.append([hap_list])
            phases.append(opened)
        return phases

    def comp_cand(self, gl_string, epsilon=0.0001):
        """Impute the haplotype pairs of one GL string.

        :param gl_string: genotype in GL string form.
        :param epsilon: starting threshold; it is divided by ten on every
            round and drops to zero once below ``1e-3``.
        :returns: result of `comp_phase_prob`, or ``None`` when the GL
            string has a locus without ``+``. When ``epsilon`` is not
            positive nothing is computed and the placeholder
            ``{'Haps': 'NaN', 'Probs': 0}`` is returned.
        """
        geno = self.gl2haps(gl_string)
        if not geno:
            # gl2haps signals a malformed genotype with an empty list.
            return None
        n_loc = geno['N_Loc']
        phased = self.gen_phases(geno['Genotype'], n_loc)
        phases = self.open_phases(phased['Phases'], n_loc)

        n_res = 0
        min_res = 10
        min_epsilon = 1.e-3
        res = {'Haps': 'NaN', 'Probs': 0}
        while epsilon > 0 and n_res < min_res:
            epsilon /= 10
            if epsilon < min_epsilon:
                epsilon = 0.0
            res = self.comp_phase_prob(phases, n_loc, epsilon)
            n_res = len(res['Haps'])

        return res

    def impute_file(self, fname):
        """Impute every subject of a file and write three report files.

        Input lines have the form ``ID%GL-string``; other lines are skipped.
        Files written next to the input, named ``fname`` plus a suffix:

        * ``_out``: ``ID,haplotype pair,probability`` per result
        * ``_val``: ``count,ID`` for every subject
        * ``_miss``: the input line of every subject without any result

        :param fname: path of the input file.
        """
        with open(fname, 'r') as f, \
                open(fname + '_out', 'w') as fout, \
                open(fname + '_val', 'w') as fout1, \
                open(fname + '_miss', 'w') as fout2:
            for raw in f:
                line = raw.strip('\n')
                name_gl = line.split('%')
                if len(name_gl) != 2:
                    continue
                res = self.comp_cand(name_gl[1], 0.0001)
                if res is None:
                    # Malformed genotype: report it as a subject without
                    # results.
                    m, m1 = [], []
                else:
                    m = res['Haps']
                    m1 = res['Probs']
                print(len(m), name_gl[0])
                if len(m) == 0:
                    fout2.write(line + '\n')
                fout1.write(str(len(m)) + ',' + str(name_gl[0]) + '\n')
                for pair, prob in zip(m, m1):
                    fout.write(str(name_gl[0]) + ',' + str(pair) + ',' +
                               str(prob) + '\n')