Esempio n. 1
0
def multiset_banded_genotype_combinations(sample_genotypes, bandwidth):
    """Yield per-sample genotype combinations drawn from a band of indexes.

    For every multiset of ``len(sample_genotypes)`` indexes chosen from
    ``range(bandwidth)``, and for every permutation of that multiset, one
    list is yielded that holds, for each sample, the genotype stored at the
    index assigned to it.

    Args:
        sample_genotypes: sized, re-iterable sequence with one indexable
            collection of genotypes per sample.  Presumably every collection
            has at least ``bandwidth`` entries -- TODO confirm with callers.
        bandwidth: number of leading genotype indexes
            (``0 .. bandwidth - 1``) that may be assigned to a sample.

    Yields:
        A list with one genotype per sample, in ``sample_genotypes`` order.
    """
    # One index is needed per sample.  The count is taken from the argument
    # itself so the generator does not rely on any module-level state and
    # always matches the sequence it is zipped against below.
    sample_count = len(sample_genotypes)
    for index_combo in multiset.multichoose(sample_count, range(bandwidth)):
        for index_permutation in multiset.permutations(index_combo):
            yield [
                genotypes[index]
                for index, genotypes in zip(index_permutation,
                                            sample_genotypes)
            ]
Esempio n. 2
0
def data_likelihood_exact(genotype, observed_alleles):
    """'Exact' data likelihood of the observed alleles under a genotype.

    Sums, over all possible underlying 'true allele' assignments, the
    sampling probability of the assignment times the joint Q score of the
    observed alleles given that assignment.  The per-assignment terms are
    added together (multinomial term + ``likelihood_given_true_alleles``)
    and then combined with ``logsumexp``, so the computation is presumably
    carried out in log space -- confirm against the helper definitions.

    Args:
        genotype: sequence of ``(allele, count)`` pairs; the counts are
            summed to obtain the ploidy.
        observed_alleles: sequence of allele observation records; only its
            length is used here, the records themselves are passed on to
            ``likelihood_given_true_alleles``.

    Returns:
        The result of ``logsumexp`` over the per-assignment terms.
    """
    observation_count = len(observed_alleles)
    ploidy = sum(count for _, count in genotype)
    allele_probs = [count / float(ploidy) for _, count in genotype]
    # the genotype's alleles do not change inside the loops, so list them once
    genotype_alleles = [allele for allele, _ in genotype]
    probs = []
    # for all true allele combinations X permutations
    for true_allele_combination in multiset.multichoose(observation_count, genotype_alleles):
        for true_allele_permutation in multiset.permutations(true_allele_combination):
            # this mapping allows us to use sampling_prob the same way as we
            # do when we use JSON allele observation records
            true_alleles = [{'alt': allele} for allele in true_allele_permutation]
            allele_groups = group_alleles(true_alleles)
            # how many times each genotype allele occurs in this assignment;
            # alleles absent from the grouping count as zero
            observations = [
                len(allele_groups[allele]) if allele in allele_groups else 0
                for allele in genotype_alleles
            ]
            # NOTE: dirichlet_maximum_likelihood_ratio(allele_probs,
            # observations) was sketched as an alternative sampling term.
            lnsampling_prob = multinomialln(allele_probs, observations)
            probs.append(
                lnsampling_prob
                + likelihood_given_true_alleles(observed_alleles, true_alleles))
    # sum the individual probability of all combinations
    return logsumexp(probs)
Esempio n. 3
0
def data_likelihood_exact(genotype, observed_alleles):
    """'Exact' data likelihood of the observed alleles under a genotype.

    Sums, over all possible underlying 'true allele' assignments, the
    sampling probability of the assignment times the joint Q score of the
    observed alleles given that assignment.  The per-assignment terms are
    added together (multinomial term + ``likelihood_given_true_alleles``)
    and then combined with ``logsumexp``, so the computation is presumably
    carried out in log space -- confirm against the helper definitions.

    Args:
        genotype: sequence of ``(allele, count)`` pairs; the counts are
            summed to obtain the ploidy.
        observed_alleles: sequence of allele observation records; only its
            length is used here, the records themselves are passed on to
            ``likelihood_given_true_alleles``.

    Returns:
        The result of ``logsumexp`` over the per-assignment terms.
    """
    observation_count = len(observed_alleles)
    ploidy = sum(count for _, count in genotype)
    allele_probs = [count / float(ploidy) for _, count in genotype]
    # the genotype's alleles do not change inside the loops, so list them once
    genotype_alleles = [allele for allele, _ in genotype]
    probs = []
    # for all true allele combinations X permutations
    for true_allele_combination in multiset.multichoose(
            observation_count, genotype_alleles):
        for true_allele_permutation in multiset.permutations(
                true_allele_combination):
            # this mapping allows us to use sampling_prob the same way as we
            # do when we use JSON allele observation records
            true_alleles = [{
                'alt': allele
            } for allele in true_allele_permutation]
            allele_groups = group_alleles(true_alleles)
            # how many times each genotype allele occurs in this assignment;
            # alleles absent from the grouping count as zero
            observations = [
                len(allele_groups[allele]) if allele in allele_groups else 0
                for allele in genotype_alleles
            ]
            # NOTE: dirichlet_maximum_likelihood_ratio(allele_probs,
            # observations) was sketched as an alternative sampling term.
            lnsampling_prob = multinomialln(allele_probs, observations)
            probs.append(lnsampling_prob + likelihood_given_true_alleles(
                observed_alleles, true_alleles))
    # sum the individual probability of all combinations
    return logsumexp(probs)
Esempio n. 4
0
                    genotypes) in zip(index_permutation, sample_genotypes)]


def genotype_str(genotype):
    """Return the genotype as a string, each allele repeated ``count`` times.

    Args:
        genotype: iterable of ``(allele, count)`` pairs, where ``allele`` is
            a string and ``count`` a non-negative integer.

    Returns:
        The concatenation of ``allele * count`` over the pairs, in order.
        An empty genotype gives the empty string.
    """
    # join is used instead of a pairwise fold with operator.add: it needs no
    # helper, is linear in the output size, and copes with empty input.
    return ''.join(allele * count for allele, count in genotype)


if __name__ == '__main__':

    ploidy = 2  # assume ploidy 2 for all individuals and all positions

    potential_alleles = ['A', 'T', 'G', 'C']

    # genotypes are expressed as sets of allele frequencies
    genotypes = list_genotypes_to_count_genotypes(
        list(multiset.multichoose(ploidy, potential_alleles)))

    for line in sys.stdin:
        position = cjson.decode(line)
        #print position['position']
        samples = position['samples']

        position['coverage'] = sum([
            len(sample['alleles'])
            for samplename, sample in samples.iteritems()
        ])

        #potential_alleles = ['A','T','G','C']
        potential_alleles = set()
        for samplename, sample in samples.items():
            # only process snps and reference alleles
Esempio n. 5
0
def multiset_banded_genotype_combinations(sample_genotypes, bandwidth):
    """Yield per-sample genotype combinations drawn from a band of indexes.

    For every multiset of ``len(sample_genotypes)`` indexes chosen from
    ``range(bandwidth)``, and for every permutation of that multiset, one
    list is yielded that holds, for each sample, the genotype stored at the
    index assigned to it.

    Args:
        sample_genotypes: sized, re-iterable sequence with one indexable
            collection of genotypes per sample.  Presumably every collection
            has at least ``bandwidth`` entries -- TODO confirm with callers.
        bandwidth: number of leading genotype indexes
            (``0 .. bandwidth - 1``) that may be assigned to a sample.

    Yields:
        A list with one genotype per sample, in ``sample_genotypes`` order.
    """
    # One index is needed per sample.  The count is taken from the argument
    # itself so the generator does not rely on any module-level state and
    # always matches the sequence it is zipped against below.
    sample_count = len(sample_genotypes)
    for index_combo in multiset.multichoose(sample_count, range(bandwidth)):
        for index_permutation in multiset.permutations(index_combo):
            yield [genotypes[index] for index, genotypes in zip(index_permutation, sample_genotypes)]
Esempio n. 6
0
        for j in range(1, band_depth):  # band_depth is the depth to which we explore the bandwith... TODO explain better
            indexes = j * [i] + (len(sample_genotypes) - j) * [0]
            for index_permutation in multiset.permutations(indexes):
                yield [(sample, genotypes[index]) for index, (sample, genotypes) in zip(index_permutation, sample_genotypes)]

def genotype_str(genotype):
    """Return the genotype as a string, each allele repeated ``count`` times.

    Args:
        genotype: iterable of ``(allele, count)`` pairs, where ``allele`` is
            a string and ``count`` a non-negative integer.

    Returns:
        The concatenation of ``allele * count`` over the pairs, in order.
        An empty genotype gives the empty string.
    """
    # join is used instead of a pairwise reduction with operator.add: it is
    # available on every Python version without imports, is linear in the
    # output size, and copes with empty input.
    return ''.join(allele * count for allele, count in genotype)

if __name__ == '__main__':

    ploidy = 2  # assume ploidy 2 for all individuals and all positions

    potential_alleles = ['A', 'T', 'G', 'C']

    # genotypes are expressed as sets of allele frequencies
    genotypes = list_genotypes_to_count_genotypes(
        list(multiset.multichoose(ploidy, potential_alleles)))

    # every line read from stdin is decoded as one position record
    for line in sys.stdin:
        position = cjson.decode(line)
        samples = position['samples']

        # coverage is the number of allele records over all samples, counted
        # before the per-sample filtering below; only the sample records are
        # needed here, so iterate the values rather than (name, record) pairs
        position['coverage'] = sum(
            len(sample['alleles']) for sample in samples.values())

        # reset for this position (replaces the fixed A/T/G/C list above)
        potential_alleles = set()
        for samplename, sample in samples.items():
            # only process snps and reference alleles
            alleles = [
                allele for allele in sample['alleles']
                if allele['type'] in ['reference', 'snp']
            ]
            alleles = alleles_quality_to_lnprob(alleles)
            sample['alleles'] = alleles