def test_containing(intervals, interval):
    """
    Check that bin-based pre-selection never misses a containing interval.

    Every interval that completely contains the query interval must also be
    selected when filtering on the bins returned by
    `binning.containing_bins`.

    :arg intervals: `(start, stop)` pairs to select from (iterated twice).
    :arg interval: Query interval as a `(start, stop)` pair.
    """
    start, stop = interval

    # Ground truth: intervals completely containing the query interval.
    containing = {(x, y) for x, y in intervals
                  if x <= start and stop <= y}

    # The candidate bins depend only on the query interval, so they are
    # looked up once instead of once per interval.
    candidate_bins = binning.containing_bins(start, stop)

    # Pre-selection of intervals using binning.
    binned = {(x, y) for x, y in intervals
              if binning.assign_bin(x, y) in candidate_bins}

    assert binned.issuperset(containing)
예제 #2
0
def annotate(input_handle, output_handle, ref, alt):
    """
    Annotate tab-separated genomic positions with gene, SNP and disease data.

    The first line of `input_handle` is skipped; every following line is read
    as the columns `chrom`, `start` and `end`. For each line the `refGene`
    and `snp151` tables of the UCSC `hg38` database are queried at `start`,
    and the DisGeNET API is queried for every gene name found. A header line
    followed by one tab-separated line per input line is written to
    `output_handle`.

    The database cursor and connection are closed even when a query, an HTTP
    request or a write fails.

    :arg input_handle: Readable text handle with tab-separated positions.
    :arg output_handle: Writable text handle receiving the annotated table.
    :arg ref: Value written verbatim to the `ref` column of every line.
    :arg alt: Value written verbatim to the `alt` column of every line.
    """
    connection = connector.connect(user='******',
                                   host='genome-euro-mysql.soe.ucsc.edu',
                                   port=3306,
                                   database='hg38')
    try:
        cursor = connection.cursor()
        try:
            _write_annotations(cursor, input_handle, output_handle, ref, alt)
        finally:
            cursor.close()
    finally:
        connection.close()


def _write_annotations(cursor, input_handle, output_handle, ref, alt):
    """Write the header and one annotated line per input position."""
    # Skip the first line of the input; it is not parsed as data.
    input_handle.readline()
    reader = DictReader(input_handle,
                        fieldnames=['chrom', 'start', 'end'],
                        delimiter='\t')
    output_handle.write('{}\n'.format('\t'.join([
        'chrom', 'start', 'end', 'ref', 'alt', 'alleles', 'frequencies',
        'transcripts', 'genes', 'phenotype'
    ])))

    for line in reader:
        chrom = line['chrom']
        position = int(line['start'])
        bins = list(containing_bins(position))

        # Values coming from the input file are passed as query parameters
        # (DB-API `%s` placeholders) instead of being formatted into the SQL
        # text. Only the number of placeholders is formatted in.
        bin_marks = ', '.join(['%s'] * len(bins))

        query = ('SELECT name, name2 from refGene WHERE '
                 'chrom = %s AND bin IN ({}) AND '
                 'txStart <= %s AND txEnd >= %s').format(bin_marks)
        cursor.execute(query, tuple([chrom] + bins + [position, position]))
        # Transpose the rows into one set per selected column; fall back to
        # two empty sets when no row matched.
        names = [set(column) for column in zip(*cursor)] or [set(), set()]

        # Look up diseases for every distinct `name2` value (second column).
        diseases = []
        for name in names[1]:
            response = request(
                'GET', 'https://www.disgenet.org/api/gda/gene/{}'.format(name))
            if response.ok:
                diseases += [x['disease_name'] for x in response.json()]

        query = ('SELECT alleles, alleleFreqs FROM snp151 WHERE '
                 'chrom = %s AND bin IN ({}) AND '
                 'chromStart = %s').format(bin_marks)
        cursor.execute(query, tuple([chrom] + bins + [position]))
        # One ';'-joined string per selected column; values are decoded and
        # stripped of surrounding commas. Two empty strings when no row
        # matched.
        result = [
            ';'.join(value.decode().strip(',') for value in column)
            for column in zip(*cursor)
        ] or ['', '']

        output_handle.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
            '\t'.join(line.values()), ref, alt, '\t'.join(result),
            '\t'.join(','.join(column) for column in names),
            ';'.join(diseases)))
예제 #3
0
파일: utils.py 프로젝트: varda/varda
def calculate_frequency(chromosome, position, reference, observed,
                        samples=None):
    """
    Calculate frequency for a variant within a set of samples.

    :arg chromosome: Chromosome name.
    :type chromosome: str
    :arg position: One-based position where `reference` and `observed` start
        on the reference genome
    :type position: int
    :arg reference: Reference sequence.
    :type reference: str
    :arg observed: Observed sequence.
    :type observed: str
    :arg samples: Calculate the frequency within these samples.
    :type samples: list of Sample

    :return: A tuple of the number of individuals having coverage and a
        dictionary with for every zygosity the ratio of individuals with
        observed allele and zygosity. Without any coverage, the result is
        zero with a ratio of zero for every zygosity.
    :rtype: (int, dict)
    """
    samples = samples or []

    # Todo: Use constant definition for zygosity, probably shared with the
    #     one used in the models.
    zygosities = (None, 'homozygous', 'heterozygous')

    end_position = position + max(1, len(reference)) - 1
    bins = binning.containing_bins(position - 1, end_position)

    # Materialize the sample ids once as lists, the documented argument form
    # for `in_()`; the original passed generators.
    sample_ids = [sample.id for sample in samples]
    profiled_ids = [sample.id for sample in samples
                    if sample.coverage_profile]

    # Coverage over samples with coverage profile. An empty IN list cannot
    # match any region, so the query is skipped in that case.
    if profiled_ids:
        coverage = Region.query.join(Coverage).filter(
            Region.bin.in_(bins),
            Region.chromosome == chromosome,
            Region.begin <= position,
            Region.end >= end_position,
            Coverage.sample_id.in_(profiled_ids)
        ).count()
    else:
        coverage = 0

    # Add the number of individuals in samples without coverage profile.
    coverage += sum(sample.pool_size for sample in samples
                    if not sample.coverage_profile)

    if not coverage:
        return 0, {zygosity: 0 for zygosity in zygosities}

    # Summed support of matching observations, per zygosity.
    counts = db.session.query(
        Observation.zygosity,
        func.sum(Observation.support)
    ).join(Variation).filter(
        Observation.bin.in_(bins),
        Observation.chromosome == chromosome,
        Observation.position == position,
        Observation.reference == reference,
        Observation.observed == observed,
        Variation.sample_id.in_(sample_ids)
    ).group_by(Observation.zygosity)

    # A Counter yields 0 for zygosities without any observation.
    counts = collections.Counter(dict(counts))

    # NOTE(review): the ratios rely on true division; under Python 2 this
    # needs `from __future__ import division` at file level -- confirm.
    frequency = {zygosity: counts[zygosity] / coverage
                 for zygosity in zygosities}

    return coverage, frequency
예제 #4
0
def test_containing_bins(start, stop, expected):
    """
    Check that `binning.containing_bins` returns the expected bins.

    :arg start: Start of the query interval.
    :arg stop: Stop of the query interval.
    :arg expected: Value the call must compare equal to.
    """
    found = binning.containing_bins(start, stop)
    assert found == expected