def test_containing(intervals, interval):
    """
    Check that bin-based pre-selection never drops a containing interval.

    Every interval in `intervals` that completely contains the query
    `interval` must also be selected when filtering `intervals` on the bins
    returned by `binning.containing_bins` for the query.

    :arg intervals: Intervals to select from, as `(start, stop)` pairs.
    :type intervals: iterable of (int, int)
    :arg interval: Query interval as a `(start, stop)` pair.
    :type interval: (int, int)
    """
    start, stop = interval

    # Intervals completely containing the query interval.
    containing = {(x, y) for x, y in intervals if x <= start and stop <= y}

    # The candidate bins only depend on the query interval, so compute them
    # once (as a set, for cheap membership tests) instead of once per
    # interval.
    candidate_bins = set(binning.containing_bins(start, stop))

    # Pre-selection of intervals using binning.
    binned = {(x, y) for x, y in intervals
              if binning.assign_bin(x, y) in candidate_bins}

    assert binned.issuperset(containing)
def annotate(input_handle, output_handle, ref, alt):
    """
    Annotate positions with dbSNP alleles, overlapping genes and diseases.

    The first line of `input_handle` is skipped; every following line is read
    as tab-separated `chrom`, `start`, `end`. For each line, the `refGene`
    table is queried for transcripts spanning `start`, the `snp151` table is
    queried for entries starting at `start`, and the DisGeNET API is queried
    for every gene name found. A header line followed by one tab-separated
    line per input line is written to `output_handle`.

    Transcript names, gene names and diseases are written in sorted
    transcript/gene order so that the output is reproducible between runs.

    :arg input_handle: Open readable handle to the tab-separated input.
    :type input_handle: file-like object
    :arg output_handle: Open writable handle receiving the annotated lines.
    :type output_handle: file-like object
    :arg ref: Value written to the `ref` column of every output line.
    :type ref: str
    :arg alt: Value written to the `alt` column of every output line.
    :type alt: str
    """
    connection = connector.connect(
        user='******', host='genome-euro-mysql.soe.ucsc.edu', port=3306,
        database='hg38')
    try:
        cursor = connection.cursor()
        try:
            # Skip the header line of the input.
            input_handle.readline()
            reader = DictReader(
                input_handle, fieldnames=['chrom', 'start', 'end'],
                delimiter='\t')

            output_handle.write('{}\n'.format('\t'.join([
                'chrom', 'start', 'end', 'ref', 'alt', 'alleles',
                'frequencies', 'transcripts', 'genes', 'phenotype'
            ])))

            for line in reader:
                position = int(line['start'])
                bins = list(containing_bins(position))

                # Only the number of placeholders is formatted into the
                # statement; all values (which originate from the input file)
                # are passed as query parameters and escaped by the driver.
                # NOTE(review): assumes `connector` uses the `%s` paramstyle
                # (as MySQL Connector/Python does) -- confirm.
                bin_placeholders = ', '.join(['%s'] * len(bins))

                cursor.execute(
                    ('SELECT name, name2 from refGene WHERE '
                     'chrom = %s AND bin IN ({}) AND '
                     'txStart <= %s AND txEnd >= %s').format(
                         bin_placeholders),
                    tuple([line['chrom']] + bins + [position, position]))
                # Transpose the rows into one set per selected column; fall
                # back to two empty sets when no transcript was found.
                names = ([set(column) for column in zip(*cursor)] or
                         [set(), set()])

                diseases = []
                for name in sorted(names[1]):
                    response = request(
                        'GET',
                        'https://www.disgenet.org/api/gda/gene/{}'.format(
                            name))
                    # Unsuccessful lookups are skipped on purpose: the
                    # disease annotation is best-effort.
                    if response.ok:
                        diseases.extend(
                            x['disease_name'] for x in response.json())

                cursor.execute(
                    ('SELECT alleles, alleleFreqs FROM snp151 WHERE '
                     'chrom = %s AND bin IN ({}) AND '
                     'chromStart = %s').format(bin_placeholders),
                    tuple([line['chrom']] + bins + [position]))
                # One ';'-joined string per selected column, with the
                # surrounding commas of every value stripped; two empty
                # columns when there is no entry at this position.
                result = [
                    ';'.join(y.decode().strip(',') for y in column)
                    for column in zip(*cursor)] or ['', '']

                output_handle.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    '\t'.join(line.values()), ref, alt, '\t'.join(result),
                    '\t'.join(','.join(sorted(x)) for x in names),
                    ';'.join(diseases)))
        finally:
            cursor.close()
    finally:
        # Release the database connection even when a query, an HTTP request
        # or a malformed input line raised.
        connection.close()
def calculate_frequency(chromosome, position, reference, observed,
                        samples=None):
    """
    Calculate frequency for a variant within a set of samples.

    :arg chromosome: Chromosome name.
    :type chromosome: str
    :arg position: One-based position where `reference` and `observed` start
      on the reference genome.
    :type position: int
    :arg reference: Reference sequence.
    :type reference: str
    :arg observed: Observed sequence.
    :type observed: str
    :arg samples: Calculate the frequency within these samples. Defaults to
      no samples.
    :type samples: list of Sample

    :return: A tuple of the number of individuals having coverage and a
      dictionary mapping every zygosity to the ratio of individuals having
      the observed allele with that zygosity. Without any coverage, all
      ratios are 0.
    :rtype: (int, dict)
    """
    samples = samples or []

    # Todo: Use constant definition for zygosity, probably shared with the
    #     one used in the models.
    zygosities = (None, 'homozygous', 'heterozygous')

    # One-based, inclusive end of the reference allele; an empty reference
    # (insertion) is treated as covering a single position.
    end_position = position + max(1, len(reference)) - 1
    bins = binning.containing_bins(position - 1, end_position)

    # Materialise the ids as lists before handing them to `in_()`, which is
    # documented for lists of values rather than for generators.
    sample_ids = [sample.id for sample in samples]
    profiled_ids = [sample.id for sample in samples
                    if sample.coverage_profile]

    # Coverage over samples with coverage profile. An empty IN can never
    # match, so the query is skipped when no sample has a coverage profile.
    if profiled_ids:
        coverage = Region.query.join(Coverage).filter(
            Region.bin.in_(bins),
            Region.chromosome == chromosome,
            Region.begin <= position,
            Region.end >= end_position,
            Coverage.sample_id.in_(profiled_ids)
        ).count()
    else:
        coverage = 0

    # Add the number of individuals in samples without coverage profile.
    coverage += sum(sample.pool_size for sample in samples
                    if not sample.coverage_profile)

    if not coverage:
        return 0, {zygosity: 0 for zygosity in zygosities}

    # Counts of observations per zygosity.
    counts = db.session.query(
        Observation.zygosity,
        func.sum(Observation.support)
    ).join(Variation).filter(
        Observation.bin.in_(bins),
        Observation.chromosome == chromosome,
        Observation.position == position,
        Observation.reference == reference,
        Observation.observed == observed,
        Variation.sample_id.in_(sample_ids)
    ).group_by(Observation.zygosity)

    # A Counter yields 0 for zygosities without any observation.
    counts = collections.Counter(dict(counts))

    frequency = {zygosity: counts[zygosity] / coverage
                 for zygosity in zygosities}

    return coverage, frequency
def test_containing_bins(start, stop, expected):
    """
    The result of `binning.containing_bins` for `start` and `stop` must
    equal `expected`.
    """
    bins = binning.containing_bins(start, stop)
    assert bins == expected