def test_contained(intervals, interval):
    """
    Check that pre-selection by bin never drops a contained interval.

    Every interval that lies completely within the query interval must also
    be selected when filtering on the bins that `binning.contained_bins`
    reports for the query interval.

    :arg intervals: Iterable of `(start, stop)` pairs to select from. It is
        iterated twice.
    :arg interval: Query interval as a `(start, stop)` pair.
    """
    start, stop = interval

    # Intervals completely contained by the query interval.
    contained = {(x, y) for x, y in intervals if start <= x and y <= stop}

    # The candidate bins depend on the query interval only, so compute them
    # once instead of once per interval.
    candidate_bins = binning.contained_bins(start, stop)

    # Pre-selection of intervals using binning.
    binned = {(x, y) for x, y in intervals
              if binning.assign_bin(x, y) in candidate_bins}

    assert binned.issuperset(contained)
def list_view(cls, begin, count, region, queries=None, order=None):
    """
    Returns a collection of variants in the `variant_collection` field.

    The variants are the distinct (chromosome, position, reference,
    observed) combinations of observations located in `region` and joined
    to a sample used by at least one of `queries`.

    The result is a tuple of the total number of such variants and the
    response holding the serialized items, where `begin` is used as offset
    and `count` as limit.
    """
    if not queries:
        queries = []

    # Todo: Document that `begin` and `end` are 1-based and inclusive. Or,
    # perhaps we should change that to conform to BED track regions.
    try:
        normalized = normalize_region(
            region['chromosome'], region['begin'], region['end'])
    except ReferenceMismatch as error:
        raise ValidationError(str(error))
    chromosome, begin_position, end_position = normalized

    for query in queries:
        if query.singleton:
            # Singleton queries get both requirements switched off before
            # authorization.
            query.require_active = False
            query.require_coverage_profile = False
        _authorize_query(query)

    # Set of samples IDs considered by all queries together.
    all_sample_ids = set()
    for query in queries:
        all_sample_ids.update(sample.id for sample in query.samples)

    # Set of observations considered by all queries together, built up in
    # stages: by region (with bin pre-selection), by sample, de-duplicated
    # per variant, and finally ordered.
    bins = binning.contained_bins(begin_position - 1, end_position)
    in_region = Observation.query.filter(
        Observation.chromosome == chromosome,
        Observation.position >= begin_position,
        Observation.position <= end_position,
        Observation.bin.in_(bins))
    in_samples = in_region.join(Variation).join(Sample).filter(
        Sample.id.in_(all_sample_ids))
    per_variant = in_samples.distinct(
        Observation.chromosome,
        Observation.position,
        Observation.reference,
        Observation.observed)

    # Each (field, direction) pair names a column attribute on Observation
    # and a method on that column which yields the ordering clause.
    ordering = []
    for field, direction in cls.get_order(order):
        column = getattr(Observation, field)
        ordering.append(getattr(column, direction)())
    observations = per_variant.order_by(*ordering)

    items = []
    for o in observations.limit(count).offset(begin):
        variant = (o.chromosome, o.position, o.reference, o.observed)
        items.append(cls.serialize(variant, queries=queries))

    # The total is counted on the unpaginated query.
    total = observations.count()
    collection = {'uri': cls.collection_uri(), 'items': items}
    return (total, jsonify(variant_collection=collection))
def test_contained_bins(start, stop, expected):
    """
    Check that `binning.contained_bins` yields `expected` for the given
    `start` and `stop`.
    """
    actual = binning.contained_bins(start, stop)
    assert actual == expected
def annotate_regions(original_regions, annotated_variants,
                     original_filetype='bed', annotated_filetype='csv',
                     queries=None, original_records=1):
    """
    Read regions from a file and write variant frequencies to another file.

    :arg original_regions: Open handle to a file with regions.
    :type original_regions: file-like object
    :arg annotated_variants: Open handle to write annotated variants to.
    :type annotated_variants: file-like object
    :kwarg original_filetype: Filetype for regions (currently only ``bed``
      allowed).
    :type original_filetype: str
    :kwarg annotated_filetype: Filetype for annotated variants (currently
      only ``csv`` allowed).
    :type annotated_filetype: str
    :arg queries: List of sample queries to compute frequencies over.
    :type queries: list of Query
    :arg original_records: Number of records in original regions file.
    :type original_records: int

    :raise ReadError: If `original_filetype` is not ``bed`` or
      `annotated_filetype` is not ``csv``.

    The output file contains the following columns for information on each
    variant:

    - ``CHROMOSOME``: Chromosome name in the reference genome.
    - ``POSITION``: One-based position of ``REFERENCE`` and ``OBSERVED`` on
      ``CHROMOSOME``.
    - ``REFERENCE``: Reference allele.
    - ``OBSERVED``: Observed (alternate) allele.

    Frequency information is annotated using several additional columns in
    the output file. For each query, we use the following columns, where the
    ``<Q>`` prefix is the query name:

    - ``<Q>_VN``: For each alternate allele, the number of individuals used
      for calculating ``<Q>_VF``, i.e., the number of individuals that have
      this region covered.
    - ``<Q>_VF``: For each alternate allele, the observed frequency, i.e.,
      the ratio of individuals in which the allele was observed.
    - ``<Q>_VF_HET``: For each alternate allele, the observed heterozygous
      frequency, i.e., the ratio of individuals in which the allele was
      observed heterozygous.
    - ``<Q>_VF_HOM``: For each alternate allele, the observed homozygous
      frequency, i.e., the ratio of individuals in which the allele was
      observed homozygous.

    Note that the ``<Q>_VF_HET`` and ``<Q>_VF_HOM`` values for a particular
    alternate allele might not add up to the ``<Q>_VF`` value, since there
    can be observations where the exact genotype is unknown.

    If the query specifies exactly one sample and that sample does not have
    coverage information, ``<Q>_VN`` is simply the number of individuals
    contained in the sample.
    """
    queries = queries or []

    if original_filetype != 'bed':
        raise ReadError('Original data must be in BED format')

    if annotated_filetype != 'csv':
        raise ReadError('Annotated data must be in CSV format')

    # Set of samples IDs that are considered by all queries together.
    all_sample_ids = {sample.id
                      for query in queries
                      for sample in query.samples}

    header_fields = ['CHROMOSOME', 'POSITION', 'REFERENCE', 'OBSERVED']

    # Header lines in CSV output for each query.
    for query in queries:
        header_fields.extend([query.name + '_VN', query.name + '_VF',
                              query.name + '_VF_HET', query.name + '_VF_HOM'])
        description = ('Number of individuals in %s having this region covered'
                       % query.name)
        if not query.require_coverage_profile:
            description += ' (or without coverage profile)'
        description += ' (out of %i considered).' % sum(
            sample.pool_size for sample in query.samples)

        # TODO: If it is a singleton query, removing the "... having this
        # region covered ..." part.

        # NOTE: `description` already ends with a period, so the line below
        # is written with a double period. The string is left untouched to
        # keep the output format unchanged.
        annotated_variants.write(
            '##' + query.name + '_VN: %s.\n' % description)
        annotated_variants.write(
            '##' + query.name + '_VF: Ratio of individuals in %s in which the '
            'allele was observed.\n' % query.name)
        annotated_variants.write(
            '##' + query.name + '_VF_HET: Ratio of individuals in %s in which the '
            'allele was observed as heterozygous.\n' % query.name)
        annotated_variants.write(
            '##' + query.name + '_VF_HOM: Ratio of individuals in %s in which the '
            'allele was observed as homozygous.\n' % query.name)

    annotated_variants.write('#' + '\t'.join(header_fields) + '\n')

    old_percentage = -1
    for current_record, chromosome, begin, end in read_regions(original_regions):
        # Progress is capped at 99 while regions are still being processed.
        percentage = min(int(current_record / original_records * 100), 99)
        if percentage > old_percentage:
            # Todo: Task state updating should be defined in the task itself,
            #     perhaps we can give values using a callback.
            try:
                current_task.update_state(state='PROGRESS',
                                          meta={'percentage': percentage})
            except AttributeError:
                # Hack for the unit tests where we call this not from within
                # a task.
                pass
            old_percentage = percentage

        # Set of observations considered by all queries together.
        bins = binning.contained_bins(begin - 1, end)
        observations = Observation.query.filter(
            Observation.chromosome == chromosome,
            Observation.position >= begin,
            Observation.position <= end,
            Observation.bin.in_(bins)
        ).join(Variation).join(Sample).filter(
            Sample.id.in_(all_sample_ids)
        ).distinct(
            Observation.chromosome,
            Observation.position,
            Observation.reference,
            Observation.observed
        ).order_by(
            Observation.chromosome,
            Observation.position,
            Observation.reference,
            Observation.observed,
            Observation.id
        )

        for observation in observations:
            fields = [observation.chromosome, observation.position,
                      observation.reference, observation.observed]

            # Iterate the queries themselves; wrapping them in `enumerate`
            # would yield (index, query) tuples, which have no `samples`
            # attribute.
            for query in queries:
                vn, vf = calculate_frequency(observation.chromosome,
                                             observation.position,
                                             observation.reference,
                                             observation.observed,
                                             samples=query.samples)
                fields.extend([vn, sum(vf.values()),
                               vf['heterozygous'], vf['homozygous']])

            # Todo: Stringify per value, not in one sweep.
            annotated_variants.write('\t'.join(str(f) for f in fields) + '\n')