def _gaFeatureForFeatureDbRecord(self, feature):
    """
    Converts a feature database row into a GA4GH protocol.Feature.

    :param feature: The DB Row representing a feature
    :return: the corresponding GA4GH protocol.Feature object
    """
    record = protocol.Feature()
    record.id = self.getCompoundIdForFeatureId(feature['id'])
    # A missing or empty parent id is reported as the empty string.
    parentId = ""
    if feature.get('parent_id'):
        parentId = self.getCompoundIdForFeatureId(feature['parent_id'])
    record.parent_id = parentId
    record.feature_set_id = self.getId()
    record.reference_name = pb.string(feature.get('reference_name'))
    record.start = pb.int(feature.get('start'))
    record.end = pb.int(feature.get('end'))
    record.name = pb.string(feature.get('name'))
    # Anything other than an explicit '-' defaults to the positive strand.
    isNegative = feature.get('strand', '') == '-'
    record.strand = (
        protocol.NEG_STRAND if isNegative else protocol.POS_STRAND)
    # child_ids is stored as a JSON-encoded list of feature ids.
    childIds = json.loads(feature['child_ids'])
    record.child_ids.extend(
        [self.getCompoundIdForFeatureId(childId) for childId in childIds])
    typeTerm = self._ontology.getGaTermByName(feature['type'])
    record.feature_type.CopyFrom(typeTerm)
    attributes = json.loads(feature['attributes'])
    # TODO: Identify which values are ExternalIdentifiers and OntologyTerms
    for key in attributes:
        values = attributes[key]
        for value in values:
            record.attributes.vals[key].values.add().string_value = value
    # The first gene_name attribute, when present, doubles as gene symbol.
    if 'gene_name' in attributes:
        geneNames = attributes['gene_name']
        if len(geneNames) > 0:
            record.gene_symbol = pb.string(geneNames[0])
    return record
def search_reference_sets(self, accession=None, md5checksum=None,
                          assembly_id=None):
    """
    Searches for ReferenceSets that satisfy the given filters.

    :param str accession: If not null, return the reference sets for which
        the `accession` matches this string (case-sensitive, exact match).
    :param str md5checksum: If not null, return the reference sets for
        which the `md5checksum` matches this string (case-sensitive, exact
        match). See :class:`ga4gh.protocol.ReferenceSet::md5checksum` for
        details.
    :param str assembly_id: If not null, return the reference sets for
        which the `assembly_id` matches this string (case-sensitive,
        exact match).
    :return: An iterator over the :class:`ga4gh.protocol.ReferenceSet`
        objects defined by the query parameters.
    """
    search_request = protocol.SearchReferenceSetsRequest()
    search_request.accession = pb.string(accession)
    search_request.md5checksum = pb.string(md5checksum)
    search_request.assembly_id = pb.string(assembly_id)
    search_request.page_size = pb.int(self._page_size)
    response_class = protocol.SearchReferenceSetsResponse
    return self._run_search_request(
        search_request, "referencesets", response_class)
def search_reference_sets(
        self, accession=None, md5checksum=None, assembly_id=None):
    """
    Builds a SearchReferenceSetsRequest from the given filters and runs it
    against the "referencesets" endpoint.

    :param str accession: If not null, return the reference sets for which
        the `accession` matches this string (case-sensitive, exact match).
    :param str md5checksum: If not null, return the reference sets for
        which the `md5checksum` matches this string (case-sensitive, exact
        match). See :class:`ga4gh.protocol.ReferenceSet::md5checksum` for
        details.
    :param str assembly_id: If not null, return the reference sets for
        which the `assembly_id` matches this string (case-sensitive,
        exact match).
    :return: An iterator over the :class:`ga4gh.protocol.ReferenceSet`
        objects defined by the query parameters.
    """
    req = protocol.SearchReferenceSetsRequest()
    req.accession = pb.string(accession)
    req.md5checksum = pb.string(md5checksum)
    req.assembly_id = pb.string(assembly_id)
    req.page_size = pb.int(self._page_size)
    results = self._run_search_request(
        req, "referencesets", protocol.SearchReferenceSetsResponse)
    return results
def toProtocolElement(self):
    """
    Returns the GA4GH protocol representation of this Dataset, with its
    id, name, description and encoded info values.
    """
    gaDataset = protocol.Dataset()
    gaDataset.id = self.getId()
    gaDataset.name = pb.string(self.getLocalId())
    gaDataset.description = pb.string(self.getDescription())
    # Keys come from getInfo(); the values are read from self._info.
    for key in self.getInfo():
        infoValues = gaDataset.info[key].values
        infoValues.extend(_encodeValue(self._info[key]))
    return gaDataset
def toProtocolElement(self):
    """
    Builds and returns a protocol.ReferenceSet populated from this
    ReferenceSet's accessors.
    """
    gaReferenceSet = protocol.ReferenceSet()
    gaReferenceSet.assembly_id = pb.string(self.getAssemblyId())
    gaReferenceSet.description = pb.string(self.getDescription())
    gaReferenceSet.id = self.getId()
    gaReferenceSet.is_derived = self.getIsDerived()
    gaReferenceSet.md5checksum = self.getMd5Checksum()
    gaReferenceSet.ncbi_taxon_id = pb.int(self.getNcbiTaxonId())
    sourceAccessions = self.getSourceAccessions()
    gaReferenceSet.source_accessions.extend(sourceAccessions)
    gaReferenceSet.source_uri = pb.string(self.getSourceUri())
    # The protocol-level name is this object's local id.
    gaReferenceSet.name = self.getLocalId()
    return gaReferenceSet
def search_reads(
        self, read_group_ids, reference_id=None, start=None, end=None):
    """
    Searches for Reads in the given read groups that satisfy the
    specified conditions.

    :param read_group_ids: The IDs of the
        :class:`ga4gh.protocol.ReadGroup` of interest. The value is
        passed to ``extend``, so it is iterated element by element.
    :param str reference_id: The name of the
        :class:`ga4gh.protocol.Reference` we wish to return reads
        mapped to.
    :param int start: The start position (0-based) of this query.
        If a reference is specified, this defaults to 0.
        Genomic positions are non-negative integers less than reference
        length. Requests spanning the join of circular genomes are
        represented as two requests one on each side of the join
        (position 0).
    :param int end: The end position (0-based, exclusive) of this query.
        If a reference is specified, this defaults to the reference's
        length.
    :return: An iterator over the
        :class:`ga4gh.protocol.ReadAlignment` objects defined by
        the query parameters.
    :rtype: iter
    """
    search_request = protocol.SearchReadsRequest()
    search_request.read_group_ids.extend(read_group_ids)
    search_request.reference_id = pb.string(reference_id)
    search_request.start = pb.int(start)
    search_request.end = pb.int(end)
    search_request.page_size = pb.int(self._page_size)
    response_class = protocol.SearchReadsResponse
    return self._run_search_request(search_request, "reads", response_class)
def testToProtocolElement(self):
    """
    Round-trips a valid Individual through populateFromJson and
    toProtocolElement, then checks that malformed JSON is rejected.
    """
    dataset = datasets.Dataset('dataset1')
    sexTerm = protocol.OntologyTerm()
    sexTerm.term = "male genotypic sex"
    sexTerm.id = "PATO:0020001"
    sexTerm.source_name = "PATO"
    sexTerm.source_version = pb.string("2015-11-18")
    # Write out a valid input
    print(protocol.toJsonDict(sexTerm))
    timestamp = "2016-05-19T21:00:19Z"
    validIndividual = protocol.Individual(
        name="test", created=timestamp, updated=timestamp, sex=sexTerm)
    validIndividual.info['test'].values.add().string_value = 'test-info'
    # pass through protocol creation
    individual = bioMetadata.Individual(dataset, "test")
    validJson = protocol.toJson(validIndividual)
    individual.populateFromJson(validJson)
    gaIndividual = individual.toProtocolElement()
    # Verify elements exist
    self.assertEqual(gaIndividual.created, validIndividual.created)
    self.assertEqual(gaIndividual.updated, validIndividual.updated)
    # Invalid input
    invalidIndividual = '{"bad:", "json"}'
    individual = bioMetadata.Individual(dataset, "test")
    # Should fail
    with self.assertRaises(exceptions.InvalidJsonException):
        individual.populateFromJson(invalidIndividual)
def testToProtocolElement(self):
    """
    Checks that an Individual populated from valid JSON keeps its
    created/updated timestamps, and that invalid JSON raises
    InvalidJsonException.
    """
    parentDataset = datasets.Dataset('dataset1')
    ontologyTerm = protocol.OntologyTerm()
    ontologyTerm.term = "male genotypic sex"
    ontologyTerm.id = "PATO:0020001"
    ontologyTerm.source_name = "PATO"
    ontologyTerm.source_version = pb.string("2015-11-18")
    # Write out a valid input
    print(protocol.toJsonDict(ontologyTerm))
    expected = protocol.Individual(
        name="test",
        created="2016-05-19T21:00:19Z",
        updated="2016-05-19T21:00:19Z",
        sex=ontologyTerm)
    expected.info['test'].values.add().string_value = 'test-info'
    # pass through protocol creation
    individual = bioMetadata.Individual(parentDataset, "test")
    individual.populateFromJson(protocol.toJson(expected))
    actual = individual.toProtocolElement()
    # Verify elements exist
    self.assertEqual(actual.created, expected.created)
    self.assertEqual(actual.updated, expected.updated)
    # Invalid input
    malformedJson = '{"bad:", "json"}'
    individual = bioMetadata.Individual(parentDataset, "test")
    # Should fail
    self.assertRaises(
        exceptions.InvalidJsonException,
        individual.populateFromJson,
        malformedJson)
def searchReads(
        self, readGroupIds, referenceId=None, start=None, end=None):
    """
    Searches for Reads in the given ReadGroups that satisfy the
    specified conditions.

    :param readGroupIds: The IDs of the
        :class:`ga4gh.protocol.ReadGroup` of interest. The value is
        passed to ``extend``, so it is iterated element by element.
    :param str referenceId: The name of the
        :class:`ga4gh.protocol.Reference` we wish to return reads
        mapped to.
    :param int start: The start position (0-based) of this query.
        If a reference is specified, this defaults to 0.
        Genomic positions are non-negative integers less than reference
        length. Requests spanning the join of circular genomes are
        represented as two requests one on each side of the join
        (position 0).
    :param int end: The end position (0-based, exclusive) of this query.
        If a reference is specified, this defaults to the reference's
        length.
    :return: An iterator over the
        :class:`ga4gh.protocol.ReadAlignment` objects defined by
        the query parameters.
    :rtype: iter
    """
    searchRequest = protocol.SearchReadsRequest()
    searchRequest.read_group_ids.extend(readGroupIds)
    searchRequest.reference_id = pb.string(referenceId)
    searchRequest.start = pb.int(start)
    searchRequest.end = pb.int(end)
    searchRequest.page_size = pb.int(self._pageSize)
    responseClass = protocol.SearchReadsResponse
    return self._runSearchRequest(searchRequest, "reads", responseClass)
def getSerializedResponse(self):
    """
    Stores the current next-page token on the response object built by
    this SearchResponseBuilder and returns that object serialized with
    toJson.
    """
    response = self._protoObject
    response.next_page_token = pb.string(self._nextPageToken)
    return toJson(response)
def searchReadGroupSets(self, datasetId, name=None, bioSampleId=None):
    """
    Searches the specified Dataset for ReadGroupSets that satisfy the
    given conditions.

    :param str datasetId: The ID of the dataset to search within; sent
        as the request's ``dataset_id``.
    :param str name: Only ReadGroupSets matching the specified name
        will be returned.
    :param str bioSampleId: Only ReadGroups matching the specified
        bioSample will be included in the response.
    :return: An iterator over the
        :class:`ga4gh.protocol.ReadGroupSet` objects defined by
        the query parameters.
    :rtype: iter
    """
    searchRequest = protocol.SearchReadGroupSetsRequest()
    searchRequest.dataset_id = datasetId
    searchRequest.name = pb.string(name)
    searchRequest.bio_sample_id = pb.string(bioSampleId)
    searchRequest.page_size = pb.int(self._pageSize)
    responseClass = protocol.SearchReadGroupSetsResponse
    return self._runSearchRequest(
        searchRequest, "readgroupsets", responseClass)
def searchBioSamples(self, datasetId, name=None, individualId=None):
    """
    Searches the given dataset for BioSamples that satisfy the
    specified conditions.

    :param str datasetId: The dataset to search within.
    :param str name: Only BioSamples matching the specified name will
        be returned.
    :param str individualId: Only BioSamples matching this id will be
        returned.
    :return: An iterator over the :class:`ga4gh.protocol.BioSample`
        objects defined by the query parameters.
    """
    searchRequest = protocol.SearchBioSamplesRequest()
    searchRequest.dataset_id = datasetId
    searchRequest.name = pb.string(name)
    searchRequest.individual_id = pb.string(individualId)
    searchRequest.page_size = pb.int(self._pageSize)
    responseClass = protocol.SearchBioSamplesResponse
    return self._runSearchRequest(
        searchRequest, "biosamples", responseClass)
def search_bio_samples(self, dataset_id, name=None, individual_id=None):
    """
    Searches the given dataset for BioSamples that satisfy the
    specified conditions.

    :param str dataset_id: The dataset to search within.
    :param str name: Only BioSamples matching the specified name will
        be returned.
    :param str individual_id: Only BioSamples matching this id will be
        returned.
    :return: An iterator over the :class:`ga4gh.protocol.BioSample`
        objects defined by the query parameters.
    """
    search_request = protocol.SearchBioSamplesRequest()
    search_request.dataset_id = dataset_id
    search_request.name = pb.string(name)
    search_request.individual_id = pb.string(individual_id)
    search_request.page_size = pb.int(self._page_size)
    response_class = protocol.SearchBioSamplesResponse
    return self._run_search_request(
        search_request, "biosamples", response_class)
def search_call_sets(self, variant_set_id, name=None, bio_sample_id=None):
    """
    Searches the specified VariantSet for CallSets that satisfy the
    given conditions.

    :param str variant_set_id: Find callsets belonging to the
        provided variant set.
    :param str name: Only CallSets matching the specified name will
        be returned.
    :param str bio_sample_id: Only CallSets matching this id will
        be returned.
    :return: An iterator over the :class:`ga4gh.protocol.CallSet`
        objects defined by the query parameters.
    """
    search_request = protocol.SearchCallSetsRequest()
    search_request.variant_set_id = variant_set_id
    search_request.name = pb.string(name)
    search_request.bio_sample_id = pb.string(bio_sample_id)
    search_request.page_size = pb.int(self._page_size)
    response_class = protocol.SearchCallSetsResponse
    return self._run_search_request(
        search_request, "callsets", response_class)
def searchCallSets(self, variantSetId, name=None, bioSampleId=None):
    """
    Searches the specified VariantSet for CallSets that satisfy the
    given conditions.

    :param str variantSetId: Find callsets belonging to the
        provided variant set.
    :param str name: Only CallSets matching the specified name will
        be returned.
    :param str bioSampleId: Only CallSets matching this id will
        be returned.
    :return: An iterator over the :class:`ga4gh.protocol.CallSet`
        objects defined by the query parameters.
    """
    searchRequest = protocol.SearchCallSetsRequest()
    searchRequest.variant_set_id = variantSetId
    searchRequest.name = pb.string(name)
    searchRequest.bio_sample_id = pb.string(bioSampleId)
    searchRequest.page_size = pb.int(self._pageSize)
    responseClass = protocol.SearchCallSetsResponse
    return self._runSearchRequest(searchRequest, "callsets", responseClass)
def searchReferences(
        self, referenceSetId, accession=None, md5checksum=None):
    """
    Searches the specified ReferenceSet for References that satisfy the
    given conditions.

    :param str referenceSetId: The ReferenceSet to search.
    :param str accession: If not None, return the references for which
        the `accession` matches this string (case-sensitive, exact
        match).
    :param str md5checksum: If not None, return the references for
        which the `md5checksum` matches this string (case-sensitive,
        exact match).
    :return: An iterator over the :class:`ga4gh.protocol.Reference`
        objects defined by the query parameters.
    """
    searchRequest = protocol.SearchReferencesRequest()
    searchRequest.reference_set_id = referenceSetId
    searchRequest.accession = pb.string(accession)
    searchRequest.md5checksum = pb.string(md5checksum)
    searchRequest.page_size = pb.int(self._pageSize)
    responseClass = protocol.SearchReferencesResponse
    return self._runSearchRequest(
        searchRequest, "references", responseClass)
def search_variants(self, variant_set_id, start=None, end=None,
                    reference_name=None, call_set_ids=None):
    """
    Searches the specified VariantSet for Variants that satisfy the
    given conditions.

    :param str variant_set_id: The ID of the
        :class:`ga4gh.protocol.VariantSet` of interest.
    :param int start: Required. The beginning of the window (0-based,
        inclusive) for which overlapping variants should be returned.
        Genomic positions are non-negative integers less than reference
        length. Requests spanning the join of circular genomes are
        represented as two requests one on each side of the join
        (position 0).
    :param int end: Required. The end of the window (0-based,
        exclusive) for which overlapping variants should be returned.
    :param str reference_name: The name of the
        :class:`ga4gh.protocol.Reference` we wish to return variants
        from.
    :param list call_set_ids: Only return variant calls which belong to
        call sets with these IDs. If an empty array, returns variants
        without any call objects. If null, returns all variant calls.
    :return: An iterator over the :class:`ga4gh.protocol.Variant`
        objects defined by the query parameters.
    :rtype: iter
    """
    search_request = protocol.SearchVariantsRequest()
    search_request.reference_name = pb.string(reference_name)
    search_request.start = pb.int(start)
    search_request.end = pb.int(end)
    search_request.variant_set_id = variant_set_id
    # NOTE(review): pb.string is applied to the whole call_set_ids value
    # (a list, or None) before extend; presumably it maps None to an
    # empty default and passes other values through -- confirm.
    search_request.call_set_ids.extend(pb.string(call_set_ids))
    search_request.page_size = pb.int(self._page_size)
    response_class = protocol.SearchVariantsResponse
    return self._run_search_request(
        search_request, "variants", response_class)
def search_references(
        self, reference_set_id, accession=None, md5checksum=None):
    """
    Searches the specified ReferenceSet for References that satisfy the
    given conditions.

    :param str reference_set_id: The ReferenceSet to search.
    :param str accession: If not None, return the references for which
        the `accession` matches this string (case-sensitive, exact
        match).
    :param str md5checksum: If not None, return the references for
        which the `md5checksum` matches this string (case-sensitive,
        exact match).
    :return: An iterator over the :class:`ga4gh.protocol.Reference`
        objects defined by the query parameters.
    """
    search_request = protocol.SearchReferencesRequest()
    search_request.reference_set_id = reference_set_id
    search_request.accession = pb.string(accession)
    search_request.md5checksum = pb.string(md5checksum)
    search_request.page_size = pb.int(self._page_size)
    response_class = protocol.SearchReferencesResponse
    return self._run_search_request(
        search_request, "references", response_class)
def toProtocolElement(self):
    """
    Builds and returns the protocol.FeatureSet that represents this
    FeatureSet.
    """
    element = protocol.FeatureSet()
    element.id = self.getId()
    # The owning dataset is this object's parent container.
    parentDataset = self.getParentContainer()
    element.dataset_id = parentDataset.getId()
    element.reference_set_id = pb.string(self._referenceSet.getId())
    element.name = self._name
    element.source_uri = self._sourceUri
    for key in self._info:
        infoValues = element.info[key].values
        infoValues.extend(self._info[key])
    return element
def toProtocolElement(self):
    """
    Builds and returns the protocol.ReadGroup that represents this
    ReadGroup.
    """
    # TODO this is very incomplete, but we don't have the
    # implementation to fill out the rest of the fields currently
    gaReadGroup = protocol.ReadGroup()
    gaReadGroup.id = self.getId()
    gaReadGroup.created = self._creationTime
    gaReadGroup.updated = self._updateTime
    # The dataset is two container levels above this read group.
    owningDataset = self.getParentContainer().getParentContainer()
    gaReadGroup.dataset_id = owningDataset.getId()
    gaReadGroup.name = self.getLocalId()
    insertSize = self.getPredictedInsertSize()
    gaReadGroup.predicted_insert_size = pb.int(insertSize)
    referenceSet = self._parentContainer.getReferenceSet()
    gaReadGroup.sample_id = pb.string(self.getSampleId())
    # reference_set_id is left unset when there is no reference set.
    if referenceSet is not None:
        gaReadGroup.reference_set_id = referenceSet.getId()
    gaReadGroup.stats.CopyFrom(self.getStats())
    gaReadGroup.programs.extend(self.getPrograms())
    gaReadGroup.description = pb.string(self.getDescription())
    gaReadGroup.experiment.CopyFrom(self.getExperiment())
    return gaReadGroup
def toProtocolElement(self):
    """
    Builds and returns the protocol.ReadGroup that represents this
    ReadGroup, including its sample name and bio sample id.
    """
    # TODO this is very incomplete, but we don't have the
    # implementation to fill out the rest of the fields currently
    gaReadGroup = protocol.ReadGroup()
    gaReadGroup.id = self.getId()
    gaReadGroup.created = self._creationTime
    gaReadGroup.updated = self._updateTime
    # The dataset is two container levels above this read group.
    owningDataset = self.getParentContainer().getParentContainer()
    gaReadGroup.dataset_id = owningDataset.getId()
    gaReadGroup.name = self.getLocalId()
    insertSize = self.getPredictedInsertSize()
    gaReadGroup.predicted_insert_size = pb.int(insertSize)
    referenceSet = self._parentContainer.getReferenceSet()
    gaReadGroup.sample_name = pb.string(self.getSampleName())
    gaReadGroup.bio_sample_id = pb.string(self.getBioSampleId())
    # reference_set_id is left unset when there is no reference set.
    if referenceSet is not None:
        gaReadGroup.reference_set_id = referenceSet.getId()
    gaReadGroup.stats.CopyFrom(self.getStats())
    gaReadGroup.programs.extend(self.getPrograms())
    gaReadGroup.description = pb.string(self.getDescription())
    gaReadGroup.experiment.CopyFrom(self.getExperiment())
    return gaReadGroup
def search_variants(
        self, variant_set_id, start=None, end=None, reference_name=None,
        call_set_ids=None):
    """
    Builds a SearchVariantsRequest from the given conditions and runs it
    against the "variants" endpoint.

    :param str variant_set_id: The ID of the
        :class:`ga4gh.protocol.VariantSet` of interest.
    :param int start: Required. The beginning of the window (0-based,
        inclusive) for which overlapping variants should be returned.
        Genomic positions are non-negative integers less than reference
        length. Requests spanning the join of circular genomes are
        represented as two requests one on each side of the join
        (position 0).
    :param int end: Required. The end of the window (0-based,
        exclusive) for which overlapping variants should be returned.
    :param str reference_name: The name of the
        :class:`ga4gh.protocol.Reference` we wish to return variants
        from.
    :param list call_set_ids: Only return variant calls which belong to
        call sets with these IDs. If an empty array, returns variants
        without any call objects. If null, returns all variant calls.
    :return: An iterator over the :class:`ga4gh.protocol.Variant`
        objects defined by the query parameters.
    :rtype: iter
    """
    req = protocol.SearchVariantsRequest()
    req.reference_name = pb.string(reference_name)
    req.start = pb.int(start)
    req.end = pb.int(end)
    req.variant_set_id = variant_set_id
    # NOTE(review): pb.string receives the whole call_set_ids value (a
    # list, or None) here; presumably it maps None to an empty default
    # and passes other values through unchanged -- confirm.
    req.call_set_ids.extend(pb.string(call_set_ids))
    req.page_size = pb.int(self._page_size)
    results = self._run_search_request(
        req, "variants", protocol.SearchVariantsResponse)
    return results
def searchCallSets(self, variantSetId, name=None):
    """
    Searches the specified VariantSet for CallSets that satisfy the
    given conditions.

    :param str variantSetId: The ID of the variant set to search; sent
        as the request's ``variant_set_id``.
    :param str name: Only CallSets matching the specified name will
        be returned.
    :return: An iterator over the :class:`ga4gh.protocol.CallSet`
        objects defined by the query parameters.
    """
    searchRequest = protocol.SearchCallSetsRequest()
    searchRequest.variant_set_id = variantSetId
    searchRequest.name = pb.string(name)
    searchRequest.page_size = pb.int(self._pageSize)
    responseClass = protocol.SearchCallSetsResponse
    return self._runSearchRequest(searchRequest, "callsets", responseClass)
def search_individuals(self, dataset_id, name=None):
    """
    Searches the given dataset for Individuals that satisfy the
    specified conditions.

    :param str dataset_id: The dataset to search within.
    :param str name: Only Individuals matching the specified name will
        be returned.
    :return: An iterator over the :class:`ga4gh.protocol.Individual`
        objects defined by the query parameters.
    """
    search_request = protocol.SearchIndividualsRequest()
    search_request.dataset_id = dataset_id
    search_request.name = pb.string(name)
    search_request.page_size = pb.int(self._page_size)
    response_class = protocol.SearchIndividualsResponse
    return self._run_search_request(
        search_request, "individuals", response_class)
def searchIndividuals(self, datasetId, name=None):
    """
    Searches the given dataset for Individuals that satisfy the
    specified conditions.

    :param str datasetId: The dataset to search within.
    :param str name: Only Individuals matching the specified name will
        be returned.
    :return: An iterator over the :class:`ga4gh.protocol.Individual`
        objects defined by the query parameters.
    """
    searchRequest = protocol.SearchIndividualsRequest()
    searchRequest.dataset_id = datasetId
    searchRequest.name = pb.string(name)
    searchRequest.page_size = pb.int(self._pageSize)
    responseClass = protocol.SearchIndividualsResponse
    return self._runSearchRequest(
        searchRequest, "individuals", responseClass)
def getExperiment(self):
    """
    Builds and returns the protocol.Experiment describing this read
    group's experiment.
    """
    gaExperiment = protocol.Experiment()
    gaExperiment.id = self.getExperimentId()
    instrumentModel = self.getInstrumentModel()
    gaExperiment.instrument_model = pb.string(instrumentModel)
    sequencingCenter = self.getSequencingCenter()
    gaExperiment.sequencing_center = pb.string(sequencingCenter)
    experimentDescription = self.getExperimentDescription()
    gaExperiment.description = pb.string(experimentDescription)
    gaExperiment.library = pb.string(self.getLibrary())
    gaExperiment.platform_unit = pb.string(self.getPlatformUnit())
    # Create and update times are both taken from self._iso8601.
    gaExperiment.message_create_time = self._iso8601
    gaExperiment.message_update_time = self._iso8601
    gaExperiment.run_time = pb.string(self.getRunTime())
    return gaExperiment
def getGaTermByName(self, name):
    """
    Builds a GA4GH OntologyTerm for the given term name.

    The term id is the first id returned by getTermIds for the name, or
    the empty string when no id is found.

    :param name: name of the ontology term, ex. "gene".
    :return: GA4GH OntologyTerm object.
    """
    # TODO what is the correct value when we have no mapping??
    # TODO add logging for missed term translation.
    # TODO what is the correct behaviour here when we have multiple
    # IDs matching a given name?
    matchingIds = self.getTermIds(name)
    resolvedId = matchingIds[0] if len(matchingIds) > 0 else ""
    gaTerm = protocol.OntologyTerm()
    gaTerm.term = name
    gaTerm.id = resolvedId
    gaTerm.source_name = self._sourceName
    gaTerm.source_version = pb.string(self._sourceVersion)
    return gaTerm
def toProtocolElement(self):
    """
    Returns the GA4GH protocol representation of this Dataset, carrying
    its id, name and description.
    """
    gaDataset = protocol.Dataset()
    gaDataset.id = self.getId()
    # The protocol-level name is this object's local id.
    localId = self.getLocalId()
    gaDataset.name = pb.string(localId)
    description = self.getDescription()
    gaDataset.description = pb.string(description)
    return gaDataset