def setup(self):
    """
    Run the parent class setup, then create a `genbank.GBparser` instance
    and keep it on this object as `gb_parser`.
    """
    super(TestMutator, self).setup()
    self.gb_parser = genbank.GBparser()
def loadrecord(self, accession):
    """
    Load the RefSeq record for an accession number and return it parsed.

    The file backing the record is located by trying, in order:

    1. The cache, if the file is still present there.
    2. Re-creating it (when it was made by slicing) or re-downloading it
       (when it came from a URL or from the NCBI), based on the source
       stored with the reference in the database.
    3. Fetching it from the NCBI when the accession is not in the
       database at all.

    A record whose source is none of the above (an upload) cannot be
    restored once it has left the cache; an `ERETR` message is emitted.

    :arg unicode accession: A RefSeq accession number.

    :returns: A parsed RefSeq record, or `None` if no record could be
      found for the given accession or if the record is a protein
      reference (not supported).
    :rtype: object
    """
    known = Reference.query.filter_by(accession=accession).first()

    if known is None:
        # Not in the database: go straight to the NCBI.
        path = self.fetch(accession)
    else:
        path = self._name_to_file(known.accession)
        if not os.path.isfile(path):
            # The file left the cache; rebuild it from its recorded source.
            if known.source == 'ncbi_slice':
                # `source_data` holds four colon-separated fields:
                # accession, start, stop and orientation.
                orientations = {None: None, 'forward': 1, 'reverse': 2}
                parent, begin, end, strand = known.source_data.split(':')
                sliced = self.retrieveslice(
                    parent, int(begin), int(end), orientations[strand])
                if not sliced:
                    path = None
            elif known.source == 'url':
                # For URL references, `source_data` is passed on as is.
                downloaded = self.downloadrecord(known.source_data)
                if not downloaded:
                    path = None
            elif known.source == 'ncbi':
                path = self.fetch(known.accession)
            else:
                # Any other source is treated as an upload, which cannot
                # be recreated from here.
                self._output.addMessage(
                    __file__, 4, 'ERETR',
                    'Please upload this sequence again.')
                path = None

    if path is None:
        # No file could be obtained. Flag the accession so the batch job
        # skips all of its instances.
        self._output.addOutput('BatchFlags', ('S1', accession))
        return None

    # A file is available, so parse it.
    record = genbank.GBparser().create_record(path)
    record.id = known.accession if known else record.source_id

    # Todo: This will change once we support protein references.
    if not isinstance(record.seq.alphabet, ProteinAlphabet):
        return record

    self._output.addMessage(
        __file__, 4, 'ENOTIMPLEMENTED',
        'Protein reference sequences are not supported.')
    return None
def loadrecord(self, identifier):
    """
    Load a RefSeq record and return it.

    The record is found by trying the following options in order:

    1. Returned from the cache if it is there.
    2. Re-created (if it was created by slicing) or re-downloaded (if it
       was created by URL) if we have information on its source in the
       database.
    3. Fetched from the NCBI.

    An identifier starting with a digit is looked up as a geninfo
    identifier (GI); anything else, including an empty string, is looked
    up as an accession number.

    :arg identifier: A RefSeq accession number or geninfo identifier (GI).
    :type identifier: unicode

    :return: A parsed RefSeq record or `None` if no record could be found
      for the given identifier (or if the record is a protein reference,
      which is not supported).
    :rtype: mutalyzer.GenRecord.Record
    """
    # Slice instead of indexing so that an empty identifier does not raise
    # IndexError; it then follows the same path as any unknown accession.
    if identifier[:1].isdigit():
        # This is a GI number (geninfo identifier).
        reference = Reference.query \
            .filter_by(geninfo_identifier=identifier) \
            .first()
    else:
        # This is a RefSeq accession number.
        reference = Reference.query \
            .filter_by(accession=identifier) \
            .first()

    if reference is None:
        # We don't know it, fetch it from NCBI.
        filename = self.fetch(identifier)
    else:
        # We have seen it before.
        filename = self._nametofile(reference.accession)

        if os.path.isfile(filename):
            # It is still in the cache, so filename is valid.
            pass
        elif reference.slice_accession:
            # It was previously created by slicing.
            cast_orientation = {None: None, 'forward': 1, 'reverse': 2}
            if not self.retrieveslice(
                    reference.slice_accession,
                    reference.slice_start,
                    reference.slice_stop,
                    cast_orientation[reference.slice_orientation]):
                filename = None
        elif reference.download_url:
            # It was previously created by URL.
            if not self.downloadrecord(reference.download_url):
                filename = None
        elif reference.geninfo_identifier:
            # It was previously fetched from NCBI.
            filename = self.fetch(reference.accession)
        else:
            # It was previously created by uploading.
            self._output.addMessage(
                __file__, 4, 'ERETR',
                'Please upload this sequence again.')
            filename = None

    # If filename is None, we could not retrieve the record.
    if filename is None:
        # Notify batch job to skip all instances of identifier.
        self._output.addOutput('BatchFlags', ('S1', identifier))
        return None

    # Now we have the file, so we can parse it.
    genbank_parser = genbank.GBparser()
    record = genbank_parser.create_record(filename)

    # Prefer the accession stored in the database; for a record we did not
    # know before, use the identifier the parser put on the record.
    if reference is not None:
        record.id = reference.accession
    else:
        record.id = record.source_id

    # Todo: This will change once we support protein references.
    if isinstance(record.seq.alphabet, ProteinAlphabet):
        self._output.addMessage(
            __file__, 4, 'ENOTIMPLEMENTED',
            'Protein reference sequences are not supported.')
        return None

    return record