def _update_db_md5(self, raw_data, name, source): """ :arg str raw_data: :arg unicode name: :arg unicode source: :returns: filename :rtype: unicode """ # TODO: Documentation. try: reference = Reference.query.filter_by(accession=name).one() current_md5sum = reference.checksum except NoResultFound: current_md5sum = None if current_md5sum: md5sum = self._calculate_hash(raw_data) if md5sum != current_md5sum: self._output.addMessage( __file__, -1, 'WHASH', 'Warning: Hash of {} changed from {} to {}.'.format( name, current_md5sum, md5sum)) Reference.query.filter_by(accession=name).update( {'checksum': md5sum}) session.commit() else: reference = Reference(name, self._calculate_hash(raw_data), source) session.add(reference) session.commit() return self._name_to_file(name)
def references(request, settings, db, available_references):
    """
    Add the reference entries selected by the fixture parameter to the
    database, copy their files into the cache directory, and return the
    created `Reference` instances.
    """
    if not hasattr(request, 'param'):
        return []

    # TODO: use pytest basepath or something?
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            'data')

    created = []
    for key in request.param:
        entry = available_references[key]
        # The key doubles as accession number unless one is given.
        accession = entry.get('accession', key)
        shutil.copy(os.path.join(data_dir, entry['filename']),
                    settings.CACHE_DIR)
        created.append(
            Reference(accession, entry['checksum'],
                      geninfo_identifier=entry.get('geninfo_id')))
        _add_links(settings, entry.get('links', []))

    db.session.add_all(created)
    db.session.commit()
    return created
def uploadrecord(self, raw_data):
    """
    Write an uploaded record to a file. If the downloaded file is
    recognised by its hash, the old UD number is used.

    :arg str raw_data: A GenBank record.

    :returns: Accession number for the uploaded file.
    :rtype: unicode
    """
    checksum = self._calculate_hash(raw_data)
    try:
        known = Reference.query.filter_by(checksum=checksum).one()
    except NoResultFound:
        known = None

    if known is None:
        # New content: allocate a fresh UD number and register it.
        accession = self._new_ud()
        if not self.write(raw_data, accession, 0):
            return None
        session.add(Reference(accession, checksum, 'upload'))
        session.commit()
        return accession

    # Known content: reuse the stored accession, re-creating the cache
    # file if it has disappeared.
    if os.path.isfile(self._name_to_file(known.accession)):
        return known.accession
    return self.write(raw_data, known.accession, 0) and known.accession
def sync_with_remote(self, remote_wsdl, url_template,
                     days=DEFAULT_CREATED_SINCE_DAYS):
    """
    Synchronize the local cache with the remote cache.

    ::

        >>> wsdl = 'https://mutalyzer.nl/mutalyzer/services/?wsdl'
        >>> template = 'https://mutalyzer.nl/mutalyzer/Reference/{file}'
        >>> self.sync_with_remote(wsdl, template)
        (14, 3)

    :arg remote_wsdl: The url of the remote SOAP WSDL description.
    :type remote_wsdl: unicode
    :arg url_template: Formatting string containing a ``{file}``
      occurence, see example usage above.
    :type url_template: unicode
    :arg days: Only remote entries added this number of days ago or
      later are considered.
    :type days: int

    :return: The number of entries added to the local cache and the number
      cache files downloaded from the remote site.
    :rtype: tuple(int, int)
    """
    self._output.addMessage(__file__, -1, 'INFO', 'Starting cache sync')

    created_since = datetime.today() - timedelta(days=days)
    remote_cache = self.remote_cache(remote_wsdl, created_since)

    inserted = downloaded = 0

    for entry in remote_cache:
        try:
            reference = Reference.query.filter_by(
                accession=entry['name']).one()
            if reference.checksum is not None:
                # We already have this entry including its checksum.
                continue
        except NoResultFound:
            pass
        if Reference.query.filter_by(checksum=entry['hash']).count() > 0:
            # The same content is already cached under another accession.
            continue
        reference = Reference(entry['name'], entry['hash'],
                              entry['source'],
                              source_data=entry['source_data'])
        session.add(reference)
        session.commit()
        inserted += 1
        if entry['source'] == 'upload' and entry['cached']:
            # Uploaded records cannot be re-fetched from their origin,
            # so download the cached file from the remote site.
            url = url_template.format(file=entry['cached'])
            self.store_remote_file(entry['name'], url)
            downloaded += 1

    self._output.addMessage(
        __file__, -1, 'INFO',
        'Inserted %d entries in the cache, downloaded %d files.'
        % (inserted, downloaded))
    self._output.addMessage(__file__, -1, 'INFO', 'Finished cache sync')

    return inserted, downloaded
def uploadrecord(self, raw_data) :
    """
    Write an uploaded record to a file. If the downloaded file is
    recognised by its hash, the old UD number is used.

    @arg raw_data: A GenBank record.
    @type raw_data: byte string

    @return: Accession number for the uploaded file.
    @rtype: unicode
    """
    md5sum = self._calcHash(raw_data)

    try:
        reference = Reference.query.filter_by(checksum=md5sum).one()
    except NoResultFound:
        # Never seen this content before: give it a new UD number and
        # register it in the database.
        UD = self._newUD()
        if self.write(raw_data, UD, 0):
            reference = Reference(UD, md5sum)
            session.add(reference)
            session.commit()
            return UD
    else:
        # Known content: reuse the stored accession, re-writing the
        # cache file if it has gone missing.
        if os.path.isfile(self._nametofile(reference.accession)):
            return reference.accession
        else:
            return self.write(raw_data, reference.accession, 0) and reference.accession
def downloadrecord(self, url):
    """
    Download a GenBank record from a URL. If the downloaded file is
    recognised by its hash, the old UD number is used.

    :arg unicode url: Location of a GenBank record.

    :returns: UD or None.
    :rtype: unicode
    """
    if not (url.startswith('http://') or url.startswith('https://') or
            url.startswith('ftp://')):
        self._output.addMessage(
            __file__, 4, 'ERECPARSE',
            'Only HTTP(S) or FTP locations are allowed.')
        return None

    handle = urllib2.urlopen(url)
    try:
        info = handle.info()
        if info.gettype() != 'text/plain':
            self._output.addMessage(__file__, 4, 'ERECPARSE',
                                    'This is not a GenBank record.')
            return None
        length = int(info['Content-Length'])
        if not 512 < length < settings.MAX_FILE_SIZE:
            self._output.addMessage(
                __file__, 4, 'EFILESIZE',
                'Filesize is not within the allowed boundaries.')
            return None
        raw_data = handle.read()
    finally:
        # Previously the connection was never closed (resource leak);
        # always close it, on error paths too.
        handle.close()

    md5sum = self._calculate_hash(raw_data)
    ud = None
    try:
        reference = Reference.query.filter_by(checksum=md5sum).one()
    except NoResultFound:
        ud = self._new_ud()
        if not os.path.isfile(self._name_to_file(ud)):
            ud = self.write(raw_data, ud, 0) and ud
        if ud:
            # Parsing went OK, add to DB.
            reference = Reference(ud, md5sum, source='url',
                                  source_data=url)
            session.add(reference)
            session.commit()
    else:
        # Known content: reuse the stored accession, re-writing the
        # cache file if it has gone missing.
        if (os.path.isfile(self._name_to_file(reference.accession)) or
                self.write(raw_data, reference.accession, 0)):
            ud = reference.accession

    # Returns the UD or None.
    return ud
def downloadrecord(self, url) :
    """
    Download a GenBank record from a URL. If the downloaded file is
    recognised by its hash, the old UD number is used.

    @arg url: Location of a GenBank record
    @type url: unicode

    @return: UD or None
    @rtype: unicode
    """
    if not (url.startswith('http://') or url.startswith('https://') or
            url.startswith('ftp://')):
        self._output.addMessage(__file__, 4, "ERECPARSE",
                                "Only HTTP(S) or FTP locations are allowed.")
        return None

    handle = urllib2.urlopen(url)
    info = handle.info()
    if info["Content-Type"] == "text/plain" :
        length = int(info["Content-Length"])
        if 512 < length < settings.MAX_FILE_SIZE:
            raw_data = handle.read()
            handle.close()
            md5sum = self._calcHash(raw_data)

            UD = None
            try:
                reference = Reference.query.filter_by(checksum=md5sum).one()
            except NoResultFound:
                UD = self._newUD()
                if not os.path.isfile(self._nametofile(UD)):
                    UD = self.write(raw_data, UD, 0) and UD
                if UD:
                    #Parsing went OK, add to DB
                    reference = Reference(UD, md5sum, download_url=url)
                    session.add(reference)
                    session.commit()
            else:
                # Known content: reuse the stored accession. The previous
                # implementation only set UD when the cache file was
                # missing, returning None for a known record whose file
                # was still present.
                if (os.path.isfile(self._nametofile(reference.accession)) or
                        self.write(raw_data, reference.accession, 0)):
                    UD = reference.accession

            return UD #Returns the UD or None
        else :
            self._output.addMessage(__file__, 4, "EFILESIZE",
                "Filesize is not within the allowed boundaries.")
            return None
        #if
    else :
        self._output.addMessage(__file__, 4, "ERECPARSE",
                                "This is not a GenBank record.")
        return None
    #if
def cache_with_references():
    """
    Populate the database and the cache directory with the configured
    reference entries and their transcript-protein links.
    """
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            'data')

    for name in references:
        entry = REFERENCES[name]
        # The entry name doubles as accession number unless one is given.
        shutil.copy(os.path.join(data_dir, entry['filename']),
                    settings.CACHE_DIR)
        session.add(
            Reference(entry.get('accession', name), entry['checksum'],
                      geninfo_identifier=entry.get('geninfo_id')))
        for transcript, protein in entry.get('links', []):
            session.add(TranscriptProteinLink(transcript, protein))

    session.commit()
def _updateDBmd5(self, raw_data, name, GI):
    """
    Store the MD5 hash of `raw_data` for reference `name` in the
    database, creating a new entry (with GenInfo identifier `GI`) if the
    reference has no checksum yet.

    @arg raw_data: Raw reference file contents to hash.
    @type raw_data: byte string
    @arg name: Accession number of the reference.
    @type name: unicode
    @arg GI: GenInfo identifier, stored as the reference's
        geninfo_identifier.
    @type GI: unicode

    @return: filename
    @rtype: unicode
    """
    try:
        reference = Reference.query.filter_by(accession=name).one()
        currentmd5sum = reference.checksum
    except NoResultFound:
        currentmd5sum = None

    if currentmd5sum :
        md5sum = self._calcHash(raw_data)
        if md5sum != currentmd5sum :
            # Content changed upstream; log a warning and store the new
            # checksum.
            self._output.addMessage(__file__, -1, "WHASH",
                "Warning: Hash of %s changed from %s to %s." % (
                name, currentmd5sum, md5sum))
            Reference.query.filter_by(accession=name).update({'checksum': md5sum})
            session.commit()
        #if
    else :
        # No checksum known yet; create a database entry.
        reference = Reference(name, self._calcHash(raw_data),
                              geninfo_identifier=GI)
        session.add(reference)
        session.commit()

    return self._nametofile(name)
def downloadrecord(self, url, name=None):
    """
    Download an LRG record from an URL.

    :arg unicode url: Location of the LRG record.

    :returns: The full path to the file or None in case of failure.
    :rtype: unicode
    """
    # Derive the LRG ID from the URL's filename unless a name is given.
    lrg_id = name or os.path.splitext(os.path.split(url)[1])[0]
    # if not lrg_id.startswith('LRG'):
    #     return None
    filename = self._name_to_file(lrg_id)

    # TODO: Properly read the file contents to a unicode string and write
    # it utf-8 encoded.
    handle = urllib2.urlopen(url)
    info = handle.info()
    if (info['Content-Type'] == 'application/xml' and
            'Content-length' in info):
        # Looks like a valid LRG file.
        length = int(info['Content-Length'])
        if 512 < length < settings.MAX_FILE_SIZE:
            raw_data = handle.read()
            handle.close()

            # Do an md5 check.
            md5sum = self._calculate_hash(raw_data)
            try:
                reference = Reference.query.filter_by(
                    accession=lrg_id).one()
                md5_db = reference.checksum
            except NoResultFound:
                md5_db = None

            if md5_db is None:
                # Note: The abstraction seems a bit off here, but we
                # prefer to set `Reference.source` to `lrg` and not to
                # `url`, since the former is more specific.
                reference = Reference(lrg_id, md5sum, 'lrg')
                session.add(reference)
                session.commit()
            elif md5sum != md5_db:
                # Hash has changed for the LRG ID.
                self._output.addMessage(
                    __file__, -1, 'WHASH',
                    'Warning: Hash of {} changed from {} to {}.'.format(
                        lrg_id, md5_db, md5sum))
                Reference.query.filter_by(accession=lrg_id).update(
                    {'checksum': md5sum})
                session.commit()
            else:
                # Hash the same as in db.
                pass

            if not os.path.isfile(filename):
                return self.write(raw_data, lrg_id)
            else:
                # This can only occur if synchronus calls to mutalyzer are
                # made to recover a file that did not exist. Still leaves
                # a window in between the check and the write.
                return filename
        else:
            self._output.addMessage(
                __file__, 4, 'EFILESIZE',
                'Filesize is not within the allowed boundaries.')
    else:
        self._output.addMessage(__file__, 4, 'ERECPARSE',
                                'This is not an LRG record.')
    handle.close()
def retrieveslice(self, accno, start, stop, orientation):
    """
    Retrieve a slice of a chromosome.

    If the arguments are recognised (found in the internal database),
    we look if the associated file is still present and if so: return
    its UD number. If the arguments are recognised but no file was
    found, we download the new slice and update the hash (and log if
    the hash changes). If the arguments are not recognised, we download
    the new slice and make a new UD number.

    The content of the slice is placed in the cache with the UD number
    as filename.

    :arg unicode accno: The accession number of the chromosome.
    :arg int start: Start position of the slice (one-based, inclusive, in
      reference orientation).
    :arg int stop: End position of the slice (one-based, inclusive, in
      reference orientation).
    :arg int orientation: Orientation of the slice:
                          - 1 ; Forward.
                          - 2 ; Reverse complement.

    :returns: An UD number.
    :rtype: unicode
    """
    # Not a valid slice.
    if start > stop:
        self._output.addMessage(
            __file__, 4, 'ERETR', 'Could not retrieve slice for start '
            'position greater than stop position.')
        return None

    # The slice can not be too big.
    if stop - start + 1 > settings.MAX_FILE_SIZE:
        self._output.addMessage(
            __file__, 4, 'ERETR', 'Could not retrieve slice (request '
            'exceeds maximum of %d bases)' % settings.MAX_FILE_SIZE)
        return None

    # Value of the Reference.source_data field for this slice.
    source_data = '{}:{}:{}:{}'.format(
        accno, start, stop, ['forward', 'reverse'][orientation - 1])

    # Check whether we have seen this slice before.
    reference = Reference.query.filter_by(
        source='ncbi_slice', source_data=source_data).first()
    if reference and os.path.isfile(self._name_to_file(
            reference.accession)):
        # It's still present.
        return reference.accession

    # It's not present, so download it.
    try:
        # EFetch `seq_start` and `seq_stop` are one-based, inclusive, and
        # in reference orientation.
        handle = Entrez.efetch(
            db='nuccore', rettype='gbwithparts', retmode='text',
            id=accno, seq_start=start, seq_stop=stop,
            strand=orientation)
        raw_data = handle.read()
        handle.close()
    except (IOError, urllib2.HTTPError, HTTPException) as e:
        self._output.addMessage(
            __file__, -1, 'INFO',
            'Error connecting to Entrez nuccore database: {}'.format(
                unicode(e)))
        self._output.addMessage(__file__, 4, 'ERETR',
                                'Could not retrieve slice.')
        return None

    # Calculate the hash of the downloaded file.
    md5sum = self._calculate_hash(raw_data)

    if reference is not None:
        # We have seen this one before.
        current_md5sum = reference.checksum
        if md5sum != current_md5sum:
            # Content changed upstream; log and store the new checksum.
            self._output.addMessage(
                __file__, -1, 'WHASH',
                'Warning: Hash of {} changed from {} to {}.'.format(
                    reference.accession, current_md5sum, md5sum))
            Reference.query.filter_by(
                accession=reference.accession).update({'checksum': md5sum})
            session.commit()
    else:
        # We haven't seen it before, so give it a name.
        ud = self._new_ud()
        reference = Reference(ud, md5sum, source='ncbi_slice',
                              source_data=source_data)
        session.add(reference)
        session.commit()

    if self.write(raw_data, reference.accession, 0):
        return reference.accession
def downloadrecord(self, url, name = None) :
    """
    Download an LRG record from an URL.

    @arg url: Location of the LRG record
    @type url: unicode

    @return: - filename ; The full path to the file
             - None     ; in case of failure
    @rtype: unicode
    """
    # Derive the LRG ID from the URL's filename unless a name is given.
    lrgID = name or os.path.splitext(os.path.split(url)[1])[0]
    #if not lrgID.startswith("LRG"):
    #    return None
    filename = self._nametofile(lrgID)

    # Todo: Properly read the file contents to a unicode string and write
    # it utf-8 encoded.
    handle = urllib2.urlopen(url)
    info = handle.info()
    if info["Content-Type"] == "application/xml" and info.has_key("Content-length"):
        # Looks like a valid LRG file.
        length = int(info["Content-Length"])
        if 512 < length < settings.MAX_FILE_SIZE:
            raw_data = handle.read()
            handle.close()

            #Do an md5 check
            md5sum = self._calcHash(raw_data)
            try:
                reference = Reference.query.filter_by(accession=lrgID).one()
                md5db = reference.checksum
            except NoResultFound:
                md5db = None

            if md5db is None:
                # First time we see this LRG ID; create a database entry.
                reference = Reference(lrgID, md5sum, download_url=url)
                session.add(reference)
                session.commit()
            elif md5sum != md5db:
                #hash has changed for the LRG ID
                self._output.addMessage(__file__, -1, "WHASH",
                    "Warning: Hash of %s changed from %s to %s." % (
                    lrgID, md5db, md5sum))
                Reference.query.filter_by(accession=lrgID).update({'checksum': md5sum})
                session.commit()
            else:
                #hash the same as in db
                pass

            if not os.path.isfile(filename) :
                return self.write(raw_data, lrgID)
            else:
                # This can only occur if synchronus calls to mutalyzer are
                # made to recover a file that did not exist. Still leaves
                # a window in between the check and the write.
                return filename
            #if
        else :
            self._output.addMessage(__file__, 4, "EFILESIZE",
                "Filesize is not within the allowed boundaries.")
        #if
    else :
        self._output.addMessage(__file__, 4, "ERECPARSE",
                                "This is not an LRG record.")
    handle.close()
def retrieveslice(self, accno, start, stop, orientation) :
    """
    Retrieve a slice of a chromosome.

    If the arguments are recognised (found in the internal database),
    we look if the associated file is still present and if so: return
    its UD number. If the arguments are recognised but no file was
    found, we download the new slice and update the hash (and log if
    the hash changes). If the arguments are not recognised, we download
    the new slice and make a new UD number.

    The content of the slice is placed in the cache with the UD number
    as filename.

    @arg accno: The accession number of the chromosome
    @type accno: unicode
    @arg start: Start position of the slice (one-based, inclusive)
    @type start: integer
    @arg stop: End position of the slice (one-based, inclusive)
    @type stop: integer
    @arg orientation: Orientation of the slice:
                      - 1 ; Forward
                      - 2 ; Reverse complement
    @type orientation: integer

    @return: An UD number
    @rtype: unicode
    """
    # Not a valid slice. Positions are one-based and inclusive, so a
    # single-base slice (start == stop) is valid; only start > stop is
    # rejected (was `start >= stop`).
    if start > stop :
        return None

    # The slice can not be too big. With inclusive positions the slice
    # length is stop - start + 1 (was `stop - start`, which let a slice
    # one base over the limit through).
    if stop - start + 1 > settings.MAX_FILE_SIZE:
        return None

    slice_orientation = ['forward', 'reverse'][orientation - 1]

    # Check whether we have seen this slice before.
    try:
        reference = Reference.query.filter_by(
            slice_accession=accno, slice_start=start, slice_stop=stop,
            slice_orientation=slice_orientation).one()
    except NoResultFound:
        reference = None
    else:
        if os.path.isfile(self._nametofile(reference.accession)) :
            # It's still present.
            return reference.accession

    # It's not present, so download it.
    try:
        handle = Entrez.efetch(db='nuccore', rettype='gb',
                               retmode='text', id=accno,
                               seq_start=start, seq_stop=stop,
                               strand=orientation)
        raw_data = handle.read()
        handle.close()
    except (IOError, urllib2.HTTPError, HTTPException) as e:
        self._output.addMessage(__file__, -1, 'INFO',
            'Error connecting to Entrez nuccore database: %s'
            % unicode(e))
        self._output.addMessage(__file__, 4, 'ERETR',
                                'Could not retrieve slice.')
        return None

    # Calculate the hash of the downloaded file.
    md5sum = self._calcHash(raw_data)

    if reference is not None:
        # We have seen this one before.
        currentmd5sum = reference.checksum
        if md5sum != currentmd5sum :
            # Content changed upstream; log and store the new checksum.
            self._output.addMessage(__file__, -1, "WHASH",
                "Warning: Hash of %s changed from %s to %s." % (
                reference.accession, currentmd5sum, md5sum))
            Reference.query.filter_by(
                accession=reference.accession).update(
                {'checksum': md5sum})
            session.commit()
        #if
    else :
        # We haven't seen it before, so give it a name.
        UD = self._newUD()
        reference = Reference(UD, md5sum, slice_accession=accno,
                              slice_start=start, slice_stop=stop,
                              slice_orientation=slice_orientation)
        session.add(reference)
        session.commit()
    #else

    if self.write(raw_data, reference.accession, 0):
        return reference.accession