Example #1
0
    def _update_db_md5(self, raw_data, name, source):
        """
        Bring the checksum stored for a reference in line with its content.

        If no reference with a checksum is stored under `name`, a new
        reference is added. If a checksum is stored and it differs from the
        checksum of `raw_data`, a warning is emitted and the stored checksum
        is replaced.

        :arg str raw_data: Content to calculate the checksum of.
        :arg unicode name: Accession of the reference.
        :arg unicode source: Third argument for `Reference` when a new
          reference is added.

        :returns: filename (the result of `_name_to_file` for `name`).
        :rtype: unicode
        """
        try:
            stored_md5sum = Reference.query.filter_by(
                accession=name).one().checksum
        except NoResultFound:
            stored_md5sum = None

        if not stored_md5sum:
            # Nothing to compare with, so register the reference.
            session.add(
                Reference(name, self._calculate_hash(raw_data), source))
            session.commit()
            return self._name_to_file(name)

        new_md5sum = self._calculate_hash(raw_data)
        if new_md5sum != stored_md5sum:
            self._output.addMessage(
                __file__, -1, 'WHASH',
                'Warning: Hash of {} changed from {} to {}.'.format(
                    name, stored_md5sum, new_md5sum))
            Reference.query.filter_by(accession=name).update(
                {'checksum': new_md5sum})
            session.commit()

        return self._name_to_file(name)
Example #2
0
def references(request, settings, db, available_references):
    """
    Add the references selected by `request.param` to the database.

    For every selected key, the data file of the entry is copied to
    `settings.CACHE_DIR`, a `Reference` is created and the links of the
    entry are handed to `_add_links`. All created references are added to
    `db.session` and committed in one go at the end.

    :arg request: Object whose optional `param` attribute holds the keys to
      select from `available_references`.
    :arg settings: Object providing `CACHE_DIR`.
    :arg db: Object providing the database `session`.
    :arg available_references: Mapping from key to reference entry.

    :returns: The created references, or an empty list if `request` has no
      `param` attribute.
    :rtype: list
    """
    try:
        selected = request.param
    except AttributeError:
        return []

    # TODO: use pytest basepath or something?
    data_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data')

    created = []
    for key in selected:
        entry = available_references[key]
        # An entry without explicit accession is known by its key.
        accession = entry.get('accession', key)
        geninfo_id = entry.get('geninfo_id')

        shutil.copy(os.path.join(data_dir, entry['filename']),
                    settings.CACHE_DIR)

        created.append(Reference(accession,
                                 entry['checksum'],
                                 geninfo_identifier=geninfo_id))

        _add_links(settings, entry.get('links', []))

    db.session.add_all(created)
    db.session.commit()

    return created
Example #3
0
    def uploadrecord(self, raw_data):
        """
        Write an uploaded record to a file.

        If the record is recognised by its hash, the accession stored for
        that hash is reused (and the file is written again if it is no
        longer present). Otherwise the record is written under a new UD
        number and registered with source `upload`.

        :arg str raw_data: A GenBank record.

        :returns: Accession number for the uploaded file, or a false value
          if writing the file failed.
        :rtype: unicode
        """
        md5sum = self._calculate_hash(raw_data)

        try:
            known = Reference.query.filter_by(checksum=md5sum).one()
        except NoResultFound:
            known = None

        if known is None:
            # Unknown content: store it under a fresh UD number.
            accession = self._new_ud()
            if not self.write(raw_data, accession, 0):
                return None
            session.add(Reference(accession, md5sum, 'upload'))
            session.commit()
            return accession

        accession = known.accession
        if os.path.isfile(self._name_to_file(accession)):
            return accession
        # Known content, but the file is gone: restore it.
        return self.write(raw_data, accession, 0) and accession
Example #4
0
    def sync_with_remote(self, remote_wsdl, url_template,
                         days=DEFAULT_CREATED_SINCE_DAYS):
        """
        Synchronize the local cache with the remote cache.

        ::

            >>> wsdl = 'https://mutalyzer.nl/mutalyzer/services/?wsdl'
            >>> template = 'https://mutalyzer.nl/mutalyzer/Reference/{file}'
            >>> self.sync_with_remote(wsdl, template)
            (14, 3)

        A remote entry is skipped when a local reference with the same name
        already has a checksum, or when any local reference has the same
        checksum. Every other entry is inserted; for an inserted upload that
        is cached remotely, the file is retrieved as well.

        :arg remote_wsdl: The url of the remote SOAP WSDL description.
        :type remote_wsdl: unicode
        :arg url_template: Formatting string containing a ``{file}``
          occurrence, see example usage above.
        :type url_template: unicode
        :arg days: Only remote entries added this number of days ago or
          later are considered.
        :type days: int

        :return: The number of entries added to the local cache and the
          number of cache files downloaded from the remote site.
        :rtype: tuple(int, int)
        """
        self._output.addMessage(__file__, -1, 'INFO', 'Starting cache sync')

        created_since = datetime.today() - timedelta(days=days)

        inserted = 0
        downloaded = 0

        for entry in self.remote_cache(remote_wsdl, created_since):
            name = entry['name']

            try:
                known = Reference.query.filter_by(accession=name).one()
            except NoResultFound:
                known = None

            # Already complete locally under this name.
            if known is not None and known.checksum is not None:
                continue

            # Same content is already present locally.
            if Reference.query.filter_by(checksum=entry['hash']).count() > 0:
                continue

            session.add(Reference(name, entry['hash'], entry['source'],
                                  source_data=entry['source_data']))
            session.commit()
            inserted += 1

            if entry['source'] == 'upload' and entry['cached']:
                self.store_remote_file(
                    name, url_template.format(file=entry['cached']))
                downloaded += 1

        summary = ('Inserted %d entries in the cache,'
                   ' downloaded %d files.' % (inserted, downloaded))
        self._output.addMessage(__file__, -1, 'INFO', summary)
        self._output.addMessage(__file__, -1, 'INFO', 'Finished cache sync')

        return inserted, downloaded
Example #5
0
    def uploadrecord(self, raw_data):
        """
        Write an uploaded record to a file.

        If the record is recognised by its hash, the accession stored for
        that hash is reused (and the file is written again if it is no
        longer present). Otherwise the record is written under a new UD
        number and registered in the database.

        @arg raw_data: A GenBank record.
        @type raw_data: byte string

        @return: Accession number for the uploaded file, or a false value if
            writing the file failed.
        @rtype: unicode
        """
        md5sum = self._calcHash(raw_data)

        try:
            known = Reference.query.filter_by(checksum=md5sum).one()
        except NoResultFound:
            known = None

        if known is None:
            # Unknown content: store it under a fresh UD number.
            accession = self._newUD()
            if not self.write(raw_data, accession, 0):
                return None
            session.add(Reference(accession, md5sum))
            session.commit()
            return accession

        accession = known.accession
        if os.path.isfile(self._nametofile(accession)):
            return accession
        # Known content, but the file is gone: restore it.
        return self.write(raw_data, accession, 0) and accession
Example #6
0
    def downloadrecord(self, url):
        """
        Download a GenBank record from a URL.

        If the downloaded file is recognised by its hash, the old UD number
        is used. Otherwise the record is written under a new UD number and
        registered in the database with source `url`.

        The connection is closed before returning, also when reading from it
        fails. A response without a ``Content-Length`` header is reported as
        not being a GenBank record, since its size cannot be checked against
        the allowed boundaries.

        :arg unicode url: Location of a GenBank record.

        :returns: UD or None.
        :rtype: unicode
        """
        if not url.startswith(('http://', 'https://', 'ftp://')):
            self._output.addMessage(
                __file__, 4, 'ERECPARSE',
                'Only HTTP(S) or FTP locations are allowed.')
            return None

        handle = urllib2.urlopen(url)
        try:
            info = handle.info()
            if (info.gettype() != 'text/plain'
                    or 'Content-Length' not in info):
                self._output.addMessage(__file__, 4, 'ERECPARSE',
                                        'This is not a GenBank record.')
                return None

            length = int(info['Content-Length'])
            if not 512 < length < settings.MAX_FILE_SIZE:
                self._output.addMessage(
                    __file__, 4, 'EFILESIZE',
                    'Filesize is not within the allowed boundaries.')
                return None

            raw_data = handle.read()
        finally:
            # Release the connection on every path, including errors.
            handle.close()

        md5sum = self._calculate_hash(raw_data)

        try:
            reference = Reference.query.filter_by(checksum=md5sum).one()
        except NoResultFound:
            reference = None

        if reference is None:
            ud = self._new_ud()
            if not os.path.isfile(self._name_to_file(ud)):
                ud = self.write(raw_data, ud, 0) and ud
            if ud:
                # Parsing went OK, add to DB.
                session.add(Reference(ud,
                                      md5sum,
                                      source='url',
                                      source_data=url))
                session.commit()
            # Returns the UD or a false value if writing failed.
            return ud

        # Known content: reuse its accession, restoring the file if needed.
        if (os.path.isfile(self._name_to_file(reference.accession))
                or self.write(raw_data, reference.accession, 0)):
            return reference.accession
        return None
Example #7
0
    def downloadrecord(self, url):
        """
        Download a GenBank record from a URL.

        If the downloaded file is recognised by its hash, the old UD number
        is used: it is returned when the cache file is still present or can
        be written again. Otherwise the record is written under a new UD
        number and registered in the database together with the URL.

        The connection is closed before returning, also when reading from it
        fails. A response without a Content-Length header is reported as not
        being a GenBank record, since its size cannot be checked against the
        allowed boundaries.

        @arg url: Location of a GenBank record
        @type url: unicode

        @return: UD or None
        @rtype: unicode
        """
        if not url.startswith(('http://', 'https://', 'ftp://')):
            self._output.addMessage(__file__, 4, "ERECPARSE",
                                    "Only HTTP(S) or FTP locations are allowed.")
            return None

        handle = urllib2.urlopen(url)
        try:
            info = handle.info()
            if (info["Content-Type"] != "text/plain"
                    or "Content-Length" not in info):
                self._output.addMessage(__file__, 4, "ERECPARSE",
                                        "This is not a GenBank record.")
                return None

            length = int(info["Content-Length"])
            if not 512 < length < settings.MAX_FILE_SIZE:
                self._output.addMessage(
                    __file__, 4, "EFILESIZE",
                    "Filesize is not within the allowed boundaries.")
                return None

            raw_data = handle.read()
        finally:
            # Release the connection on every path, including errors.
            handle.close()

        md5sum = self._calcHash(raw_data)

        try:
            reference = Reference.query.filter_by(checksum=md5sum).one()
        except NoResultFound:
            reference = None

        if reference is None:
            UD = self._newUD()
            if not os.path.isfile(self._nametofile(UD)):
                UD = self.write(raw_data, UD, 0) and UD
            if UD:
                # Parsing went OK, add to DB.
                session.add(Reference(UD, md5sum, download_url=url))
                session.commit()
            # Returns the UD or a false value if writing failed.
            return UD

        # Known content: the accession is valid when the file is present
        # (previously this case wrongly yielded None) or can be restored.
        if (os.path.isfile(self._nametofile(reference.accession))
                or self.write(raw_data, reference.accession, 0)):
            return reference.accession
        return None
Example #8
0
    def cache_with_references():
        """
        Fill the cache and the database with the selected references.

        For every key in `references`, the data file of its `REFERENCES`
        entry is copied to `settings.CACHE_DIR`, and a `Reference` plus a
        `TranscriptProteinLink` for each of its links is added to the
        session. Everything is committed once at the end.
        """
        data_dir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'data')

        for key in references:
            entry = REFERENCES[key]
            # An entry without explicit accession is known by its key.
            accession = entry.get('accession', key)
            geninfo_id = entry.get('geninfo_id')

            shutil.copy(os.path.join(data_dir, entry['filename']),
                        settings.CACHE_DIR)

            session.add(Reference(accession,
                                  entry['checksum'],
                                  geninfo_identifier=geninfo_id))

            for link in entry.get('links', []):
                transcript, protein = link
                session.add(TranscriptProteinLink(transcript, protein))

        session.commit()
Example #9
0
    def _updateDBmd5(self, raw_data, name, GI):
        """
        Bring the checksum stored for a reference in line with its content.

        If no reference with a checksum is stored under `name`, a new
        reference is added. If a checksum is stored and it differs from the
        checksum of `raw_data`, a warning is emitted and the stored checksum
        is replaced.

        @arg raw_data: Content to calculate the checksum of
        @type raw_data:
        @arg name: Accession of the reference
        @type name:
        @arg GI: Stored as `geninfo_identifier` when a new reference is added
        @type GI:

        @return: filename (the result of `_nametofile` for `name`)
        @rtype: unicode
        """
        try:
            stored_md5sum = Reference.query.filter_by(
                accession=name).one().checksum
        except NoResultFound:
            stored_md5sum = None

        if not stored_md5sum:
            # Nothing to compare with, so register the reference.
            session.add(Reference(name, self._calcHash(raw_data),
                                  geninfo_identifier=GI))
            session.commit()
            return self._nametofile(name)

        new_md5sum = self._calcHash(raw_data)
        if new_md5sum != stored_md5sum:
            self._output.addMessage(
                __file__, -1, "WHASH",
                "Warning: Hash of %s changed from %s to %s." % (
                    name, stored_md5sum, new_md5sum))
            Reference.query.filter_by(accession=name).update(
                {'checksum': new_md5sum})
            session.commit()

        return self._nametofile(name)
Example #10
0
    def downloadrecord(self, url, name=None):
        """
        Download an LRG record from an URL.

        The checksum of the downloaded record is registered in the database
        under the LRG id, or updated (with a warning) if it differs from the
        one already stored. The record is written to the cache unless its
        file is already present.

        :arg unicode url: Location of the LRG record.
        :arg unicode name: LRG id to use; by default it is derived from the
          last path component of `url`, without extension.

        :returns: The full path to the file or None in case of failure.
        :rtype: unicode
        """
        lrg_id = name or os.path.splitext(os.path.basename(url))[0]
        filename = self._name_to_file(lrg_id)

        # TODO: Properly read the file contents to a unicode string and write
        # it utf-8 encoded.
        handle = urllib2.urlopen(url)
        info = handle.info()

        if not (info['Content-Type'] == 'application/xml'
                and 'Content-length' in info):
            self._output.addMessage(__file__, 4, 'ERECPARSE',
                                    'This is not an LRG record.')
            handle.close()
            return None

        # Looks like a valid LRG file.
        length = int(info['Content-Length'])
        if not 512 < length < settings.MAX_FILE_SIZE:
            self._output.addMessage(
                __file__, 4, 'EFILESIZE',
                'Filesize is not within the allowed boundaries.')
            handle.close()
            return None

        raw_data = handle.read()
        handle.close()

        # Do an md5 check.
        md5sum = self._calculate_hash(raw_data)
        try:
            md5_db = Reference.query.filter_by(
                accession=lrg_id).one().checksum
        except NoResultFound:
            md5_db = None

        if md5_db is None:
            # Note: The abstraction seems a bit off here, but we prefer to
            # set `Reference.source` to `lrg` and not to `url`, since the
            # former is more specific.
            session.add(Reference(lrg_id, md5sum, 'lrg'))
            session.commit()
        elif md5sum != md5_db:
            # Hash has changed for the LRG ID.
            self._output.addMessage(
                __file__, -1, 'WHASH',
                'Warning: Hash of {} changed from {} to {}.'.format(
                    lrg_id, md5_db, md5sum))
            Reference.query.filter_by(accession=lrg_id).update(
                {'checksum': md5sum})
            session.commit()

        if os.path.isfile(filename):
            # This can only occur if synchronous calls to mutalyzer are made
            # to recover a file that did not exist. Still leaves a window in
            # between the check and the write.
            return filename
        return self.write(raw_data, lrg_id)
Example #11
0
    def retrieveslice(self, accno, start, stop, orientation):
        """
        Retrieve a slice of a chromosome.

        Known slices (found in the internal database) whose file is still
        present are answered directly with their UD number. A known slice
        without file is downloaded again and its hash is updated (a change
        of hash is logged). An unknown slice is downloaded and registered
        under a new UD number. In both download cases the content is placed
        in the cache with the UD number as filename.

        :arg unicode accno: The accession number of the chromosome.
        :arg int start: Start position of the slice (one-based, inclusive, in
          reference orientation).
        :arg int stop: End position of the slice (one-based, inclusive, in
          reference orientation).
        :arg int orientation: Orientation of the slice:
            - 1 ; Forward.
            - 2 ; Reverse complement.

        :returns: An UD number, or None if the slice is invalid, too big,
          could not be downloaded or could not be written.
        :rtype: unicode
        """
        # Not a valid slice.
        if start > stop:
            self._output.addMessage(
                __file__, 4, 'ERETR', 'Could not retrieve slice for start '
                'position greater than stop position.')
            return None

        # The slice can not be too big.
        if stop - start + 1 > settings.MAX_FILE_SIZE:
            self._output.addMessage(
                __file__, 4, 'ERETR', 'Could not retrieve slice (request '
                'exceeds maximum of %d bases)' % settings.MAX_FILE_SIZE)
            return None

        # Value of the Reference.source_data field for this slice.
        strand_name = ('forward', 'reverse')[orientation - 1]
        source_data = '{}:{}:{}:{}'.format(accno, start, stop, strand_name)

        # Check whether we have seen this slice before.
        reference = Reference.query.filter_by(
            source='ncbi_slice', source_data=source_data).first()
        if reference and os.path.isfile(
                self._name_to_file(reference.accession)):
            # It's still present.
            return reference.accession

        # It's not present, so download it.
        try:
            # EFetch `seq_start` and `seq_stop` are one-based, inclusive, and
            # in reference orientation.
            handle = Entrez.efetch(db='nuccore', rettype='gbwithparts',
                                   retmode='text', id=accno,
                                   seq_start=start, seq_stop=stop,
                                   strand=orientation)
            raw_data = handle.read()
            handle.close()
        except (IOError, urllib2.HTTPError, HTTPException) as e:
            self._output.addMessage(
                __file__, -1, 'INFO',
                'Error connecting to Entrez nuccore database: {}'.format(
                    unicode(e)))
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not retrieve slice.')
            return None

        # Calculate the hash of the downloaded file.
        md5sum = self._calculate_hash(raw_data)

        if reference is None:
            # We haven't seen it before, so give it a name.
            reference = Reference(self._new_ud(), md5sum,
                                  source='ncbi_slice',
                                  source_data=source_data)
            session.add(reference)
            session.commit()
        else:
            # We have seen this one before.
            stored_md5sum = reference.checksum
            if md5sum != stored_md5sum:
                self._output.addMessage(
                    __file__, -1, 'WHASH',
                    'Warning: Hash of {} changed from {} to {}.'.format(
                        reference.accession, stored_md5sum, md5sum))
                Reference.query.filter_by(
                    accession=reference.accession).update(
                        {'checksum': md5sum})
                session.commit()

        if not self.write(raw_data, reference.accession, 0):
            return None
        return reference.accession
Example #12
0
    def downloadrecord(self, url, name=None):
        """
        Download an LRG record from an URL.

        The checksum of the downloaded record is registered in the database
        under the LRG id, or updated (with a warning) if it differs from the
        one already stored. The record is written to the cache unless its
        file is already present.

        @arg url: Location of the LRG record
        @type url: unicode
        @arg name: LRG id to use; by default it is derived from the last
            path component of the url, without extension
        @type name: unicode

        @return:
            - filename    ; The full path to the file
            - None        ; in case of failure
        @rtype: unicode
        """
        lrgID = name or os.path.splitext(os.path.basename(url))[0]
        filename = self._nametofile(lrgID)

        # Todo: Properly read the file contents to a unicode string and write
        #   it utf-8 encoded.
        handle = urllib2.urlopen(url)
        info = handle.info()

        if not (info["Content-Type"] == "application/xml"
                and "Content-length" in info):
            self._output.addMessage(__file__, 4, "ERECPARSE",
                                    "This is not an LRG record.")
            handle.close()
            return None

        length = int(info["Content-Length"])
        if not 512 < length < settings.MAX_FILE_SIZE:
            self._output.addMessage(
                __file__, 4, "EFILESIZE",
                "Filesize is not within the allowed boundaries.")
            handle.close()
            return None

        raw_data = handle.read()
        handle.close()

        # Do an md5 check.
        md5sum = self._calcHash(raw_data)
        try:
            md5db = Reference.query.filter_by(accession=lrgID).one().checksum
        except NoResultFound:
            md5db = None

        if md5db is None:
            session.add(Reference(lrgID, md5sum, download_url=url))
            session.commit()
        elif md5sum != md5db:
            # Hash has changed for the LRG ID.
            self._output.addMessage(
                __file__, -1, "WHASH",
                "Warning: Hash of %s changed from %s to %s." % (
                    lrgID, md5db, md5sum))
            Reference.query.filter_by(accession=lrgID).update(
                {'checksum': md5sum})
            session.commit()

        if os.path.isfile(filename):
            # This can only occur if synchronous calls to mutalyzer are made
            # to recover a file that did not exist. Still leaves a window in
            # between the check and the write.
            return filename
        return self.write(raw_data, lrgID)
Example #13
0
    def retrieveslice(self, accno, start, stop, orientation):
        """
        Retrieve a slice of a chromosome.

        Known slices (found in the internal database) whose file is still
        present are answered directly with their UD number. A known slice
        without file is downloaded again and its hash is updated (a change
        of hash is logged). An unknown slice is downloaded and registered
        under a new UD number. In both download cases the content is placed
        in the cache with the UD number as filename.

        @arg accno: The accession number of the chromosome
        @type accno: unicode
        @arg start: Start position of the slice
        @type start: integer
        @arg stop: End position of the slice.
        @type stop: integer
        @arg orientation:
        Orientation of the slice:
            - 1 ; Forward
            - 2 ; Reverse complement
        @type orientation: integer

        @return: An UD number, or None if the slice is not valid, too big,
            could not be downloaded or could not be written
        @rtype: unicode
        """
        # Not a valid slice, or a slice that is too big.
        if start >= stop or stop - start > settings.MAX_FILE_SIZE:
            return None

        slice_orientation = ['forward', 'reverse'][orientation - 1]

        # Check whether we have seen this slice before.
        try:
            reference = Reference.query.filter_by(
                slice_accession=accno, slice_start=start, slice_stop=stop,
                slice_orientation=slice_orientation).one()
        except NoResultFound:
            reference = None

        if (reference is not None
                and os.path.isfile(self._nametofile(reference.accession))):
            # It's still present.
            return reference.accession

        # It's not present, so download it.
        try:
            handle = Entrez.efetch(db='nuccore', rettype='gb', retmode='text',
                                   id=accno, seq_start=start, seq_stop=stop,
                                   strand=orientation)
            raw_data = handle.read()
            handle.close()
        except (IOError, urllib2.HTTPError, HTTPException) as e:
            self._output.addMessage(
                __file__, -1, 'INFO',
                'Error connecting to Entrez nuccore database: %s'
                % unicode(e))
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not retrieve slice.')
            return None

        # Calculate the hash of the downloaded file.
        md5sum = self._calcHash(raw_data)

        if reference is None:
            # We haven't seen it before, so give it a name.
            reference = Reference(self._newUD(), md5sum,
                                  slice_accession=accno,
                                  slice_start=start, slice_stop=stop,
                                  slice_orientation=slice_orientation)
            session.add(reference)
            session.commit()
        else:
            # We have seen this one before.
            stored_md5sum = reference.checksum
            if md5sum != stored_md5sum:
                self._output.addMessage(
                    __file__, -1, "WHASH",
                    "Warning: Hash of %s changed from %s to %s." % (
                        reference.accession, stored_md5sum, md5sum))
                Reference.query.filter_by(
                    accession=reference.accession).update(
                        {'checksum': md5sum})
                session.commit()

        if not self.write(raw_data, reference.accession, 0):
            return None
        return reference.accession