Example #1
    def uploadrecord(self, raw_data):
        """
        Write an uploaded record to a file.
        If the uploaded file is recognised by its hash, the old UD number
        is used.

        @arg raw_data: A GenBank record.
        @type raw_data: byte string

        @return: Accession number for the uploaded file.
        @rtype: unicode
        """
        md5sum = self._calcHash(raw_data)

        try:
            reference = Reference.query.filter_by(checksum=md5sum).one()
        except NoResultFound:
            UD = self._newUD()
            if self.write(raw_data, UD, 0):
                reference = Reference(UD, md5sum)
                session.add(reference)
                session.commit()
                return UD
        else:
            if os.path.isfile(self._nametofile(reference.accession)):
                return reference.accession
            else:
                return (self.write(raw_data, reference.accession, 0) and
                        reference.accession)
Example #2
def pop_batch_queue_item(batch_job):
    """
    Get the next batch queue item for the given batch job. Return its fields
    as a tuple `item`, `flags` and remove it from the database.

    If no batch queue item could be found for this batch job, return `None`.

    .. note:: Originally, finding the next batch queue item was done using a
        more complicated query::

            SELECT QueueID, Input, Flags
            FROM BatchQueue
            WHERE QueueID = (
                SELECT MIN(QueueID)
                FROM BatchQueue
                GROUP BY JobID
                HAVING JobID = {batch_job.id}
            );

        However, I couldn't see any significant performance difference in my
        latest benchmarks, so we stick with the more obvious query for now.
    """
    batch_queue_item = BatchQueueItem.query \
        .filter_by(batch_job=batch_job) \
        .order_by(BatchQueueItem.id.asc()) \
        .first()
    if batch_queue_item is None:
        return None

    item, flags = batch_queue_item.item, batch_queue_item.flags

    session.delete(batch_queue_item)
    session.commit()

    return item, flags
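A minimal consumer sketch for this function, assuming a worker that receives the `batch_job` and a hypothetical `process` callable::

    def drain_batch_job(batch_job, process):
        # Pop items one at a time until the queue for this job is empty.
        # Each pop deletes the popped item and commits, so a crash loses
        # at most the item currently being processed.
        while True:
            popped = pop_batch_queue_item(batch_job)
            if popped is None:
                break
            item, flags = popped
            process(item, flags)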
Example #3
    def sync_with_remote(self, remote_wsdl, url_template,
                         days=DEFAULT_CREATED_SINCE_DAYS):
        """
        Synchronize the local cache with the remote cache.

        ::

            >>> wsdl = 'https://mutalyzer.nl/mutalyzer/services/?wsdl'
            >>> template = 'https://mutalyzer.nl/mutalyzer/Reference/{file}'
            >>> self.sync_with_remote(wsdl, template)
            (14, 3)

        :arg remote_wsdl: The url of the remote SOAP WSDL description.
        :type remote_wsdl: unicode
        :arg url_template: Formatting string containing a ``{file}``
          occurrence, see example usage above.
        :type url_template: unicode
        :arg days: Only remote entries added this number of days ago or
          later are considered.
        :type days: int

        :return: The number of entries added to the local cache and the number
          of cache files downloaded from the remote site.
        :rtype: tuple(int, int)
        """
        self._output.addMessage(__file__, -1, 'INFO', 'Starting cache sync')

        created_since = datetime.today() - timedelta(days=days)
        remote_cache = self.remote_cache(remote_wsdl, created_since)

        inserted = downloaded = 0

        for entry in remote_cache:
            try:
                reference = Reference.query.filter_by(accession=entry['name']).one()
                if reference.checksum is not None:
                    continue
            except NoResultFound:
                pass

            if Reference.query.filter_by(checksum=entry['hash']).count() > 0:
                continue

            reference = Reference(entry['name'], entry['hash'], entry['source'],
                                  source_data=entry['source_data'])
            session.add(reference)
            session.commit()
            inserted += 1
            if entry['source'] == 'upload' and entry['cached']:
                url = url_template.format(file=entry['cached'])
                self.store_remote_file(entry['name'], url)
                downloaded += 1

        self._output.addMessage(__file__, -1, 'INFO',
                                'Inserted %d entries in the cache,'
                                ' downloaded %d files.' \
                                % (inserted, downloaded))
        self._output.addMessage(__file__, -1, 'INFO', 'Finished cache sync')

        return inserted, downloaded
Example #4
    def _update_db_md5(self, raw_data, name, gi):
        """
        :arg str raw_data:
        :arg unicode name:
        :arg unicode gi:

        :returns: filename
        :rtype: unicode
        """
        # TODO: Documentation.
        try:
            reference = Reference.query.filter_by(accession=name).one()
            current_md5sum = reference.checksum
        except NoResultFound:
            current_md5sum = None

        if current_md5sum:
            md5sum = self._calculate_hash(raw_data)
            if md5sum != current_md5sum:
                self._output.addMessage(
                    __file__, -1, 'WHASH',
                    'Warning: Hash of {} changed from {} to {}.'.format(
                        name, current_md5sum, md5sum))
                Reference.query.filter_by(accession=name).update(
                    {'checksum': md5sum})
                session.commit()
        else:
            reference = Reference(
                name, self._calculate_hash(raw_data), geninfo_identifier=gi)
            session.add(reference)
            session.commit()
        return self._name_to_file(name)
Example #5
    def uploadrecord(self, raw_data):
        """
        Write an uploaded record to a file.
        If the uploaded file is recognised by its hash, the old UD number
        is used.

        :arg str raw_data: A GenBank record.

        :returns: Accession number for the uploaded file.
        :rtype: unicode
        """
        md5sum = self._calculate_hash(raw_data)

        try:
            reference = Reference.query.filter_by(checksum=md5sum).one()
        except NoResultFound:
            ud = self._new_ud()
            if self.write(raw_data, ud, 0):
                reference = Reference(ud, md5sum)
                session.add(reference)
                session.commit()
                return ud
        else:
            if os.path.isfile(self._name_to_file(reference.accession)):
                return reference.accession
            else:
                return (self.write(raw_data, reference.accession, 0) and
                        reference.accession)
Ejemplo n.º 7
0
    def uploadrecord(self, raw_data) :
        """
        Write an uploaded record to a file.
        If the downloaded file is recognised by its hash, the old UD number
        is used.

        @arg raw_data: A GenBank record.
        @type raw_data: byte string

        @return: Accession number for the uploaded file.
        @rtype: unicode
        """
        md5sum = self._calcHash(raw_data)

        try:
            reference = Reference.query.filter_by(checksum=md5sum).one()
        except NoResultFound:
            UD = self._newUD()
            if self.write(raw_data, UD, 0):
                reference = Reference(UD, md5sum)
                session.add(reference)
                session.commit()
                return UD
        else:
            if os.path.isfile(self._nametofile(reference.accession)):
                return reference.accession
            else:
                return self.write(raw_data, reference.accession, 0) and reference.accession
Example #6
    def uploadrecord(self, raw_data):
        """
        Write an uploaded record to a file.
        If the uploaded file is recognised by its hash, the old UD number
        is used.

        :arg str raw_data: A GenBank record.

        :returns: Accession number for the uploaded file.
        :rtype: unicode
        """
        md5sum = self._calculate_hash(raw_data)

        try:
            reference = Reference.query.filter_by(checksum=md5sum).one()
        except NoResultFound:
            ud = self._new_ud()
            if self.write(raw_data, ud, 0):
                reference = Reference(ud, md5sum, 'upload')
                session.add(reference)
                session.commit()
                return ud
        else:
            if os.path.isfile(self._name_to_file(reference.accession)):
                return reference.accession
            else:
                return (self.write(raw_data, reference.accession, 0)
                        and reference.accession)
Example #7
    def addJob(self,
               email,
               queue,
               columns,
               job_type,
               argument=None,
               create_download_url=None):
        """
        Add a job to the Database and start the BatchChecker.

        @arg email:         e-mail address of batch supplier
        @type email:        unicode
        @arg queue:         A list of jobs
        @type queue:        list
        @arg columns:       The number of columns.
        @type columns:      int
        @arg job_type:       The type of Batch Job that should be run
        @type job_type:
        @arg argument:          Batch Arguments, for now only build info
        @type argument:
        @arg create_download_url: Function accepting a result_id and returning
                                  the URL for downloading the batch job
                                  result. Can be None.
        @type create_download_url: function

        @return: result_id
        @rtype:
        """
        # Add jobs to the database
        batch_job = BatchJob(job_type, email=email, argument=argument)
        if create_download_url:
            batch_job.download_url = create_download_url(batch_job.result_id)
        session.add(batch_job)

        for i, inputl in enumerate(queue):
            # NOTE:
            # This is a very dirty way to skip entries before they are fed
            # to the batch processes. This is needed for e.g. an empty line
            # or because the File Module noticed wrong formatting. These lines
            # used to be discarded but are now preserved by the escape string.
            # The benefit of this is that the user's input will match the
            # output in terms of input line and output line.
            if inputl.startswith("~!"):  #Dirty Escape
                inputl = inputl[2:]
                if inputl:
                    flag = "S0"  # Flag for wrong format
                else:
                    flag = "S9"  # Flag for empty line
                    inputl = " "  #Database doesn't like an empty inputfield
            else:
                flag = None
            if (i + 1) % columns:
                # Add flag for continuing the current row
                flag = '%s%s' % (flag if flag else '', 'C0')

            item = BatchQueueItem(batch_job, inputl, flags=flag)
            session.add(item)

        session.commit()
        return batch_job.result_id
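How the escape handling above plays out for a few hypothetical queue lines, with `columns=1` so no 'C0' continuation flag is added::

    # queue line               stored item       flags
    # 'NM_002001.2:c.1A>C'  -> unchanged         None  (normal entry)
    # '~!not parseable'     -> 'not parseable'   'S0'  (wrong format)
    # '~!'                  -> ' '               'S9'  (empty line)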
Example #8
    def _update_db_md5(self, raw_data, name, source):
        """
        :arg str raw_data:
        :arg unicode name:
        :arg unicode source:

        :returns: filename
        :rtype: unicode
        """
        # TODO: Documentation.
        try:
            reference = Reference.query.filter_by(accession=name).one()
            current_md5sum = reference.checksum
        except NoResultFound:
            current_md5sum = None

        if current_md5sum:
            md5sum = self._calculate_hash(raw_data)
            if md5sum != current_md5sum:
                self._output.addMessage(
                    __file__, -1, 'WHASH',
                    'Warning: Hash of {} changed from {} to {}.'.format(
                        name, current_md5sum, md5sum))
                Reference.query.filter_by(accession=name).update(
                    {'checksum': md5sum})
                session.commit()
        else:
            reference = Reference(name, self._calculate_hash(raw_data), source)
            session.add(reference)
            session.commit()
        return self._name_to_file(name)
Example #9
    def downloadrecord(self, url):
        """
        Download a GenBank record from a URL.
        If the downloaded file is recognised by its hash, the old UD number
        is used.

        :arg unicode url: Location of a GenBank record.

        :returns: UD or None.
        :rtype: unicode
        """
        if not (url.startswith('http://') or url.startswith('https://')
                or url.startswith('ftp://')):
            self._output.addMessage(
                __file__, 4, 'ERECPARSE',
                'Only HTTP(S) or FTP locations are allowed.')
            return None

        handle = urllib2.urlopen(url)
        info = handle.info()
        if info.gettype() == 'text/plain':
            length = int(info['Content-Length'])
            if 512 < length < settings.MAX_FILE_SIZE:
                raw_data = handle.read()
                md5sum = self._calculate_hash(raw_data)

                ud = None
                try:
                    reference = Reference.query.filter_by(
                        checksum=md5sum).one()
                except NoResultFound:
                    ud = self._new_ud()
                    if not os.path.isfile(self._name_to_file(ud)):
                        ud = self.write(raw_data, ud, 0) and ud
                    if ud:
                        # Parsing went OK, add to DB.
                        reference = Reference(ud,
                                              md5sum,
                                              source='url',
                                              source_data=url)
                        session.add(reference)
                        session.commit()
                else:
                    if (os.path.isfile(self._name_to_file(reference.accession))
                            or self.write(raw_data, reference.accession, 0)):
                        ud = reference.accession

                # Returns the UD or None.
                return ud
            else:
                self._output.addMessage(
                    __file__, 4, 'EFILESIZE',
                    'Filesize is not within the allowed boundaries.')
                return None
        else:
            self._output.addMessage(__file__, 4, 'ERECPARSE',
                                    'This is not a GenBank record.')
            return None
Example #10
    def downloadrecord(self, url):
        """
        Download a GenBank record from a URL.
        If the downloaded file is recognised by its hash, the old UD number
        is used.

        :arg unicode url: Location of a GenBank record.

        :returns: UD or None.
        :rtype: unicode
        """
        if not (url.startswith('http://') or url.startswith('https://') or
                url.startswith('ftp://')):
            self._output.addMessage(
                __file__, 4, 'ERECPARSE',
                'Only HTTP(S) or FTP locations are allowed.')
            return None

        handle = urllib2.urlopen(url)
        info = handle.info()
        if info['Content-Type'] == 'text/plain':
            length = int(info['Content-Length'])
            if 512 < length < settings.MAX_FILE_SIZE:
                raw_data = handle.read()
                md5sum = self._calculate_hash(raw_data)

                ud = None
                try:
                    reference = Reference.query.filter_by(
                        checksum=md5sum).one()
                except NoResultFound:
                    ud = self._new_ud()
                    if not os.path.isfile(self._name_to_file(ud)):
                        ud = self.write(raw_data, ud, 0) and ud
                    if ud:
                        # Parsing went OK, add to DB.
                        reference = Reference(ud, md5sum, download_url=url)
                        session.add(reference)
                        session.commit()
                else:
                    if not os.path.isfile(
                            self._name_to_file(reference.accession)):
                        ud = (self.write(raw_data, reference.accession, 0) and
                              reference.accession)

                # Returns the UD or None.
                return ud
            else:
                self._output.addMessage(
                    __file__, 4, 'EFILESIZE',
                    'Filesize is not within the allowed boundaries.')
                return None
        else:
            self._output.addMessage(
                __file__, 4, 'ERECPARSE', 'This is not a GenBank record.')
            return None
Example #11
    def downloadrecord(self, url):
        """
        Download a GenBank record from a URL.
        If the downloaded file is recognised by its hash, the old UD number
        is used.

        @arg url: Location of a GenBank record
        @type url: unicode

        @return: UD or None
        @rtype: unicode
        """
        if not (url.startswith('http://') or
                url.startswith('https://') or
                url.startswith('ftp://')):
            self._output.addMessage(__file__, 4, "ERECPARSE",
                                    "Only HTTP(S) or FTP locations are allowed.")
            return None

        handle = urllib2.urlopen(url)
        info = handle.info()
        if info["Content-Type"] == "text/plain" :
            length = int(info["Content-Length"])
            if 512 < length < settings.MAX_FILE_SIZE:
                raw_data = handle.read()
                md5sum = self._calcHash(raw_data)

                UD = None

                try:
                    reference = Reference.query.filter_by(checksum=md5sum).one()
                except NoResultFound:
                    UD = self._newUD()
                    if not os.path.isfile(self._nametofile(UD)):
                        UD = self.write(raw_data, UD, 0) and UD
                    if UD:      #Parsing went OK, add to DB
                        reference = Reference(UD, md5sum, download_url=url)
                        session.add(reference)
                        session.commit()
                else:
                    if not os.path.isfile(self._nametofile(reference.accession)):
                        UD = self.write(raw_data, reference.accession, 0) and reference.accession

                return UD  # Returns the UD or None.
            else:
                self._output.addMessage(__file__, 4, "EFILESIZE",
                    "Filesize is not within the allowed boundaries.")
                return None
        else:
            self._output.addMessage(__file__, 4, "ERECPARSE",
                                     "This is not a GenBank record.")
            return None
Example #12
    def __alterBatchEntries(self, jobID, old, new, flag, nselector):
        """
        Replace within one JobID all entries matching old with new, if they do
        not match the negative selector.

        This is used to alter batch entries that would otherwise take a long
        time to process. E.g. a batch job with a lot of the same accession
        numbers without version numbers would take a long time because
        mutalyzer would fetch the file from the NCBI for each entry. A
        database update over all entries with the same accession number speeds
        up the job considerably.

        Example:
        NM_002001(FCER1A_v001):c.1A>C ; this would result in the continuous
        fetching of the reference because no version number is given.
        In this case the arguments would be:
            - old         ;   NM_002001
            - new         ;   NM_002001.2
            - nselector   ;   NM_002001.

        The nselector is used to prevent the replacement of
        false positives. e.g. NM_002001.1(FCER1A_v001):c.1A>C should not
        be replaced. For this reason, any items starting with the nselector
        value are ignored.

        @arg jobID: The batch job ID.
        @type jobID:
        @arg old: Substring to replace in matching items.
        @type old:
        @arg new: Replacement for old.
        @type new:
        @arg flag: Flag appended to the flags of altered items.
        @type flag:
        @arg nselector: Negative selector; items starting with this value are
                        left untouched.
        @type nselector:
        """
        #query = '''UPDATE batch_queue_items
        #             SET item = REPLACE(item, :old, :new),
        #                 flags = flags || :flag
        #             WHERE batch_job_id = :batch_job_id
        #                   AND NOT item LIKE :nselector%'''
        #parameters = {'batch_job_id': jobID,
        #              'old': old,
        #              'new': new,
        #              'flag': flag,
        #              'nselector': nselector}
        #session.execute(query, parameters)
        BatchQueueItem.query \
            .filter_by(batch_job_id=jobID) \
            .filter(BatchQueueItem.item.startswith(old),
                    ~BatchQueueItem.item.startswith(nselector)) \
            .update({'item': func.replace(BatchQueueItem.item, old, new),
                     'flags': BatchQueueItem.flags + flag},
                    synchronize_session=False)
        session.commit()
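Inside the class, the FCER1A case from the docstring would be handled with a call like the following; the 'A0' flag value is hypothetical::

    self.__alterBatchEntries(batch_job.id,
                             old='NM_002001',
                             new='NM_002001.2',
                             flag='A0',
                             nselector='NM_002001.')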
Example #13
def import_from_ucsc_by_gene(assembly, gene):
    """
    Import transcript mappings for a gene from the UCSC.
    """
    connection = MySQLdb.connect(user='******',
                                 host='genome-mysql.cse.ucsc.edu',
                                 db=assembly.alias,
                                 charset='utf8',
                                 use_unicode=True)

    query = """
        SELECT DISTINCT
          acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts,
          exonEnds, name2 AS geneName, chrom, strand, protAcc
        FROM gbStatus, refGene, refLink
        WHERE type = "mRNA"
        AND refGene.name = acc
        AND acc = mrnaAcc
        AND name2 = %s
    """
    parameters = gene,

    cursor = connection.cursor()
    cursor.execute(query, parameters)
    result = cursor.fetchall()
    cursor.close()

    # All ranges in the UCSC tables are zero-based and open-ended. We convert
    # this to one-based, inclusive for our database.

    for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds,
         geneName, chrom, strand, protAcc) in result:
        chromosome = assembly.chromosomes.filter_by(name=chrom).one()
        orientation = 'reverse' if strand == '-' else 'forward'
        exon_starts = [int(i) + 1 for i in exonStarts.split(',') if i]
        exon_stops = [int(i) for i in exonEnds.split(',') if i]
        if cdsStart and cdsEnd:
            cds = cdsStart + 1, cdsEnd
        else:
            cds = None
        mapping = TranscriptMapping.create_or_update(chromosome,
                                                     'refseq',
                                                     acc,
                                                     geneName,
                                                     orientation,
                                                     txStart + 1,
                                                     txEnd,
                                                     exon_starts,
                                                     exon_stops,
                                                     'ucsc',
                                                     cds=cds,
                                                     version=int(version))
        session.add(mapping)

    session.commit()
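The coordinate conversion in the loop above, worked for a single hypothetical exon::

    # UCSC stores an exon covering zero-based positions 1000..1999 as
    # (exonStart=1000, exonEnd=2000): zero-based, half-open.
    exon_start, exon_end = 1000, 2000
    one_based_start = exon_start + 1  # 1001
    one_based_stop = exon_end         # 2000; the end needs no shift because
                                      # the half-open end already excludes it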
Example #14
def import_from_reference(assembly, reference):
    """
    Import transcript mappings from a genomic reference.

    .. todo:: Also report how much was added/updated.

    .. note:: Currently no exon locations are supported; this has only been
       tested on mtDNA.
    """
    chromosome = assembly.chromosomes.filter_by(name='chrM').one()

    output = Output(__file__)
    retriever = Retriever.GenBankRetriever(output)
    record = retriever.loadrecord(reference)

    if record.molType != 'm':
        raise ValueError('Only mitochondrial references are supported')

    select_transcript = len(record.geneList) > 1

    for gene in record.geneList:
        # We support exactly one transcript per gene.
        try:
            transcript = sorted(gene.transcriptList, key=attrgetter('name'))[0]
        except IndexError:
            continue

        # We use gene.location for now, it is always present and the same
        # for our purposes.
        #start, stop = transcript.mRNA.location[0], transcript.mRNA.location[1]
        start, stop = gene.location

        orientation = 'reverse' if gene.orientation == -1 else 'forward'

        try:
            cds = transcript.CDS.location
        except AttributeError:
            cds = None

        mapping = TranscriptMapping.create_or_update(
            chromosome,
            'refseq',
            record.source_accession,
            gene.name,
            orientation,
            start,
            stop, [start], [stop],
            'reference',
            cds=cds,
            select_transcript=select_transcript,
            version=int(record.source_version))
        session.add(mapping)

    session.commit()
Example #15
def hg19():
    """
    Fixture for GRCh37/hg19 genome assembly with chromosomes.
    """
    assembly = Assembly('GRCh37', 9606, 'Homo sapiens', alias='hg19')
    session.add(assembly)

    session.add_all(Chromosome(assembly, name, accession, organelle)
                    for accession, name, organelle in [
            ('NC_000001.10', 'chr1', 'nucleus'),
            ('NC_000002.11', 'chr2', 'nucleus'),
            ('NC_000003.11', 'chr3', 'nucleus'),
            ('NC_000004.11', 'chr4', 'nucleus'),
            ('NC_000005.9', 'chr5', 'nucleus'),
            ('NC_000006.11', 'chr6', 'nucleus'),
            ('NC_000007.13', 'chr7', 'nucleus'),
            ('NC_000008.10', 'chr8', 'nucleus'),
            ('NC_000009.11', 'chr9', 'nucleus'),
            ('NC_000010.10', 'chr10', 'nucleus'),
            ('NC_000011.9', 'chr11', 'nucleus'),
            ('NC_000012.11', 'chr12', 'nucleus'),
            ('NC_000013.10', 'chr13', 'nucleus'),
            ('NC_000014.8', 'chr14', 'nucleus'),
            ('NC_000015.9', 'chr15', 'nucleus'),
            ('NC_000016.9', 'chr16', 'nucleus'),
            ('NC_000017.10', 'chr17', 'nucleus'),
            ('NC_000018.9', 'chr18', 'nucleus'),
            ('NC_000019.9', 'chr19', 'nucleus'),
            ('NC_000020.10', 'chr20', 'nucleus'),
            ('NC_000021.8', 'chr21', 'nucleus'),
            ('NC_000022.10', 'chr22', 'nucleus'),
            ('NC_000023.10', 'chrX', 'nucleus'),
            ('NC_000024.9', 'chrY', 'nucleus'),
            ('NT_167244.1', 'chr6_apd_hap1', 'nucleus'),
            ('NT_113891.2', 'chr6_cox_hap2', 'nucleus'),
            ('NT_167245.1', 'chr6_dbb_hap3', 'nucleus'),
            ('NT_167246.1', 'chr6_mann_hap4', 'nucleus'),
            ('NT_167247.1', 'chr6_mcf_hap5', 'nucleus'),
            ('NT_167248.1', 'chr6_qbl_hap6', 'nucleus'),
            ('NT_167249.1', 'chr6_ssto_hap7', 'nucleus'),
            ('NT_167250.1', 'chr4_ctg9_hap1', 'nucleus'),
            ('NT_167251.1', 'chr17_ctg5_hap1', 'nucleus'),
            ('NC_012920.1', 'chrM', 'mitochondrion')])

    session.commit()
Example #16
def update_transcript_protein_link(transcript_accession,
                                   protein_accession=None):
    """
    Update cached link between a transcript and a protein, or create it if it
    doesn't exist yet.
    """
    link = TranscriptProteinLink.query \
        .filter_by(transcript_accession=transcript_accession) \
        .first()

    if link is not None:
        link.protein_accession = protein_accession
        link.added = datetime.now()
    else:
        link = TranscriptProteinLink(transcript_accession, protein_accession)
        session.add(link)

    session.commit()
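Usage is a plain upsert; a sketch with hypothetical accessions (passing no protein accession presumably caches the absence of a link)::

    update_transcript_protein_link('NM_003002.2', 'NP_002993.1')
    update_transcript_protein_link('NR_028383.1')  # no known protein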
Example #17
    def cache_with_references():
        for reference in references:
            entry = REFERENCES[reference]
            try:
                accession = entry['accession']
            except KeyError:
                accession = reference
            geninfo_id = entry.get('geninfo_id')

            path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                'data',
                                entry['filename'])
            shutil.copy(path, settings.CACHE_DIR)

            session.add(Reference(accession, entry['checksum'],
                                  geninfo_identifier=geninfo_id))

            for transcript, protein in entry.get('links', []):
                session.add(TranscriptProteinLink(transcript, protein))

        session.commit()
Example #18
    def __skipBatchEntries(self, jobID, flag, selector):
        """
        Skip all batch entries that match a certain selector.

        We flag batch entries to be skipped. This is used if it is certain
        that an entry will cause an error, or that its output is ambiguous.

        @arg jobID: The batch job ID.
        @type jobID:
        @arg flag: Flag appended to the flags of matching items.
        @type flag:
        @arg selector: Items starting with this value are flagged.
        @type selector:
        """
        #update `BatchQueue` set
        #  `Flags` = CONCAT(IFNULL(`Flags`, ""), %s)
        #  where `JobID` = %s AND
        #  `Input` RLIKE %s;
        BatchQueueItem.query \
            .filter_by(batch_job_id=jobID) \
            .filter(BatchQueueItem.item.startswith(selector)) \
            .update({'flags': BatchQueueItem.flags + flag},
                    synchronize_session=False)
        session.commit()
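A hypothetical call, flagging every entry that starts with a known-bad accession so the workers skip it; the 'S1' flag value is illustrative::

    self.__skipBatchEntries(batch_job.id, 'S1', 'AB026906')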
Example #19
    def _updateDBmd5(self, raw_data, name, GI):
        """
        Update the stored MD5 checksum of a reference, creating the database
        entry if it does not exist yet.

        @arg raw_data: Raw record data.
        @type raw_data: byte string
        @arg name: Accession number.
        @type name: unicode
        @arg GI: GenInfo identifier.
        @type GI: unicode

        @return: filename
        @rtype: unicode
        """
        try:
            reference = Reference.query.filter_by(accession=name).one()
            currentmd5sum = reference.checksum
        except NoResultFound:
            currentmd5sum = None

        if currentmd5sum:
            md5sum = self._calcHash(raw_data)
            if md5sum != currentmd5sum:
                self._output.addMessage(__file__, -1, "WHASH",
                    "Warning: Hash of %s changed from %s to %s." % (
                    name, currentmd5sum, md5sum))
                Reference.query.filter_by(accession=name).update(
                    {'checksum': md5sum})
                session.commit()
        else:
            reference = Reference(name, self._calcHash(raw_data),
                                  geninfo_identifier=GI)
            session.add(reference)
            session.commit()
        return self._nametofile(name)
Example #20
def hg19_transcript_mappings():
    """
    Fixture for some selected transcript mappings in the GRCh37/hg19 genome
    assembly. Depends on the :func:`hg19` fixture.
    """
    chromosome_1 = Chromosome.query.filter_by(accession='NC_000001.10').one()
    chromosome_3 = Chromosome.query.filter_by(accession='NC_000003.11').one()
    chromosome_6 = Chromosome.query.filter_by(accession='NC_000006.11').one()
    chromosome_7 = Chromosome.query.filter_by(accession='NC_000007.13').one()
    chromosome_8 = Chromosome.query.filter_by(accession='NC_000008.10').one()
    chromosome_11 = Chromosome.query.filter_by(accession='NC_000011.9').one()
    chromosome_20 = Chromosome.query.filter_by(accession='NC_000020.10').one()
    chromosome_22 = Chromosome.query.filter_by(accession='NC_000022.10').one()
    chromosome_x = Chromosome.query.filter_by(accession='NC_000023.10').one()
    chromosome_mt = Chromosome.query.filter_by(accession='NC_012920.1').one()

    session.add_all([
        chromosome_1, chromosome_6, chromosome_8, chromosome_11, chromosome_20,
        chromosome_22, chromosome_mt
    ])

    session.add(
        TranscriptMapping(chromosome_11,
                          'refseq',
                          'NM_003002',
                          'SDHD',
                          'forward',
                          111957571,
                          111966518,
                          [111957571, 111958581, 111959591, 111965529],
                          [111957683, 111958697, 111959735, 111966518],
                          'ncbi',
                          transcript=1,
                          cds=(111957632, 111965694),
                          select_transcript=False,
                          version=2))
    session.add(
        TranscriptMapping(chromosome_11,
                          'refseq',
                          'NM_012459',
                          'TIMM8B',
                          'reverse',
                          111955524,
                          111957522, [111955524, 111957364],
                          [111956186, 111957522],
                          'ncbi',
                          transcript=1,
                          cds=(111956019, 111957492),
                          select_transcript=False,
                          version=2))
    session.add(
        TranscriptMapping(chromosome_11,
                          'refseq',
                          'NR_028383',
                          'TIMM8B',
                          'reverse',
                          111955524,
                          111957522, [111955524, 111956702, 111957364],
                          [111956186, 111957034, 111957522],
                          'ncbi',
                          transcript=1,
                          cds=None,
                          select_transcript=False,
                          version=1))
    session.add(
        TranscriptMapping(chromosome_6,
                          'refseq',
                          'NM_000500',
                          'CYP21A2',
                          'forward',
                          32006082,
                          32009419, [
                              32006082, 32006499, 32006871, 32007133, 32007323,
                              32007526, 32007782, 32008183, 32008445, 32008646
                          ], [
                              32006401, 32006588, 32007025, 32007234, 32007424,
                              32007612, 32007982, 32008361, 32008548, 32009419
                          ],
                          'ncbi',
                          transcript=1,
                          cds=(32006200, 32008911),
                          select_transcript=False,
                          version=5))
    session.add(
        TranscriptMapping(chromosome_22,
                          'refseq',
                          'NM_001145134',
                          'CPT1B',
                          'reverse',
                          51007290,
                          51017096, [
                              51007290, 51007765, 51008005, 51008722, 51009320,
                              51009587, 51009804, 51010435, 51010632, 51011304,
                              51011949, 51012764, 51012922, 51014464, 51014627,
                              51015286, 51015753, 51016204, 51016978
                          ], [
                              51007510, 51007850, 51008097, 51008835, 51009472,
                              51009721, 51009968, 51010551, 51010737, 51011489,
                              51012144, 51012848, 51013029, 51014541, 51014764,
                              51015463, 51015892, 51016363, 51017096
                          ],
                          'ncbi',
                          transcript=1,
                          cds=(51007767, 51016344),
                          select_transcript=False,
                          version=1))
    session.add(
        TranscriptMapping(chromosome_22,
                          'refseq',
                          'NR_021492',
                          'LOC100144603',
                          'forward',
                          51021455,
                          51022356, [51021455, 51022027], [51021752, 51022356],
                          'ncbi',
                          transcript=1,
                          cds=None,
                          select_transcript=False,
                          version=1))
    session.add(
        TranscriptMapping(
            chromosome_1,
            'refseq',
            'NM_001007553',
            'CSDE1',
            'reverse',
            115259538,
            115300624, [
                115259538, 115261234, 115262200, 115263160, 115266504,
                115267842, 115268832, 115269604, 115272879, 115273129,
                115275225, 115276353, 115276610, 115277063, 115279379,
                115280092, 115280584, 115282313, 115292442, 115300546
            ], [
                115260837, 115261366, 115262363, 115263338, 115266623,
                115267954, 115269007, 115269711, 115273043, 115273269,
                115275437, 115276478, 115276738, 115277144, 115279476,
                115280184, 115280693, 115282511, 115292828, 115300624
            ],
            'ncbi',
            transcript=1,
            cds=(115260790, 115282511),
            select_transcript=False,
            version=1))
    session.add(
        TranscriptMapping(
            chromosome_1,
            'refseq',
            'NM_001130523',
            'CSDE1',
            'reverse',
            115259538,
            115300671, [
                115259538, 115261234, 115262200, 115263160, 115266504,
                115267842, 115268832, 115269604, 115272879, 115273129,
                115275225, 115276353, 115276610, 115277063, 115279379,
                115280584, 115282313, 115284148, 115292442, 115300546
            ], [
                115260837, 115261366, 115262363, 115263338, 115266623,
                115267954, 115269007, 115269711, 115273043, 115273269,
                115275437, 115276478, 115276738, 115277144, 115279476,
                115280693, 115282511, 115284294, 115292828, 115300671
            ],
            'ncbi',
            transcript=1,
            cds=(115260790, 115284285),
            select_transcript=False,
            version=1))
    session.add(
        TranscriptMapping(chromosome_1,
                          'refseq',
                          'NM_002241',
                          'KCNJ10',
                          'reverse',
                          160007257,
                          160040051, [160007257, 160039812],
                          [160012322, 160040051],
                          'ncbi',
                          transcript=1,
                          cds=(160011183, 160012322),
                          select_transcript=False,
                          version=4))
    session.add(
        TranscriptMapping(
            chromosome_20,
            'refseq',
            'NM_001162505',
            'TMEM189',
            'reverse',
            48740274,
            48770335,
            [48740274, 48744512, 48746083, 48747402, 48760039, 48770054],
            [48741716, 48744724, 48746227, 48747484, 48760158, 48770335],
            'ncbi',
            transcript=1,
            cds=(48741595, 48770174),
            select_transcript=False,
            version=1))
    session.add(
        TranscriptMapping(
            chromosome_8,
            'refseq',
            'NM_017780',
            'CHD7',
            'forward',
            61591339,
            61779465, [
                61591339, 61653818, 61693559, 61707545, 61712947, 61714087,
                61720776, 61728946, 61732566, 61734349, 61734583, 61735062,
                61736399, 61741222, 61742881, 61748632, 61749376, 61750227,
                61750635, 61754203, 61754406, 61757423, 61757809, 61761074,
                61761610, 61763052, 61763591, 61763821, 61764578, 61765057,
                61765388, 61766922, 61768534, 61769004, 61773463, 61774755,
                61775107, 61777575
            ], [
                61591641, 61655656, 61693989, 61707686, 61713084, 61714152,
                61720831, 61729060, 61732649, 61734486, 61734704, 61735305,
                61736575, 61741365, 61743136, 61748842, 61749571, 61750394,
                61750814, 61754313, 61754611, 61757622, 61757968, 61761163,
                61761713, 61763181, 61763663, 61763878, 61764806, 61765265,
                61766059, 61767082, 61768761, 61769447, 61773684, 61774895,
                61775211, 61779465
            ],
            'ncbi',
            transcript=1,
            cds=(61653992, 61778492),
            select_transcript=False,
            version=2))
    session.add(
        TranscriptMapping(chromosome_mt,
                          'refseq',
                          'NC_012920',
                          'ND4',
                          'forward',
                          10760,
                          12137, [10760], [12137],
                          'reference',
                          transcript=1,
                          cds=(10760, 12137),
                          select_transcript=True,
                          version=1))
    session.add(
        TranscriptMapping(
            chromosome_1,
            'refseq',
            'NM_002001',
            'FCER1A',
            'forward',
            159259504,
            159278014,
            [159259504, 159272096, 159272644, 159273718, 159275778, 159277538],
            [159259543, 159272209, 159272664, 159273972, 159276035, 159278014],
            'ncbi',
            transcript=1,
            cds=(159272155, 159277722),
            select_transcript=False,
            version=2))
    session.add(
        TranscriptMapping(chromosome_7,
                          'refseq',
                          'XM_001715131',
                          'LOC100132858',
                          'reverse',
                          19828,
                          36378, [19828, 20834, 31060, 32957, 35335, 36224],
                          [19895, 21029, 31437, 33107, 35541, 36378],
                          'ncbi',
                          transcript=1,
                          cds=(19828, 36378),
                          select_transcript=False,
                          version=2))
    session.add(
        TranscriptMapping(
            chromosome_x,
            'refseq',
            'NM_004011',
            'DMD',
            'reverse',
            31137345,
            32430371, [
                31137345, 31144759, 31152219, 31164408, 31165392, 31187560,
                31190465, 31191656, 31196049, 31196786, 31198487, 31200855,
                31222078, 31224699, 31227615, 31241164, 31279072, 31341715,
                31366673, 31462598, 31496223, 31497100, 31514905, 31525398,
                31645790, 31676107, 31697492, 31747748, 31792077, 31838092,
                31854835, 31893305, 31947713, 31950197, 31986456, 32235033,
                32305646, 32328199, 32360217, 32361251, 32364060, 32366523,
                32380905, 32382699, 32383137, 32398627, 32404427, 32407618,
                32408188, 32429869, 32430279
            ], [
                31140047, 31144790, 31152311, 31164531, 31165635, 31187718,
                31190530, 31191721, 31196087, 31196922, 31198598, 31201021,
                31222235, 31224784, 31227816, 31241238, 31279133, 31341775,
                31366751, 31462744, 31496491, 31497220, 31515061, 31525570,
                31645979, 31676261, 31697703, 31747865, 31792309, 31838200,
                31854936, 31893490, 31947862, 31950344, 31986631, 32235180,
                32305818, 32328393, 32360399, 32361403, 32364197, 32366645,
                32381075, 32382827, 32383316, 32398797, 32404582, 32407791,
                32408298, 32430030, 32430371
            ],
            'ncbi',
            transcript=1,
            cds=(31140036, 32430326),
            select_transcript=False,
            version=3))
    session.add(
        TranscriptMapping(chromosome_x,
                          'refseq',
                          'NM_004019',
                          'DMD',
                          'reverse',
                          31196312,
                          31285024, [
                              31196312, 31198487, 31200855, 31222078, 31224699,
                              31227615, 31241164, 31279072, 31284927
                          ], [
                              31196922, 31198598, 31201021, 31222235, 31224784,
                              31227816, 31241238, 31279133, 31285024
                          ],
                          'ncbi',
                          transcript=1,
                          cds=(31196782, 31284946),
                          select_transcript=False,
                          version=2))
    session.add(
        TranscriptMapping(
            chromosome_x,
            'refseq',
            'NM_004007',
            'DMD',
            'reverse',
            31137345,
            33038317, [
                31137345, 31144759, 31152219, 31164408, 31165392, 31187560,
                31190465, 31191656, 31196049, 31196786, 31198487, 31200855,
                31222078, 31224699, 31227615, 31241164, 31279072, 31341715,
                31366673, 31462598, 31496223, 31497100, 31514905, 31525398,
                31645790, 31676107, 31697492, 31747748, 31792077, 31838092,
                31854835, 31893305, 31947713, 31950197, 31986456, 32235033,
                32305646, 32328199, 32360217, 32361251, 32364060, 32366523,
                32380905, 32382699, 32383137, 32398627, 32404427, 32407618,
                32408188, 32429869, 32456358, 32459297, 32466573, 32472779,
                32481556, 32482703, 32486615, 32490281, 32503036, 32509394,
                32519872, 32536125, 32563276, 32583819, 32591647, 32591862,
                32613874, 32632420, 32662249, 32663081, 32715987, 32717229,
                32827610, 32834585, 32841412, 32862900, 32867845, 33038256
            ], [
                31140047, 31144790, 31152311, 31164531, 31165635, 31187718,
                31190530, 31191721, 31196087, 31196922, 31198598, 31201021,
                31222235, 31224784, 31227816, 31241238, 31279133, 31341775,
                31366751, 31462744, 31496491, 31497220, 31515061, 31525570,
                31645979, 31676261, 31697703, 31747865, 31792309, 31838200,
                31854936, 31893490, 31947862, 31950344, 31986631, 32235180,
                32305818, 32328393, 32360399, 32361403, 32364197, 32366645,
                32381075, 32382827, 32383316, 32398797, 32404582, 32407791,
                32408298, 32430030, 32456507, 32459431, 32466755, 32472949,
                32481711, 32482816, 32486827, 32490426, 32503216, 32509635,
                32519959, 32536248, 32563451, 32583998, 32591754, 32591963,
                32613993, 32632570, 32662430, 32663269, 32716115, 32717410,
                32827728, 32834757, 32841504, 32862977, 32867937, 33038317
            ],
            'ncbi',
            transcript=1,
            cds=(31140036, 32834745),
            select_transcript=False,
            version=2))
    session.add(
        TranscriptMapping(chromosome_x,
                          'refseq',
                          'NM_203473',
                          'PORCN',
                          'forward',
                          48367371,
                          48379202, [
                              48367371, 48368172, 48369683, 48370280, 48370714,
                              48370977, 48371223, 48372628, 48372913, 48374105,
                              48374278, 48374449, 48375571, 48378763
                          ], [
                              48367491, 48368344, 48369875, 48370323, 48370895,
                              48371107, 48371240, 48372753, 48373013, 48374181,
                              48374341, 48374534, 48375681, 48379202
                          ],
                          'ncbi',
                          transcript=1,
                          cds=(48368209, 48378864),
                          select_transcript=False,
                          version=1))
    session.add(
        TranscriptMapping(
            chromosome_x,
            'refseq',
            'NM_000132',
            'F8',
            'reverse',
            154064063,
            154250998, [
                154064063, 154088707, 154089993, 154091358, 154124352,
                154128141, 154129646, 154130326, 154132181, 154132571,
                154133086, 154134695, 154156846, 154175973, 154182167,
                154185232, 154189350, 154194245, 154194701, 154197606,
                154212962, 154215512, 154221211, 154225248, 154227754,
                154250685
            ], [
                154066027, 154088883, 154090141, 154091502, 154124507,
                154128226, 154129717, 154130442, 154132363, 154132799,
                154133298, 154134848, 154159951, 154176182, 154182317,
                154185446, 154189443, 154194416, 154194962, 154197827,
                154213078, 154215580, 154221423, 154225370, 154227875,
                154250998
            ],
            'ncbi',
            transcript=1,
            cds=(154065872, 154250827),
            select_transcript=False,
            version=3))
    session.add(
        TranscriptMapping(chromosome_3,
                          'refseq',
                          'NM_000249',
                          'MLH1',
                          'forward',
                          37034841,
                          37092337, [
                              37034841, 37038110, 37042446, 37045892, 37048482,
                              37050305, 37053311, 37053502, 37055923, 37058997,
                              37061801, 37067128, 37070275, 37081677, 37083759,
                              37089010, 37090008, 37090395, 37091977
                          ], [
                              37035154, 37038200, 37042544, 37045965, 37048554,
                              37050396, 37053353, 37053590, 37056035, 37059090,
                              37061954, 37067498, 37070423, 37081785, 37083822,
                              37089174, 37090100, 37090508, 37092337
                          ],
                          'ncbi',
                          transcript=1,
                          cds=(37035039, 37092144),
                          select_transcript=False,
                          version=3))

    session.commit()
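The fixture above only writes rows; a test would typically read them back. Below is a minimal lookup sketch against this fixture. It assumes that TranscriptMapping, like the other models in these examples, exposes a Flask-SQLAlchemy-style `query` attribute, and that its column names mirror the constructor arguments; both are assumptions, not excerpted code.

# Hypothetical lookup against the fixture above. `TranscriptMapping.query`
# and the attribute names are assumed to mirror the constructor arguments.
mapping = TranscriptMapping.query.filter_by(
    accession='NM_000249', version=3).one()
assert mapping.gene == 'MLH1'
assert mapping.orientation == 'forward'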
Example No. 32
def import_from_lrgmap_file(assembly, lrgmap_file):
    """
    Import transcript mappings from an EBI LRG transcripts map file.

    All positions are one-based, inclusive, and that is what we also use in
    our database.
    """
    columns = [
        'transcript', 'gene', 'chromosome', 'strand', 'start', 'stop', 'exons',
        'protein', 'cds_start', 'cds_stop'
    ]

    chromosomes = assembly.chromosomes.all()

    def read_mappings(lrgmap_file):
        for line in lrgmap_file:
            if line.startswith('#'):
                continue
            record = dict(zip(columns, line.rstrip('\r\n').split('\t')))

            record['start'] = int(record['start'])
            record['stop'] = int(record['stop'])
            try:
                record['cds_start'] = int(record['cds_start'])
            except ValueError:
                record['cds_start'] = None
            try:
                record['cds_stop'] = int(record['cds_stop'])
            except ValueError:
                record['cds_stop'] = None
            record['exons'] = [[int(pos) for pos in exon.split('-')]
                               for exon in record['exons'].split(',')]

            try:
                yield build_mapping(record)
            except ValueError:
                pass

    def build_mapping(record):
        # Only use records on chromosomes we know.
        try:
            chromosome = next(c for c in chromosomes
                              if c.name == 'chr' + record['chromosome'])
        except StopIteration:
            raise ValueError()

        accession, transcript = record['transcript'].split('t')
        transcript = int(transcript)

        orientation = 'reverse' if record['strand'] == '-1' else 'forward'

        if record['cds_start']:
            cds = record['cds_start'], record['cds_stop']
        else:
            cds = None

        # TODO: Also take protein into account. For example, in LRG_321 (TP53)
        # some transcripts occur twice (with different CDSs and different
        # protein numbers).
        # https://github.com/mutalyzer/mutalyzer/issues/372
        return TranscriptMapping.create_or_update(
            chromosome,
            'lrg',
            accession,
            record['gene'],
            orientation,
            record['start'],
            record['stop'], [start for start, _ in record['exons']],
            [stop for _, stop in record['exons']],
            'ebi',
            transcript=transcript,
            cds=cds,
            select_transcript=True)

    for mapping in read_mappings(lrgmap_file):
        session.add(mapping)

    session.commit()
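As a quick reference for the format this importer expects: the LRG transcript column encodes the accession and the transcript number around a 't', and the exons column is a comma-separated list of start-stop pairs. A small self-contained sketch with invented values:

# Toy illustration of the parsing conventions used above (values invented).
transcript_field = 'LRG_1t1'
accession, transcript = transcript_field.split('t')  # -> 'LRG_1', '1'
transcript = int(transcript)

exons_field = '5001-5500,7000-7250,9100-9800'
exons = [[int(pos) for pos in exon.split('-')]
         for exon in exons_field.split(',')]
exon_starts = [start for start, _ in exons]  # [5001, 7000, 9100]
exon_stops = [stop for _, stop in exons]     # [5500, 7250, 9800]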
Example No. 33
def import_from_mapview_file(assembly, mapview_file, group_label):
    """
    Import transcript mappings from an NCBI mapview file.

    We require that this file is first sorted on the `feature_id` column
    (#11), which always contains the gene identifier, and then on the
    `chromosome` column (#2).

        sort -t $'\t' -k 11,11 -k 2,2 seq_gene.md > seq_gene.by_gene.md

    Raises :exc:`ValueError` if `mapview_file` is not sorted this way.

    The NCBI mapping file consists of entries, one per line, in order of
    their location in the genome (more specifically by start location).
    Every entry has a 'group_label' column, denoting the assembly it is
    from. We only use entries where this value is `group_label`.

    There are four types of entries (for our purposes):
    - Gene: Name, identifier, and location of a gene.
    - Transcript: Name, gene id, and location of a transcript.
    - UTR: Location and transcript of a non-coding exon (or part of it).
    - CDS: Location and transcript of a coding exon (or part of it).

    A bit troublesome for us is that exons are split into UTR exons and CDS
    exons, with exons overlapping the UTR/CDS border defined as two
    separate entries (one of type UTR and one of type CDS).

    Another minor annoyance is that some transcripts (~ 15) are split over
    two contigs (NT_*). In that case, they are defined by two entries in
    the file, where we should merge them by taking the start position of
    the first and the stop position of the second.

    To complicate this annoyance, some genes (e.g. in the PAR) are mapped
    on both the X and Y chromosomes, but stored in the file just like the
    transcripts split over two contigs. However, these should of course
    not be merged.

    Our strategy is to sort by gene and chromosome and process the file
    grouped by these two fields.

    For transcripts without any UTR and CDS entries (this seems to happen for
    predicted genes), we generate one exon spanning the entire transcript.

    All positions are one-based, inclusive, and that is what we also use in
    our database.
    """
    columns = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation',
               'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation',
               'feature_name', 'feature_id', 'feature_type', 'group_label',
               'transcript', 'evidence_code']

    chromosomes = assembly.chromosomes.all()

    def read_records(mapview_file):
        for line in mapview_file:
            if line.startswith('#'):
                continue
            record = dict(zip(columns, line.rstrip().split('\t')))

            # Only use records from the given assembly.
            if record['group_label'] != group_label:
                continue

            # Only use records on chromosomes we know.
            try:
                record['chromosome'] = next(c for c in chromosomes if
                                            c.name == 'chr' + record['chromosome'])
            except StopIteration:
                continue

            record['start'] = int(record['start'])
            record['stop'] = int(record['stop'])

            yield record

    def build_mappings(records):
        # We structure the records per transcript and per record type. This is
        # generalized to a list of records for each type, but we expect only
        # one GENE record (with `-` as transcript value).
        # Note that there can be more than one RNA record per transcript if it
        # is split over different reference contigs.
        by_transcript = defaultdict(lambda: defaultdict(list))
        for r in records:
            by_transcript[r['transcript']][r['feature_type']].append(r)

        gene = by_transcript['-']['GENE'][0]['feature_name']

        for transcript, by_type in by_transcript.items():
            if transcript == '-':
                continue
            accession, version = transcript.split('.')
            version = int(version)
            chromosome = by_type['RNA'][0]['chromosome']
            orientation = 'reverse' if by_type['RNA'][0]['orientation'] == '-' else 'forward'
            start = min(t['start'] for t in by_type['RNA'])
            stop = max(t['stop'] for t in by_type['RNA'])

            exon_starts = []
            exon_stops = []
            cds_positions = []
            for exon in sorted(by_type['UTR'] + by_type['CDS'],
                               key=itemgetter('start')):
                if exon_stops and exon_stops[-1] > exon['start'] - 1:
                    # This exon starts before the end of the previous exon. We
                    # have no idea what to do in this case, so we ignore it.
                    # The number of transcripts affected is very small (e.g.,
                    # NM_031860.1 and NM_001184961.1 in the GRCh37 assembly).
                    continue
                if exon['feature_type'] == 'CDS':
                    cds_positions.extend([exon['start'], exon['stop']])
                if exon_stops and exon_stops[-1] == exon['start'] - 1:
                    # This exon must be merged with the previous one because
                    # it is split over two entries (a CDS part and a UTR part
                    # or split over different reference contigs).
                    exon_stops[-1] = exon['stop']
                else:
                    exon_starts.append(exon['start'])
                    exon_stops.append(exon['stop'])

            if cds_positions:
                cds = min(cds_positions), max(cds_positions)
            else:
                cds = None

            # If no exons are annotated, we create one spanning the entire
            # transcript.
            if not exon_starts:
                exon_starts = [start]
                exon_stops = [stop]

            yield TranscriptMapping.create_or_update(
                chromosome, 'refseq', accession, gene, orientation, start,
                stop, exon_starts, exon_stops, 'ncbi', cds=cds,
                version=version)

    processed_keys = set()

    for key, records in groupby(read_records(mapview_file),
                                itemgetter('feature_id', 'chromosome')):
        if key in processed_keys:
            raise MapviewSortError('Mapview file must be sorted by feature_id '
                                   'and chromosome (try `sort -k 11,11 -k '
                                   '2,2`)')
        processed_keys.add(key)

        for mapping in build_mappings(records):
            session.add(mapping)

    session.commit()
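The UTR/CDS merging described in the docstring is the subtle part of this importer, so here is the same merge logic rerun on a standalone toy input (coordinates invented). An adjacent UTR part and CDS part collapse into one exon, while the CDS boundaries are collected separately:

# Toy rerun of the exon-merge logic above with invented coordinates.
entries = [
    {'feature_type': 'UTR', 'start': 100, 'stop': 149},  # 5' UTR part
    {'feature_type': 'CDS', 'start': 150, 'stop': 300},  # adjacent CDS part
    {'feature_type': 'CDS', 'start': 500, 'stop': 640},  # separate exon
]

exon_starts, exon_stops, cds_positions = [], [], []
for exon in sorted(entries, key=lambda e: e['start']):
    if exon_stops and exon_stops[-1] > exon['start'] - 1:
        continue  # overlapping entry: ignored, as in the importer
    if exon['feature_type'] == 'CDS':
        cds_positions.extend([exon['start'], exon['stop']])
    if exon_stops and exon_stops[-1] == exon['start'] - 1:
        exon_stops[-1] = exon['stop']  # merge with the previous entry
    else:
        exon_starts.append(exon['start'])
        exon_stops.append(exon['stop'])

assert (exon_starts, exon_stops) == ([100, 500], [300, 640])
assert (min(cds_positions), max(cds_positions)) == (150, 640)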
Example No. 34
    def process(self):
        """
        Start the mutalyzer Batch Processing. This method retrieves all
        jobs from the database and processes them in a round-robin fashion.
        After each round, the process checks whether new jobs were added
        during the last processing round and repeats. This continues until
        no jobs are left to process.

        If during this process the {stop} method is called, the current
        job item is completed and we return.

        This method uses two database tables, BatchJob and BatchQueue.

        The jobList is an array of tuples with three elements
            - jobID       ;   The ID of the job
            - jobType     ;   The type of the job
            - argument1   ;   Currently only used for the ConversionChecker
                            to send the build version.

        If the jobList is not empty, the method will iterate once over the
        list and fetch the first entry of a job from the database table
        BatchQueue. This request returns both the input for the batch and
        the flags for the job.

        #Flags
        A job can be flagged in three ways:
          - A       ;   Altered - this means that the input is altered
                        before execution. This could be the case if an
                        entry uses an accession number without a version.
                        If a version is retrieved from the NCBI, all
                        further occurrences of that accession will be
                        replaced by the accession with the version number.
          - S       ;   Skipped - this means that this batch entry will be
                        skipped by the batch process. This could be the
                        case if the user made a mistake that could not be
                        auto-fixed; all further occurrences of the mistake
                        will then be skipped.
          - C       ;   Continue - this means the input does not end the
                        current row, so no new row in the output should
                        be started.

        A flag consists of an A, S, or C followed by a digit, which
        refers to the reason for the alteration / skip.
        """
        while not self.stopped():
            batch_jobs = BatchJob.query

            if batch_jobs.count() == 0:
                break

            for batch_job in batch_jobs:
                if self.stopped():
                    break

                batch_queue_item = queries.pop_batch_queue_item(batch_job)

                if batch_queue_item is not None:
                    item, flags = batch_queue_item

                    if batch_job.job_type == 'name-checker':
                        self._processNameBatch(batch_job, item, flags)
                    elif batch_job.job_type == 'syntax-checker':
                        self._processSyntaxCheck(batch_job, item, flags)
                    elif batch_job.job_type == 'position-converter':
                        self._processConversion(batch_job, item, flags)
                    elif batch_job.job_type == 'snp-converter':
                        self._processSNP(batch_job, item, flags)
                    else:
                        # Unknown job type, should never happen.
                        # Todo: Log some screaming message.
                        pass

                else:
                    print('Job %s finished, email %s file %s' %
                          (batch_job.id, batch_job.email, batch_job.id))
                    self.__sendMail(batch_job.email, batch_job.download_url)
                    session.delete(batch_job)
                    session.commit()
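The docstring describes flags as a letter A, S, or C followed by a digit. Decoding such a string is straightforward; the helper below is purely illustrative (it is not part of the scheduler) and assumes that several flags may be concatenated, e.g. 'A1S2':

# Illustrative decoder for the A/S/C-plus-digit flag convention documented
# above. Hypothetical helper; assumes flags may be concatenated ('A1S2').
def decode_flags(flags):
    meanings = {'A': 'altered', 'S': 'skipped', 'C': 'continue'}
    return [(meanings[flags[i]], int(flags[i + 1]))
            for i in range(0, len(flags) - 1, 2)]

print(decode_flags('A1S2'))  # [('altered', 1), ('skipped', 2)]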
Example No. 35
    def retrieveslice(self, accno, start, stop, orientation):
        """
        Retrieve a slice of a chromosome.
        If the arguments are recognised (found in the internal database),
        we check whether the associated file is still present and, if so,
        return its UD number.
        If the arguments are recognised but no file was found, we download
        the new slice and update the hash (and log if the hash changes).
        If the arguments are not recognised, we download the new slice and
        make a new UD number.
        The content of the slice is placed in the cache with the UD number
        as filename.

        @arg accno: The accession number of the chromosome
        @type accno: unicode
        @arg start: Start position of the slice
        @type start: integer
        @arg stop: End position of the slice.
        @type stop: integer
        @arg orientation:
        Orientation of the slice:
            - 1 ; Forward
            - 2 ; Reverse complement
        @type orientation: integer

        @return: A UD number
        @rtype: unicode
        """

        # Not a valid slice (positions are one-based and inclusive).
        if start > stop:
            return None

        # The slice cannot be too big.
        if stop - start + 1 > settings.MAX_FILE_SIZE:
            return None

        slice_orientation = ['forward', 'reverse'][orientation - 1]

        # Check whether we have seen this slice before.
        try:
            reference = Reference.query.filter_by(
                slice_accession=accno, slice_start=start, slice_stop=stop,
                slice_orientation=slice_orientation).one()
        except NoResultFound:
            reference = None
        else:
            if os.path.isfile(self._nametofile(reference.accession)):
                # It's still present.
                return reference.accession

        # It's not present, so download it.
        try:
            handle = Entrez.efetch(db='nuccore', rettype='gb', retmode='text',
                                   id=accno, seq_start=start, seq_stop=stop,
                                   strand=orientation)
            raw_data = handle.read()
            handle.close()
        except (IOError, urllib2.HTTPError, HTTPException) as e:
            self._output.addMessage(__file__, -1, 'INFO',
                                    'Error connecting to Entrez nuccore database: %s' % unicode(e))
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not retrieve slice.')
            return None

        # Calculate the hash of the downloaded file.
        md5sum = self._calcHash(raw_data)

        if reference is not None:
            # We have seen this one before.
            currentmd5sum = reference.checksum

            if md5sum != currentmd5sum:
                self._output.addMessage(
                    __file__, -1, 'WHASH',
                    'Warning: Hash of %s changed from %s to %s.' % (
                        reference.accession, currentmd5sum, md5sum))
                Reference.query.filter_by(
                    accession=reference.accession).update(
                        {'checksum': md5sum})
                session.commit()
        else:
            # We haven't seen it before, so give it a name.
            UD = self._newUD()
            slice_orientation = ['forward', 'reverse'][orientation - 1]
            reference = Reference(UD, md5sum, slice_accession=accno,
                                  slice_start=start, slice_stop=stop,
                                  slice_orientation=slice_orientation)
            session.add(reference)
            session.commit()

        if self.write(raw_data, reference.accession, 0):
            return reference.accession
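`_calcHash` is not part of this excerpt. Given that its result is stored in the `checksum` column and the surrounding code treats it as an md5 sum, a plausible stand-in is simply the md5 hex digest of the raw record, as sketched below; this is an assumption, not the excerpted method:

import hashlib

def calc_hash(raw_data):
    # Plausible stand-in for the _calcHash helper used above: the md5 hex
    # digest of the raw record bytes (assumed, not excerpted).
    return hashlib.md5(raw_data).hexdigest()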
Example No. 37
    def downloadrecord(self, url, name=None):
        """
        Download an LRG record from a URL.

        :arg unicode url: Location of the LRG record.

        :returns: The full path to the file, or None in case of failure.
        :rtype: unicode
        """
        lrg_id = name or os.path.splitext(os.path.split(url)[1])[0]
        # if not lrg_id.startswith('LRG'):
        #     return None
        filename = self._name_to_file(lrg_id)

        # TODO: Properly read the file contents to a unicode string and write
        # it utf-8 encoded.
        handle = urllib2.urlopen(url)
        info = handle.info()

        if (info['Content-Type'] == 'application/xml' and
                'Content-Length' in info):
            # Looks like a valid LRG file.

            length = int(info['Content-Length'])
            if 512 < length < settings.MAX_FILE_SIZE:
                raw_data = handle.read()
                handle.close()

                # Do an md5 check.
                md5sum = self._calculate_hash(raw_data)
                try:
                    reference = Reference.query.filter_by(
                        accession=lrg_id).one()
                    md5_db = reference.checksum
                except NoResultFound:
                    md5_db = None

                if md5_db is None:
                    reference = Reference(lrg_id, md5sum, download_url=url)
                    session.add(reference)
                    session.commit()
                elif md5sum != md5_db:
                    # Hash has changed for the LRG ID.
                    self._output.addMessage(
                        __file__, -1, 'WHASH',
                        'Warning: Hash of {} changed from {} to {}.'.format(
                            lrg_id, md5_db, md5sum))
                    Reference.query.filter_by(accession=lrg_id).update(
                        {'checksum': md5sum})
                    session.commit()
                else:
                    # Hash the same as in db.
                    pass

                if not os.path.isfile(filename):
                    return self.write(raw_data, lrg_id)
                else:
                    # This can only occur if synchronous calls to mutalyzer
                    # are made to recover a file that did not exist. Still
                    # leaves a window in between the check and the write.
                    return filename
            else:
                self._output.addMessage(
                    __file__, 4, 'EFILESIZE',
                    'Filesize is not within the allowed boundaries.')
        else:
            self._output.addMessage(
                __file__, 4, 'ERECPARSE', 'This is not an LRG record.')
        handle.close()
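A usage sketch for this method; `retriever` stands for an instance of the surrounding class, and the URL is only an illustrative location of an LRG record:

# Hypothetical call; `retriever` is an instance of the surrounding class
# and the URL is an invented example location.
path = retriever.downloadrecord(
    'ftp://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_1.xml')
if path is None:
    print('Download failed; see the EFILESIZE/ERECPARSE output messages.')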
Example No. 39
    def retrieveslice(self, accno, start, stop, orientation):
        """
        Retrieve a slice of a chromosome.
        If the arguments are recognised (found in the internal database),
        we check whether the associated file is still present and, if so,
        return its UD number.
        If the arguments are recognised but no file was found, we download
        the new slice and update the hash (and log if the hash changes).
        If the arguments are not recognised, we download the new slice and
        make a new UD number.
        The content of the slice is placed in the cache with the UD number
        as filename.

        :arg unicode accno: The accession number of the chromosome.
        :arg int start: Start position of the slice (one-based, inclusive, in
          reference orientation).
        :arg int stop: End position of the slice (one-based, inclusive, in
          reference orientation).
        :arg int orientation: Orientation of the slice:
            - 1 ; Forward.
            - 2 ; Reverse complement.

        :returns: A UD number.
        :rtype: unicode
        """
        # Not a valid slice.
        if start > stop:
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not retrieve slice for start '
                                    'position greater than stop position.')
            return None

        # The slice can not be too big.
        if stop - start + 1 > settings.MAX_FILE_SIZE:
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not retrieve slice (request '
                                    'exceeds maximum of %d bases)' %
                                    settings.MAX_FILE_SIZE)
            return None

        slice_orientation = ['forward', 'reverse'][orientation - 1]

        # Check whether we have seen this slice before.
        try:
            reference = Reference.query.filter_by(
                slice_accession=accno, slice_start=start, slice_stop=stop,
                slice_orientation=slice_orientation).one()
        except NoResultFound:
            reference = None
        else:
            if os.path.isfile(self._name_to_file(reference.accession)):
                # It's still present.
                return reference.accession

        # It's not present, so download it.
        try:
            # EFetch `seq_start` and `seq_stop` are one-based, inclusive, and
            # in reference orientation.
            handle = Entrez.efetch(
                db='nuccore', rettype='gb', retmode='text', id=accno,
                seq_start=start, seq_stop=stop, strand=orientation)
            raw_data = handle.read()
            handle.close()
        except (IOError, urllib2.HTTPError, HTTPException) as e:
            self._output.addMessage(
                __file__, -1, 'INFO',
                'Error connecting to Entrez nuccore database: {}'.format(
                    unicode(e)))
            self._output.addMessage(
                __file__, 4, 'ERETR', 'Could not retrieve slice.')
            return None

        # Calculate the hash of the downloaded file.
        md5sum = self._calculate_hash(raw_data)

        if reference is not None:
            # We have seen this one before.
            current_md5sum = reference.checksum

            if md5sum != current_md5sum:
                self._output.addMessage(
                    __file__, -1, 'WHASH',
                    'Warning: Hash of {} changed from {} to {}.'.format(
                        reference.accession, current_md5sum, md5sum))
                Reference.query.filter_by(
                    accession=reference.accession).update({'checksum': md5sum})
                session.commit()
        else:
            # We haven't seen it before, so give it a name.
            ud = self._new_ud()
            slice_orientation = ['forward', 'reverse'][orientation - 1]
            reference = Reference(
                ud, md5sum, slice_accession=accno, slice_start=start,
                slice_stop=stop, slice_orientation=slice_orientation)
            session.add(reference)
            session.commit()

        if self.write(raw_data, reference.accession, 0):
            return reference.accession
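A usage sketch for this revision, again with an assumed `retriever` instance. Positions are one-based and inclusive, and orientation 2 requests the reverse complement:

# Hypothetical call; `retriever` is an instance of the surrounding class.
# Slice NC_000011.9 over an invented region, reverse complement.
ud = retriever.retrieveslice('NC_000011.9', 111955524, 111966518, 2)
if ud is not None:
    print('Slice cached as %s' % ud)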
Ejemplo n.º 40
0
    def process(self):
        """
        Start the mutalyzer Batch Processing. This method retrieves all jobs
        jobs from the database and processes them in a roundrobin fashion.
        After each round, the process checks if new jobs are added during the
        last processing round and repeats. This continue until no jobs are
        left to process.

        If during this process the {stop} method is called, the current
        job item is completed and we return.

        This method uses two database tables, BatchJob and BatchQueue.

        The jobList is an array of tuples with three elements
            - jobID       ;   The ID of the job
            - jobType     ;   The type of the job
            - argument1   ;   Currently only used for the ConversionChecker
                            to send the build version.

        If the jobList is not empty, the method will iterate once over the
        list and fetch the first entry of a job from the database table
        BatchQueue. This request returns both the input for the batch and
        the flags for the job.

        #Flags
        A job can be flagged in three ways:
          - A       ;   Altered - this means that the input is altered
                        before execution. This could be the case if an
                        entry uses an accession number without a version.
                        If a version is retrieved from the NCBI, all
                        further occurences of that accession will be
                        replaced by the accession with version number.
          - S       ;   Skipped - this means that this batchentry will be
                        skipped by the batchprocess. This could be the
                        case if the user made a mistake that could not be
                        auto fixed and henceforth all occurences of the
                        mistake will be skipped.
          - C       ;   Continue - this means the input does not end the
                        current row, so no new row in the output should
                        be started.

        A Flag consists of either an A, S or C followed by a digit, which
        refers to the reason of alteration / skip.
        """
        while not self.stopped():
            batch_jobs = BatchJob.query

            if batch_jobs.count() == 0:
                break

            for batch_job in batch_jobs:
                if self.stopped():
                    break

                batch_queue_item = queries.pop_batch_queue_item(batch_job)

                if batch_queue_item is not None:
                    item, flags = batch_queue_item

                    if batch_job.job_type == 'name-checker':
                        self._processNameBatch(batch_job, item, flags)
                    elif batch_job.job_type == 'syntax-checker':
                        self._processSyntaxCheck(batch_job, item, flags)
                    elif batch_job.job_type == 'position-converter':
                        self._processConversion(batch_job, item, flags)
                    elif batch_job.job_type == 'snp-converter':
                        self._processSNP(batch_job, item, flags)
                    else:
                        # Unknown job type, should never happen.
                        # Todo: Log some screaming message.
                        pass

                else:
                    print ('Job %s finished, email %s file %s'
                           % (batch_job.id, batch_job.email, batch_job.id))
                    self.__sendMail(batch_job.email, batch_job.download_url)
                    session.delete(batch_job)
                    session.commit()
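The flag grammar described in the docstring above (an A, S or C followed by a
digit encoding the reason) is simple to decode mechanically. A minimal sketch
of such a decoder; this helper is illustrative and not part of Mutalyzer::

    import re

    # Illustrative helper (not in Mutalyzer): decode a flags string such
    # as 'A2' or 'S1C0' into (flag, reason) pairs.
    _FLAG_PATTERN = re.compile(r'([ASC])(\d)')

    def parse_flags(flags):
        """Return a list of (flag, reason) tuples, e.g. [('S', 1)]."""
        return [(letter, int(reason))
                for letter, reason in _FLAG_PATTERN.findall(flags or '')]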
Example #41
    def retrieveslice(self, accno, start, stop, orientation) :
        """
        Retrieve a slice of a chromosome.
        If the arguments are recognised (found in the internal database),
        we look if the associated file is still present and if so: return
        its UD number.
        If the arguments are recognised but no file was found, we download
        the new slice and update the hash (and log if the hash changes).
        If the arguments are not recognised, we download the new slice and
        make a new UD number.
        The content of the slice is placed in the cache with the UD number
        as filename.

        @arg accno: The accession number of the chromosome.
        @type accno: unicode
        @arg start: Start position of the slice.
        @type start: integer
        @arg stop: End position of the slice.
        @type stop: integer
        @arg orientation: Orientation of the slice:
            - 1 ; Forward
            - 2 ; Reverse complement
        @type orientation: integer

        @return: A UD number.
        @rtype: unicode
        """

        # Not a valid slice.
        if start >= stop :
            return None

        # The slice cannot be too big.
        if stop - start > settings.MAX_FILE_SIZE:
            return None

        slice_orientation = ['forward', 'reverse'][orientation - 1]

        # Check whether we have seen this slice before.
        try:
            reference = Reference.query.filter_by(
                slice_accession=accno, slice_start=start, slice_stop=stop,
                slice_orientation=slice_orientation).one()
        except NoResultFound:
            reference = None
        else:
            if os.path.isfile(self._nametofile(reference.accession)) : # It's still present.
                return reference.accession

        # It's not present, so download it.
        try:
            handle = Entrez.efetch(db='nuccore', rettype='gb', retmode='text',
                                   id=accno, seq_start=start, seq_stop=stop,
                                   strand=orientation)
            raw_data = handle.read()
            handle.close()
        except (IOError, urllib2.HTTPError, HTTPException) as e:
            self._output.addMessage(__file__, -1, 'INFO',
                                    'Error connecting to Entrez nuccore database: %s' % unicode(e))
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not retrieve slice.')
            return None

        # Calculate the hash of the downloaded file.
        md5sum = self._calcHash(raw_data)

        if reference is not None: # We have seen this one before.
            currentmd5sum = reference.checksum

            if md5sum != currentmd5sum :
                self._output.addMessage(__file__, -1, "WHASH",
                    "Warning: Hash of %s changed from %s to %s." % (
                    reference.accession, currentmd5sum, md5sum))
                Reference.query.filter_by(accession=reference.accession).update({'checksum': md5sum})
                session.commit()
            #if
        else : # We haven't seen it before, so give it a name.
            UD = self._newUD()
            reference = Reference(UD, md5sum, slice_accession=accno,
                                  slice_start=start, slice_stop=stop,
                                  slice_orientation=slice_orientation)
            session.add(reference)
            session.commit()
        #else

        if self.write(raw_data, reference.accession, 0):
            return reference.accession
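`_calcHash` itself is not shown in this example. Judging from the `md5sum`
variable names and the `checksum` comparisons above, it presumably returns the
MD5 hex digest of the raw record. A sketch under that assumption (Python 2,
like the code above)::

    import hashlib

    def _calcHash(self, raw_data):
        # Assumption: Reference.checksum stores the MD5 hex digest of the
        # raw record, as the `md5sum` variable names suggest.
        return unicode(hashlib.md5(raw_data).hexdigest())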
Example #42
    def downloadrecord(self, url, name=None):
        """
        Download an LRG record from a URL.

        :arg unicode url: Location of the LRG record.

        :returns: The full path to the file, or `None` in case of failure.
        :rtype: unicode
        """
        lrg_id = name or os.path.splitext(os.path.split(url)[1])[0]
        # if not lrg_id.startswith('LRG'):
        #     return None
        filename = self._name_to_file(lrg_id)

        # TODO: Properly read the file contents to a unicode string and write
        # it utf-8 encoded.
        handle = urllib2.urlopen(url)
        info = handle.info()

        if (info['Content-Type'] == 'application/xml'
                and 'Content-length' in info):
            # Looks like a valid LRG file.

            length = int(info['Content-Length'])
            if 512 < length < settings.MAX_FILE_SIZE:
                raw_data = handle.read()
                handle.close()

                # Do an md5 check.
                md5sum = self._calculate_hash(raw_data)
                try:
                    reference = Reference.query.filter_by(
                        accession=lrg_id).one()
                    md5_db = reference.checksum
                except NoResultFound:
                    md5_db = None

                if md5_db is None:
                    # Note: The abstraction seems a bit off here, but we
                    # prefer to set `Reference.source` to `lrg` and not to
                    # `url`, since the former is more specific.
                    reference = Reference(lrg_id, md5sum, 'lrg')
                    session.add(reference)
                    session.commit()
                elif md5sum != md5_db:
                    # Hash has changed for the LRG ID.
                    self._output.addMessage(
                        __file__, -1, 'WHASH',
                        'Warning: Hash of {} changed from {} to {}.'.format(
                            lrg_id, md5_db, md5sum))
                    Reference.query.filter_by(accession=lrg_id).update(
                        {'checksum': md5sum})
                    session.commit()
                else:
                    # Hash the same as in db.
                    pass

                if not os.path.isfile(filename):
                    return self.write(raw_data, lrg_id)
                else:
                    # This can only occur if synchronous calls to Mutalyzer
                    # are made to recover a file that did not exist. Still
                    # leaves a window in between the check and the write.
                    return filename
            else:
                self._output.addMessage(
                    __file__, 4, 'EFILESIZE',
                    'Filesize is not within the allowed boundaries.')
        else:
            self._output.addMessage(__file__, 4, 'ERECPARSE',
                                    'This is not an LRG record.')
        handle.close()
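For illustration, a call could look like the following; `retriever` stands for
a configured retriever instance (an assumption, not shown in this example),
and the URL is just an example location of an LRG record::

    path = retriever.downloadrecord(
        'ftp://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_1.xml')
    if path is None:
        # Failure details were reported on the output object
        # (EFILESIZE or ERECPARSE).
        pass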
Example #43
    def downloadrecord(self, url, name = None) :
        """
        Download an LRG record from a URL.

        @arg url: Location of the LRG record
        @type url: unicode

        @return:
            - filename    ; The full path to the file
            - None        ; in case of failure
        @rtype: unicode
        """

        lrgID = name or os.path.splitext(os.path.split(url)[1])[0]
        #if not lrgID.startswith("LRG"):
        #    return None
        filename = self._nametofile(lrgID)

        # Todo: Properly read the file contents to a unicode string and write
        #   it utf-8 encoded.
        handle = urllib2.urlopen(url)
        info = handle.info()
        if info["Content-Type"] == "application/xml" and info.has_key("Content-length"):

            length = int(info["Content-Length"])
            if 512 < length < settings.MAX_FILE_SIZE:
                raw_data = handle.read()
                handle.close()

                #Do an md5 check
                md5sum = self._calcHash(raw_data)
                try:
                    reference = Reference.query.filter_by(accession=lrgID).one()
                    md5db = reference.checksum
                except NoResultFound:
                    md5db = None

                if md5db is None:
                    reference = Reference(lrgID, md5sum, download_url=url)
                    session.add(reference)
                    session.commit()
                elif md5sum != md5db:       #hash has changed for the LRG ID
                    self._output.addMessage(__file__, -1, "WHASH",
                        "Warning: Hash of %s changed from %s to %s." % (
                        lrgID, md5db, md5sum))
                    Reference.query.filter_by(accession=lrgID).update({'checksum': md5sum})
                    session.commit()
                else:                       #hash the same as in db
                    pass

                if not os.path.isfile(filename) :
                    return self.write(raw_data, lrgID)
                else:
                    # This can only occur if synchronous calls to Mutalyzer
                    # are made to recover a file that did not exist. Still
                    # leaves a window in between the check and the write.
                    return filename
            #if
            else :
                self._output.addMessage(__file__, 4, "EFILESIZE",
                    "Filesize is not within the allowed boundaries.")
        #if
        else :
            self._output.addMessage(__file__, 4, "ERECPARSE",
                                     "This is not an LRG record.")
        handle.close()
Example #45
# Standard-library imports used below; application-level names (session,
# TranscriptMapping, MapviewSortError) come from the surrounding module.
from collections import defaultdict
from itertools import groupby
from operator import itemgetter

def import_from_mapview_file(assembly, mapview_file, group_label):
    """
    Import transcript mappings from an NCBI mapview file.

    We require that this file is first sorted on the `feature_id` column
    (#11), which always contains the gene identifier, and then on the
    `chromosome` column (#2).

        sort -t $'\t' -k 11,11 -k 2,2 seq_gene.md > seq_gene.by_gene.md

    Raises :exc:`ValueError` if `mapview_file` is not sorted this way.

    The NCBI mapping file consists of entries, one per line, in order of
    their location in the genome (more specifically by start location).
    Every entry has a 'group_label' column, denoting the assembly it is
    from. We only use entries where this value equals the `group_label`
    argument.

    There are four types of entries (for our purposes):
    - Gene: Name, identifier, and location of a gene.
    - Transcript: Name, gene id, and location of a transcript.
    - UTR: Location and transcript of a non-coding exon (or part of it).
    - CDS: Location and transcript of a coding exon (or part of it).

    A bit troublesome for us is that exons are split in UTR exons and CDS
    exons, with exons overlapping the UTR/CDS border defined as two
    separate entries (one of type UTR and one of type CDS).

    Another minor annoyance is that some transcripts (~ 15) are split over
    two contigs (NT_*). In that case, they are defined by two entries in
    the file, where we should merge them by taking the start position of
    the first and the stop position of the second.

    To complicate this annoyance, some genes (e.g. in the PAR) are mapped
    on both the X and Y chromosomes, but stored in the file just like the
    transcripts split over two contigs. However, these ones should of
    course not be merged.

    Our strategy is to sort by gene and chromosome and process the file
    grouped by these two fields.

    For transcripts without any UTR and CDS entries (seems to happen for
    predicted genes), we generate one exon spanning the entire transcript.

    All positions are one-based, inclusive, and that is what we also use in
    our database.
    """
    columns = [
        'taxonomy', 'chromosome', 'start', 'stop', 'orientation', 'contig',
        'ctg_start', 'ctg_stop', 'ctg_orientation', 'feature_name',
        'feature_id', 'feature_type', 'group_label', 'transcript',
        'evidence_code'
    ]

    chromosomes = assembly.chromosomes.all()

    def read_records(mapview_file):
        for line in mapview_file:
            if line.startswith('#'):
                continue
            record = dict(zip(columns, line.rstrip().split('\t')))

            # Only use records from the given assembly.
            if record['group_label'] != group_label:
                continue

            # Only use records on chromosomes we know.
            try:
                record['chromosome'] = next(c for c in chromosomes
                                            if c.name == 'chr' +
                                            record['chromosome'])
            except StopIteration:
                continue

            record['start'] = int(record['start'])
            record['stop'] = int(record['stop'])

            yield record

    def build_mappings(records):
        # We structure the records per transcript and per record type. This is
        # generalized to a list of records for each type, but we expect only
        # one GENE record (with `-` as transcript value).
        # Note that there can be more than one RNA record per transcript if it
        # is split over different reference contigs.
        by_transcript = defaultdict(lambda: defaultdict(list))
        for r in records:
            by_transcript[r['transcript']][r['feature_type']].append(r)

        gene = by_transcript['-']['GENE'][0]['feature_name']

        for transcript, by_type in by_transcript.items():
            if transcript == '-':
                continue
            accession, version = transcript.split('.')
            version = int(version)
            chromosome = by_type['RNA'][0]['chromosome']
            orientation = ('reverse' if by_type['RNA'][0]['orientation'] == '-'
                           else 'forward')
            start = min(t['start'] for t in by_type['RNA'])
            stop = max(t['stop'] for t in by_type['RNA'])

            exon_starts = []
            exon_stops = []
            cds_positions = []
            for exon in sorted(by_type['UTR'] + by_type['CDS'],
                               key=itemgetter('start')):
                if exon_stops and exon_stops[-1] > exon['start'] - 1:
                    # This exon starts before the end of the previous exon. We
                    # have no idea what to do in this case, so we ignore it.
                    # The number of transcripts affected is very small (e.g.,
                    # NM_031860.1 and NM_001184961.1 in the GRCh37 assembly).
                    continue
                if exon['feature_type'] == 'CDS':
                    cds_positions.extend([exon['start'], exon['stop']])
                if exon_stops and exon_stops[-1] == exon['start'] - 1:
                    # This exon must be merged with the previous one because
                    # it is split over two entries (a CDS part and a UTR part
                    # or split over different reference contigs).
                    exon_stops[-1] = exon['stop']
                else:
                    exon_starts.append(exon['start'])
                    exon_stops.append(exon['stop'])

            if cds_positions:
                cds = min(cds_positions), max(cds_positions)
            else:
                cds = None

            # If no exons are annotated, we create one spanning the entire
            # transcript.
            if not exon_starts:
                exon_starts = [start]
                exon_stops = [stop]

            yield TranscriptMapping.create_or_update(chromosome,
                                                     'refseq',
                                                     accession,
                                                     gene,
                                                     orientation,
                                                     start,
                                                     stop,
                                                     exon_starts,
                                                     exon_stops,
                                                     'ncbi',
                                                     cds=cds,
                                                     version=version)

    processed_keys = set()

    for key, records in groupby(read_records(mapview_file),
                                itemgetter('feature_id', 'chromosome')):
        if key in processed_keys:
            raise MapviewSortError('Mapview file must be sorted by feature_id '
                                   'and chromosome (try `sort -k 11,11 -k '
                                   '2,2`)')
        processed_keys.add(key)

        for mapping in build_mappings(records):
            session.add(mapping)

    session.commit()
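A sketch of how this importer might be driven, under the assumption that
`assembly` was obtained from the database beforehand and that the file was
sorted as described in the docstring; the group label shown is the form used
for GRCh37 primary-assembly entries::

    # Illustrative usage; `assembly` is assumed to be a previously queried
    # assembly object for GRCh37.
    with open('seq_gene.by_gene.md') as mapview_file:
        import_from_mapview_file(assembly, mapview_file,
                                 'GRCh37.p2-Primary Assembly')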
Example #46
    def retrieveslice(self, accno, start, stop, orientation):
        """
        Retrieve a slice of a chromosome.
        If the arguments are recognised (found in the internal database),
        we look if the associated file is still present and if so: return
        its UD number.
        If the arguments are recognised but no file was found, we download
        the new slice and update the hash (and log if the hash changes).
        If the arguments are not recognised, we download the new slice and
        make a new UD number.
        The content of the slice is placed in the cache with the UD number
        as filename.

        :arg unicode accno: The accession number of the chromosome.
        :arg int start: Start position of the slice (one-based, inclusive, in
          reference orientation).
        :arg int stop: End position of the slice (one-based, inclusive, in
          reference orientation).
        :arg int orientation: Orientation of the slice:
            - 1 ; Forward.
            - 2 ; Reverse complement.

        :returns: A UD number.
        :rtype: unicode
        """
        # Not a valid slice.
        if start > stop:
            self._output.addMessage(
                __file__, 4, 'ERETR', 'Could not retrieve slice for start '
                'position greater than stop position.')
            return None

        # The slice cannot be too big.
        if stop - start + 1 > settings.MAX_FILE_SIZE:
            self._output.addMessage(
                __file__, 4, 'ERETR', 'Could not retrieve slice (request '
                'exceeds maximum of %d bases)' % settings.MAX_FILE_SIZE)
            return None

        # Value of the Reference.source_data field for this slice.
        source_data = '{}:{}:{}:{}'.format(
            accno, start, stop, ['forward', 'reverse'][orientation - 1])

        # Check whether we have seen this slice before.
        reference = Reference.query.filter_by(source='ncbi_slice',
                                              source_data=source_data).first()
        if reference and os.path.isfile(self._name_to_file(
                reference.accession)):
            # It's still present.
            return reference.accession

        # It's not present, so download it.
        try:
            # EFetch `seq_start` and `seq_stop` are one-based, inclusive, and
            # in reference orientation.
            handle = Entrez.efetch(db='nuccore',
                                   rettype='gbwithparts',
                                   retmode='text',
                                   id=accno,
                                   seq_start=start,
                                   seq_stop=stop,
                                   strand=orientation)
            raw_data = handle.read()
            handle.close()
        except (IOError, urllib2.HTTPError, HTTPException) as e:
            self._output.addMessage(
                __file__, -1, 'INFO',
                'Error connecting to Entrez nuccore database: {}'.format(
                    unicode(e)))
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not retrieve slice.')
            return None

        # Calculate the hash of the downloaded file.
        md5sum = self._calculate_hash(raw_data)

        if reference is not None:
            # We have seen this one before.
            current_md5sum = reference.checksum

            if md5sum != current_md5sum:
                self._output.addMessage(
                    __file__, -1, 'WHASH',
                    'Warning: Hash of {} changed from {} to {}.'.format(
                        reference.accession, current_md5sum, md5sum))
                Reference.query.filter_by(
                    accession=reference.accession).update({'checksum': md5sum})
                session.commit()
        else:
            # We haven't seen it before, so give it a name.
            ud = self._new_ud()
            reference = Reference(ud,
                                  md5sum,
                                  source='ncbi_slice',
                                  source_data=source_data)
            session.add(reference)
            session.commit()

        if self.write(raw_data, reference.accession, 0):
            return reference.accession
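For illustration, retrieving a forward-strand slice of roughly two kilobases
could look like this; `retriever` stands for a configured retriever instance
(an assumption), and NC_000001.10 is an example GRCh37 chromosome 1
accession::

    ud = retriever.retrieveslice('NC_000001.10', 1000000, 1002000, 1)
    if ud is not None:
        # The slice is now cached under this UD number.
        print ud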