Example 1
    def uploadrecord(self, raw_data):
        """
        Write an uploaded record to a file.
        If the uploaded file is recognised by its hash, the old UD number
        is used.

        @arg raw_data: A GenBank record.
        @type raw_data: byte string

        @return: Accession number for the uploaded file.
        @rtype: unicode
        """
        md5sum = self._calcHash(raw_data)

        try:
            reference = Reference.query.filter_by(checksum=md5sum).one()
        except NoResultFound:
            UD = self._newUD()
            if self.write(raw_data, UD, 0):
                reference = Reference(UD, md5sum)
                session.add(reference)
                session.commit()
                return UD
        else:
            if os.path.isfile(self._nametofile(reference.accession)):
                return reference.accession
            else:
                return (self.write(raw_data, reference.accession, 0) and
                        reference.accession)
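A minimal usage sketch of the deduplication behaviour (the retriever setup is an assumption; `GenBankRetriever` and `output` are not shown above): uploading the same record twice yields the same UD number, because the second call finds the existing checksum in the database.

retriever = GenBankRetriever(output)  # hypothetical setup

with open('record.gb', 'rb') as genbank_file:
    raw_data = genbank_file.read()

first = retriever.uploadrecord(raw_data)
second = retriever.uploadrecord(raw_data)
# The second upload is matched by its MD5 checksum, so no new UD number
# is generated.
assert first == second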
Example 2
    def addJob(self,
               email,
               queue,
               columns,
               job_type,
               argument=None,
               create_download_url=None):
        """
        Add a job to the Database and start the BatchChecker.

        @arg email:         e-mail address of batch supplier
        @type email:        unicode
        @arg queue:         A list of jobs
        @type queue:        list
        @arg columns:       The number of columns.
        @type columns:      int
        @arg job_type:      The type of Batch Job that should be run
        @type job_type:
        @arg argument:      Batch Arguments, for now only build info
        @type argument:
        @arg create_download_url: Function accepting a result_id and returning
                                  the URL for downloading the batch job
                                  result. Can be None.
        @type create_download_url: function

        @return: result_id
        @rtype:
        """
        # Add jobs to the database
        batch_job = BatchJob(job_type, email=email, argument=argument)
        if create_download_url:
            batch_job.download_url = create_download_url(batch_job.result_id)
        session.add(batch_job)

        for i, inputl in enumerate(queue):
            # NOTE:
            # This is a very dirty way to skip entries before they are fed
            # to the batch processes. This is needed for e.g. an empty line
            # or because the File Module noticed wrong formatting. These lines
            # used to be discarded but are now preserved by the escape string.
            # The benefit of this is that the user's input will match the
            # output in terms of input lines and output lines.
            if inputl.startswith("~!"):  # Dirty escape
                inputl = inputl[2:]
                if inputl:
                    flag = "S0"  # Flag for wrong format
                else:
                    flag = "S9"  # Flag for empty line
                    inputl = " "  # Database doesn't like an empty input field
            else:
                flag = None
            if (i + 1) % columns:
                # Add flag for continuing the current row
                flag = '%s%s' % (flag if flag else '', 'C0')

            item = BatchQueueItem(batch_job, inputl, flags=flag)
            session.add(item)

        session.commit()
        return batch_job.result_id
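A hedged usage sketch of the escape convention described in the NOTE above (`scheduler`, the e-mail address, and the job type value are made-up names): lines prefixed with "~!" are stored with a skip flag instead of being processed, so input and output stay aligned line by line.

queue = [
    u'NM_003002.2:c.274G>T',  # a normal entry
    u'~!this line had a formatting problem',  # stored with flag S0
    u'~!',  # an empty input line, stored as ' ' with flag S9
]
result_id = scheduler.addJob(u'user@example.com', queue, columns=1,
                             job_type=u'syntax-checker')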
Example 3
    def _update_db_md5(self, raw_data, name, source):
        """
        :arg str raw_data:
        :arg unicode name:
        :arg unicode source:

        :returns: filename
        :rtype: unicode
        """
        # TODO: Documentation.
        try:
            reference = Reference.query.filter_by(accession=name).one()
            current_md5sum = reference.checksum
        except NoResultFound:
            current_md5sum = None

        if current_md5sum:
            md5sum = self._calculate_hash(raw_data)
            if md5sum != current_md5sum:
                self._output.addMessage(
                    __file__, -1, 'WHASH',
                    'Warning: Hash of {} changed from {} to {}.'.format(
                        name, current_md5sum, md5sum))
                Reference.query.filter_by(accession=name).update(
                    {'checksum': md5sum})
                session.commit()
        else:
            reference = Reference(name, self._calculate_hash(raw_data), source)
            session.add(reference)
            session.commit()
        return self._name_to_file(name)
Example 4
    def uploadrecord(self, raw_data):
        """
        Write an uploaded record to a file.
        If the uploaded file is recognised by its hash, the old UD number
        is used.

        :arg str raw_data: A GenBank record.

        :returns: Accession number for the uploaded file.
        :rtype: unicode
        """
        md5sum = self._calculate_hash(raw_data)

        try:
            reference = Reference.query.filter_by(checksum=md5sum).one()
        except NoResultFound:
            ud = self._new_ud()
            if self.write(raw_data, ud, 0):
                reference = Reference(ud, md5sum)
                session.add(reference)
                session.commit()
                return ud
        else:
            if os.path.isfile(self._name_to_file(reference.accession)):
                return reference.accession
            else:
                return (self.write(raw_data, reference.accession, 0) and
                        reference.accession)
Example 5
    def sync_with_remote(self, remote_wsdl, url_template,
                         days=DEFAULT_CREATED_SINCE_DAYS):
        """
        Synchronize the local cache with the remote cache.

        ::

            >>> wsdl = 'https://mutalyzer.nl/mutalyzer/services/?wsdl'
            >>> template = 'https://mutalyzer.nl/mutalyzer/Reference/{file}'
            >>> self.sync_with_remote(wsdl, template)
            (14, 3)

        :arg remote_wsdl: The url of the remote SOAP WSDL description.
        :type remote_wsdl: unicode
        :arg url_template: Formatting string containing a ``{file}``
          occurrence, see example usage above.
        :type url_template: unicode
        :arg days: Only remote entries added this number of days ago or
          later are considered.
        :type days: int

        :returns: The number of entries added to the local cache and the
          number of cache files downloaded from the remote site.
        :rtype: tuple(int, int)
        """
        self._output.addMessage(__file__, -1, 'INFO', 'Starting cache sync')

        created_since = datetime.today() - timedelta(days=days)
        remote_cache = self.remote_cache(remote_wsdl, created_since)

        inserted = downloaded = 0

        for entry in remote_cache:
            try:
                reference = Reference.query.filter_by(
                    accession=entry['name']).one()
                if reference.checksum is not None:
                    continue
            except NoResultFound:
                pass

            if Reference.query.filter_by(checksum=entry['hash']).count() > 0:
                continue

            reference = Reference(entry['name'], entry['hash'], entry['source'],
                                  source_data=entry['source_data'])
            session.add(reference)
            session.commit()
            inserted += 1
            if entry['source'] == 'upload' and entry['cached']:
                url = url_template.format(file=entry['cached'])
                self.store_remote_file(entry['name'], url)
                downloaded += 1

        self._output.addMessage(__file__, -1, 'INFO',
                                'Inserted %d entries in the cache,'
                                ' downloaded %d files.'
                                % (inserted, downloaded))
        self._output.addMessage(__file__, -1, 'INFO', 'Finished cache sync')

        return inserted, downloaded
Example 6
    def uploadrecord(self, raw_data):
        """
        Write an uploaded record to a file.
        If the uploaded file is recognised by its hash, the old UD number
        is used.

        :arg str raw_data: A GenBank record.

        :returns: Accession number for the uploaded file.
        :rtype: unicode
        """
        md5sum = self._calculate_hash(raw_data)

        try:
            reference = Reference.query.filter_by(checksum=md5sum).one()
        except NoResultFound:
            ud = self._new_ud()
            if self.write(raw_data, ud, 0):
                reference = Reference(ud, md5sum, 'upload')
                session.add(reference)
                session.commit()
                return ud
        else:
            if os.path.isfile(self._name_to_file(reference.accession)):
                return reference.accession
            else:
                return (self.write(raw_data, reference.accession, 0)
                        and reference.accession)
Example 7
    def _update_db_md5(self, raw_data, name, gi):
        """
        :arg str raw_data:
        :arg unicode name:
        :arg unicode gi:

        :returns: filename
        :rtype: unicode
        """
        # TODO: Documentation.
        try:
            reference = Reference.query.filter_by(accession=name).one()
            current_md5sum = reference.checksum
        except NoResultFound:
            current_md5sum = None

        if current_md5sum:
            md5sum = self._calculate_hash(raw_data)
            if md5sum != current_md5sum:
                self._output.addMessage(
                    __file__, -1, 'WHASH',
                    'Warning: Hash of {} changed from {} to {}.'.format(
                        name, current_md5sum, md5sum))
                Reference.query.filter_by(accession=name).update(
                    {'checksum': md5sum})
                session.commit()
        else:
            reference = Reference(
                name, self._calculate_hash(raw_data), geninfo_identifier=gi)
            session.add(reference)
            session.commit()
        return self._name_to_file(name)
Example 8
    def downloadrecord(self, url):
        """
        Download a GenBank record from a URL.
        If the downloaded file is recognised by its hash, the old UD number
        is used.

        :arg unicode url: Location of a GenBank record.

        :returns: UD or None.
        :rtype: unicode
        """
        if not (url.startswith('http://') or url.startswith('https://')
                or url.startswith('ftp://')):
            self._output.addMessage(
                __file__, 4, 'ERECPARSE',
                'Only HTTP(S) or FTP locations are allowed.')
            return None

        handle = urllib2.urlopen(url)
        info = handle.info()
        if info.gettype() == 'text/plain':
            length = int(info['Content-Length'])
            if 512 < length < settings.MAX_FILE_SIZE:
                raw_data = handle.read()
                md5sum = self._calculate_hash(raw_data)

                ud = None
                try:
                    reference = Reference.query.filter_by(
                        checksum=md5sum).one()
                except NoResultFound:
                    ud = self._new_ud()
                    if not os.path.isfile(self._name_to_file(ud)):
                        ud = self.write(raw_data, ud, 0) and ud
                    if ud:
                        # Parsing went OK, add to DB.
                        reference = Reference(ud,
                                              md5sum,
                                              source='url',
                                              source_data=url)
                        session.add(reference)
                        session.commit()
                else:
                    if (os.path.isfile(self._name_to_file(reference.accession))
                            or self.write(raw_data, reference.accession, 0)):
                        ud = reference.accession

                # Returns the UD or None.
                return ud
            else:
                self._output.addMessage(
                    __file__, 4, 'EFILESIZE',
                    'Filesize is not within the allowed boundaries.')
                return None
        else:
            self._output.addMessage(__file__, 4, 'ERECPARSE',
                                    'This is not a GenBank record.')
            return None
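A brief usage sketch (`retriever` is an assumed instance of the class above): the return value is the UD number on success and None for any rejected input, with details reported through the output object.

ud = retriever.downloadrecord(u'https://example.com/NM_003002.gb')
# `ud` is None when the URL scheme is not HTTP(S)/FTP, the content type
# is not text/plain, or the size falls outside (512, MAX_FILE_SIZE).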
Example 9
    def downloadrecord(self, url):
        """
        Download a GenBank record from a URL.
        If the downloaded file is recognised by its hash, the old UD number
        is used.

        :arg unicode url: Location of a GenBank record.

        :returns: UD or None.
        :rtype: unicode
        """
        if not (url.startswith('http://') or url.startswith('https://') or
                url.startswith('ftp://')):
            self._output.addMessage(
                __file__, 4, 'ERECPARSE',
                'Only HTTP(S) or FTP locations are allowed.')
            return None

        handle = urllib2.urlopen(url)
        info = handle.info()
        if info['Content-Type'] == 'text/plain':
            length = int(info['Content-Length'])
            if 512 < length < settings.MAX_FILE_SIZE:
                raw_data = handle.read()
                md5sum = self._calculate_hash(raw_data)

                ud = None
                try:
                    reference = Reference.query.filter_by(
                        checksum=md5sum).one()
                except NoResultFound:
                    ud = self._new_ud()
                    if not os.path.isfile(self._name_to_file(ud)):
                        ud = self.write(raw_data, ud, 0) and ud
                    if ud:
                        # Parsing went OK, add to DB.
                        reference = Reference(ud, md5sum, download_url=url)
                        session.add(reference)
                        session.commit()
                else:
                    if not os.path.isfile(
                            self._name_to_file(reference.accession)):
                        ud = (self.write(raw_data, reference.accession, 0) and
                              reference.accession)

                # Returns the UD or None.
                return ud
            else:
                self._output.addMessage(
                    __file__, 4, 'EFILESIZE',
                    'Filesize is not within the allowed boundaries.')
                return None
        else:
            self._output.addMessage(
                __file__, 4, 'ERECPARSE', 'This is not a GenBank record.')
            return None
Example 10
    def downloadrecord(self, url):
        """
        Download a GenBank record from a URL.
        If the downloaded file is recognised by its hash, the old UD number
        is used.

        @arg url: Location of a GenBank record
        @type url: unicode

        @return: UD or None
        @rtype: unicode
        """
        if not (url.startswith('http://') or
                url.startswith('https://') or
                url.startswith('ftp://')):
            self._output.addMessage(__file__, 4, "ERECPARSE",
                                    "Only HTTP(S) or FTP locations are allowed.")
            return None

        handle = urllib2.urlopen(url)
        info = handle.info()
        if info["Content-Type"] == "text/plain" :
            length = int(info["Content-Length"])
            if 512 < length < settings.MAX_FILE_SIZE:
                raw_data = handle.read()
                md5sum = self._calcHash(raw_data)

                UD = None

                try:
                    reference = Reference.query.filter_by(checksum=md5sum).one()
                except NoResultFound:
                    UD = self._newUD()
                    if not os.path.isfile(self._nametofile(UD)):
                        UD = self.write(raw_data, UD, 0) and UD
                    if UD:  # Parsing went OK, add to DB.
                        reference = Reference(UD, md5sum, download_url=url)
                        session.add(reference)
                        session.commit()
                else:
                    if not os.path.isfile(self._nametofile(reference.accession)):
                        UD = (self.write(raw_data, reference.accession, 0) and
                              reference.accession)

                return UD  # Returns the UD or None.
            else:
                self._output.addMessage(__file__, 4, "EFILESIZE",
                    "Filesize is not within the allowed boundaries.")
                return None
        else:
            self._output.addMessage(__file__, 4, "ERECPARSE",
                                    "This is not a GenBank record.")
            return None
Example 11
def import_from_ucsc_by_gene(assembly, gene):
    """
    Import transcript mappings for a gene from the UCSC.
    """
    connection = MySQLdb.connect(user='******',
                                 host='genome-mysql.cse.ucsc.edu',
                                 db=assembly.alias,
                                 charset='utf8',
                                 use_unicode=True)

    query = """
        SELECT DISTINCT
          acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts,
          exonEnds, name2 AS geneName, chrom, strand, protAcc
        FROM gbStatus, refGene, refLink
        WHERE type = "mRNA"
        AND refGene.name = acc
        AND acc = mrnaAcc
        AND name2 = %s
    """
    parameters = (gene,)

    cursor = connection.cursor()
    cursor.execute(query, parameters)
    result = cursor.fetchall()
    cursor.close()

    # All ranges in the UCSC tables are zero-based and open-ended. We convert
    # this to one-based, inclusive for our database.

    for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds,
         geneName, chrom, strand, protAcc) in result:
        chromosome = assembly.chromosomes.filter_by(name=chrom).one()
        orientation = 'reverse' if strand == '-' else 'forward'
        exon_starts = [int(i) + 1 for i in exonStarts.split(',') if i]
        exon_stops = [int(i) for i in exonEnds.split(',') if i]
        if cdsStart and cdsEnd:
            cds = cdsStart + 1, cdsEnd
        else:
            cds = None
        mapping = TranscriptMapping.create_or_update(chromosome,
                                                     'refseq',
                                                     acc,
                                                     geneName,
                                                     orientation,
                                                     txStart + 1,
                                                     txEnd,
                                                     exon_starts,
                                                     exon_stops,
                                                     'ucsc',
                                                     cds=cds,
                                                     version=int(version))
        session.add(mapping)

    session.commit()
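The zero-based, open-ended to one-based, inclusive conversion performed in the loop above can be shown in isolation (the coordinate values are made up):

exonStarts = '1000,2000,'  # UCSC: zero-based, open-ended, trailing comma
exonEnds = '1500,2500,'
exon_starts = [int(i) + 1 for i in exonStarts.split(',') if i]  # [1001, 2001]
exon_stops = [int(i) for i in exonEnds.split(',') if i]         # [1500, 2500]
# The UCSC interval [1000, 1500) thus becomes 1001..1500, one-based inclusive.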
Example 12
    def addJob(self, email, queue, columns, job_type, argument=None,
               create_download_url=None):
        """
        Add a job to the Database and start the BatchChecker.

        @arg email:         e-mail address of batch supplier
        @type email:        unicode
        @arg queue:         A list of jobs
        @type queue:        list
        @arg columns:       The number of columns.
        @type columns:      int
        @arg job_type:      The type of Batch Job that should be run
        @type job_type:
        @arg argument:      Batch Arguments, for now only build info
        @type argument:
        @arg create_download_url: Function accepting a result_id and returning
                                  the URL for downloading the batch job
                                  result. Can be None.
        @type create_download_url: function

        @return: result_id
        @rtype:
        """
        # Add jobs to the database
        batch_job = BatchJob(job_type, email=email, argument=argument)
        if create_download_url:
            batch_job.download_url = create_download_url(batch_job.result_id)
        session.add(batch_job)

        for i, inputl in enumerate(queue):
            # NOTE:
            # This is a very dirty way to skip entries before they are fed
            # to the batch processes. This is needed for e.g. an empty line
            # or because the File Module noticed wrong formatting. These lines
            # used to be discarded but are now preserved by the escape string.
            # The benefit of this is that the user's input will match the
            # output in terms of input lines and output lines.
            if inputl.startswith("~!"):  # Dirty escape
                inputl = inputl[2:]
                if inputl:
                    flag = "S0"  # Flag for wrong format
                else:
                    flag = "S9"  # Flag for empty line
                    inputl = " "  # Database doesn't like an empty input field
            else:
                flag = None
            if (i + 1) % columns:
                # Add flag for continuing the current row
                flag = '%s%s' % (flag if flag else '', 'C0')

            item = BatchQueueItem(batch_job, inputl, flags=flag)
            session.add(item)

        session.commit()
        return batch_job.result_id
Example 13
def import_from_reference(assembly, reference):
    """
    Import transcript mappings from a genomic reference.

    .. todo:: Also report how much was added/updated.

    .. note:: Currently no exon locations are supported; this has only been
       tested on mtDNA.
    """
    chromosome = assembly.chromosomes.filter_by(name='chrM').one()

    output = Output(__file__)
    retriever = Retriever.GenBankRetriever(output)
    record = retriever.loadrecord(reference)

    if record.molType != 'm':
        raise ValueError('Only mitochondrial references are supported')

    select_transcript = len(record.geneList) > 1

    for gene in record.geneList:
        # We support exactly one transcript per gene.
        try:
            transcript = sorted(gene.transcriptList, key=attrgetter('name'))[0]
        except IndexError:
            continue

        # We use gene.location for now, it is always present and the same
        # for our purposes.
        #start, stop = transcript.mRNA.location[0], transcript.mRNA.location[1]
        start, stop = gene.location

        orientation = 'reverse' if gene.orientation == -1 else 'forward'

        try:
            cds = transcript.CDS.location
        except AttributeError:
            cds = None

        mapping = TranscriptMapping.create_or_update(
            chromosome,
            'refseq',
            record.source_accession,
            gene.name,
            orientation,
            start,
            stop, [start], [stop],
            'reference',
            cds=cds,
            select_transcript=select_transcript,
            version=int(record.source_version))
        session.add(mapping)

    session.commit()
Example 14
def hg19():
    """
    Fixture for GRCh37/hg19 genome assembly with chromosomes.
    """
    assembly = Assembly('GRCh37', 9606, 'Homo sapiens', alias='hg19')
    session.add(assembly)

    session.add_all(Chromosome(assembly, name, accession, organelle)
                    for accession, name, organelle in [
            ('NC_000001.10', 'chr1', 'nucleus'),
            ('NC_000002.11', 'chr2', 'nucleus'),
            ('NC_000003.11', 'chr3', 'nucleus'),
            ('NC_000004.11', 'chr4', 'nucleus'),
            ('NC_000005.9', 'chr5', 'nucleus'),
            ('NC_000006.11', 'chr6', 'nucleus'),
            ('NC_000007.13', 'chr7', 'nucleus'),
            ('NC_000008.10', 'chr8', 'nucleus'),
            ('NC_000009.11', 'chr9', 'nucleus'),
            ('NC_000010.10', 'chr10', 'nucleus'),
            ('NC_000011.9', 'chr11', 'nucleus'),
            ('NC_000012.11', 'chr12', 'nucleus'),
            ('NC_000013.10', 'chr13', 'nucleus'),
            ('NC_000014.8', 'chr14', 'nucleus'),
            ('NC_000015.9', 'chr15', 'nucleus'),
            ('NC_000016.9', 'chr16', 'nucleus'),
            ('NC_000017.10', 'chr17', 'nucleus'),
            ('NC_000018.9', 'chr18', 'nucleus'),
            ('NC_000019.9', 'chr19', 'nucleus'),
            ('NC_000020.10', 'chr20', 'nucleus'),
            ('NC_000021.8', 'chr21', 'nucleus'),
            ('NC_000022.10', 'chr22', 'nucleus'),
            ('NC_000023.10', 'chrX', 'nucleus'),
            ('NC_000024.9', 'chrY', 'nucleus'),
            ('NT_167244.1', 'chr6_apd_hap1', 'nucleus'),
            ('NT_113891.2', 'chr6_cox_hap2', 'nucleus'),
            ('NT_167245.1', 'chr6_dbb_hap3', 'nucleus'),
            ('NT_167246.1', 'chr6_mann_hap4', 'nucleus'),
            ('NT_167247.1', 'chr6_mcf_hap5', 'nucleus'),
            ('NT_167248.1', 'chr6_qbl_hap6', 'nucleus'),
            ('NT_167249.1', 'chr6_ssto_hap7', 'nucleus'),
            ('NT_167250.1', 'chr4_ctg9_hap1', 'nucleus'),
            ('NT_167251.1', 'chr17_ctg5_hap1', 'nucleus'),
            ('NC_012920.1', 'chrM', 'mitochondrion')])

    session.commit()
Example 15
def import_from_ucsc_by_gene(assembly, gene):
    """
    Import transcript mappings for a gene from the UCSC.
    """
    connection = MySQLdb.connect(user='******',
                                 host='genome-mysql.cse.ucsc.edu',
                                 db=assembly.alias,
                                 charset='utf8',
                                 use_unicode=True)

    query = """
        SELECT DISTINCT
          acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts,
          exonEnds, name2 AS geneName, chrom, strand, protAcc
        FROM gbStatus, refGene, refLink
        WHERE type = "mRNA"
        AND refGene.name = acc
        AND acc = mrnaAcc
        AND name2 = %s
    """
    parameters = (gene,)

    cursor = connection.cursor()
    cursor.execute(query, parameters)
    result = cursor.fetchall()
    cursor.close()

    # All ranges in the UCSC tables are zero-based and open-ended. We convert
    # this to one-based, inclusive for our database.

    for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds,
         geneName, chrom, strand, protAcc) in result:
        chromosome = assembly.chromosomes.filter_by(name=chrom).one()
        orientation = 'reverse' if strand == '-' else 'forward'
        exon_starts = [int(i) + 1 for i in exonStarts.split(',') if i]
        exon_stops = [int(i) for i in exonEnds.split(',') if i]
        if cdsStart and cdsEnd:
            cds = cdsStart + 1, cdsEnd
        else:
            cds = None
        mapping = TranscriptMapping.create_or_update(
            chromosome, 'refseq', acc, geneName, orientation, txStart + 1,
            txEnd, exon_starts, exon_stops, 'ucsc', cds=cds,
            version=int(version))
        session.add(mapping)

    session.commit()
Example 16
def import_from_reference(assembly, reference):
    """
    Import transcript mappings from a genomic reference.

    .. todo:: Also report how much was added/updated.

    .. note:: Currently no exon locations are supported; this has only been
       tested on mtDNA.
    """
    chromosome = assembly.chromosomes.filter_by(name='chrM').one()

    output = Output(__file__)
    retriever = Retriever.GenBankRetriever(output)
    record = retriever.loadrecord(reference)

    if record.molType != 'm':
        raise ValueError('Only mitochondrial references are supported')

    select_transcript = len(record.geneList) > 1

    for gene in record.geneList:
        # We support exactly one transcript per gene.
        try:
            transcript = sorted(gene.transcriptList, key=attrgetter('name'))[0]
        except IndexError:
            continue

        # We use gene.location for now, it is always present and the same
        # for our purposes.
        #start, stop = transcript.mRNA.location[0], transcript.mRNA.location[1]
        start, stop = gene.location

        orientation = 'reverse' if gene.orientation == -1 else 'forward'

        try:
            cds = transcript.CDS.location
        except AttributeError:
            cds = None

        mapping = TranscriptMapping.create_or_update(
            chromosome, 'refseq', record.source_accession, gene.name,
            orientation, start, stop, [start], [stop], 'reference', cds=cds,
            select_transcript=select_transcript,
            version=int(record.source_version))
        session.add(mapping)

    session.commit()
Example 17
def hg19():
    """
    Fixture for GRCh37/hg19 genome assembly with chromosomes.
    """
    assembly = Assembly('GRCh37', 9606, 'Homo sapiens', alias='hg19')
    session.add(assembly)

    session.add_all(Chromosome(assembly, name, accession, organelle)
                    for accession, name, organelle in [
            ('NC_000001.10', 'chr1', 'nucleus'),
            ('NC_000002.11', 'chr2', 'nucleus'),
            ('NC_000003.11', 'chr3', 'nucleus'),
            ('NC_000004.11', 'chr4', 'nucleus'),
            ('NC_000005.9', 'chr5', 'nucleus'),
            ('NC_000006.11', 'chr6', 'nucleus'),
            ('NC_000007.13', 'chr7', 'nucleus'),
            ('NC_000008.10', 'chr8', 'nucleus'),
            ('NC_000009.11', 'chr9', 'nucleus'),
            ('NC_000010.10', 'chr10', 'nucleus'),
            ('NC_000011.9', 'chr11', 'nucleus'),
            ('NC_000012.11', 'chr12', 'nucleus'),
            ('NC_000013.10', 'chr13', 'nucleus'),
            ('NC_000014.8', 'chr14', 'nucleus'),
            ('NC_000015.9', 'chr15', 'nucleus'),
            ('NC_000016.9', 'chr16', 'nucleus'),
            ('NC_000017.10', 'chr17', 'nucleus'),
            ('NC_000018.9', 'chr18', 'nucleus'),
            ('NC_000019.9', 'chr19', 'nucleus'),
            ('NC_000020.10', 'chr20', 'nucleus'),
            ('NC_000021.8', 'chr21', 'nucleus'),
            ('NC_000022.10', 'chr22', 'nucleus'),
            ('NC_000023.10', 'chrX', 'nucleus'),
            ('NC_000024.9', 'chrY', 'nucleus'),
            ('NT_167244.1', 'chr6_apd_hap1', 'nucleus'),
            ('NT_113891.2', 'chr6_cox_hap2', 'nucleus'),
            ('NT_167245.1', 'chr6_dbb_hap3', 'nucleus'),
            ('NT_167246.1', 'chr6_mann_hap4', 'nucleus'),
            ('NT_167247.1', 'chr6_mcf_hap5', 'nucleus'),
            ('NT_167248.1', 'chr6_qbl_hap6', 'nucleus'),
            ('NT_167249.1', 'chr6_ssto_hap7', 'nucleus'),
            ('NT_167250.1', 'chr4_ctg9_hap1', 'nucleus'),
            ('NT_167251.1', 'chr17_ctg5_hap1', 'nucleus'),
            ('NC_012920.1', 'chrM', 'mitochondrion')])

    session.commit()
Example 18
def update_transcript_protein_link(transcript_accession,
                                   protein_accession=None):
    """
    Update cached link between a transcript and a protein, or create it if it
    doesn't exist yet.
    """
    link = TranscriptProteinLink.query \
        .filter_by(transcript_accession=transcript_accession) \
        .first()

    if link is not None:
        link.protein_accession = protein_accession
        link.added = datetime.now()
    else:
        link = TranscriptProteinLink(transcript_accession, protein_accession)
        session.add(link)

    session.commit()
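A short usage sketch (the accession values are illustrative): the first call inserts the link, a later call overwrites it and refreshes the timestamp; passing protein_accession=None records a known negative result.

update_transcript_protein_link(u'NM_003002', u'NP_002993')  # insert
update_transcript_protein_link(u'NM_003002', None)  # overwrite later

Note that the query-then-update pattern above is not atomic; a unique constraint on transcript_accession is presumably what guarantees a single row per transcript under concurrent writers.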
Example 19
    def cache_with_references():
        for reference in references:
            entry = REFERENCES[reference]
            try:
                accession = entry['accession']
            except KeyError:
                accession = reference
            geninfo_id = entry.get('geninfo_id')

            path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                'data',
                                entry['filename'])
            shutil.copy(path, settings.CACHE_DIR)

            session.add(Reference(accession, entry['checksum'],
                                  geninfo_identifier=geninfo_id))

            for transcript, protein in entry.get('links', []):
                session.add(TranscriptProteinLink(transcript, protein))

        session.commit()
Example 20
    def cache_with_references():
        for reference in references:
            entry = REFERENCES[reference]
            try:
                accession = entry['accession']
            except KeyError:
                accession = reference
            geninfo_id = entry.get('geninfo_id')

            path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                'data', entry['filename'])
            shutil.copy(path, settings.CACHE_DIR)

            session.add(
                Reference(accession,
                          entry['checksum'],
                          geninfo_identifier=geninfo_id))

            for transcript, protein in entry.get('links', []):
                session.add(TranscriptProteinLink(transcript, protein))

        session.commit()
Example 21
    def _updateDBmd5(self, raw_data, name, GI):
        """
        Update the stored MD5 checksum of a reference and add the reference
        to the database if it is not known yet.

        @arg raw_data:
        @type raw_data:
        @arg name:
        @type name:
        @arg GI:
        @type GI:

        @return: filename
        @rtype: unicode
        """
        try:
            reference = Reference.query.filter_by(accession=name).one()
            currentmd5sum = reference.checksum
        except NoResultFound:
            currentmd5sum = None

        if currentmd5sum:
            md5sum = self._calcHash(raw_data)
            if md5sum != currentmd5sum:
                self._output.addMessage(__file__, -1, "WHASH",
                    "Warning: Hash of %s changed from %s to %s." % (
                    name, currentmd5sum, md5sum))
                Reference.query.filter_by(accession=name).update(
                    {'checksum': md5sum})
                session.commit()
        else:
            reference = Reference(name, self._calcHash(raw_data),
                                  geninfo_identifier=GI)
            session.add(reference)
            session.commit()
        return self._nametofile(name)
Example 22
def import_from_mapview_file(assembly, mapview_file, group_label):
    """
    Import transcript mappings from an NCBI mapview file.

    We require that this file is first sorted on the `feature_id` column
    (#11), which always contains the gene identifier, and then on the
    `chromosome` column (#2).

        sort -t $'\t' -k 11,11 -k 2,2 seq_gene.md > seq_gene.by_gene.md

    Raises :exc:`ValueError` if `mapview_file` is not sorted this way.

    The NCBI mapping file consists of entries, one per line, in order of
    their location in the genome (more specifically by start location).
    Every entry has a 'group_label' column, denoting the assembly it is
    from. We only use entries where this value is `group_label`.

    There are four types of entries (for our purposes):
    - Gene: Name, identifier, and location of a gene.
    - Transcript: Name, gene id, and location of a transcript.
    - UTR: Location and transcript of a non-coding exon (or part of it).
    - CDS: Location and transcript of a coding exon (or part of it).

    A bit troublesome for us is that exons are split in UTR exons and CDS
    exons, with exons overlapping the UTR/CDS border defined as two
    separate entries (one of type UTR and one of type CDS).

    Another minor annoyance is that some transcripts (~ 15) are split over
    two contigs (NT_*). In that case, they are defined by two entries in
    the file, where we should merge them by taking the start position of
    the first and the stop position of the second.

    To complicate this annoyance, some genes (e.g. in the PAR) are mapped
    on both the X and Y chromosomes, but stored in the file just like the
    transcripts split over two contigs. However, these ones should of
    course not be merged.

    Our strategy is to sort by gene and chromosome and process the file
    grouped by these two fields.

    For transcripts without any UTR and CDS entries (seems to happen for
    predicted genes), we generate one exon spanning the entire transcript.

    All positions are one-based, inclusive, and that is what we also use in
    our database.
    """
    columns = ['taxonomy', 'chromosome', 'start', 'stop', 'orientation',
               'contig', 'ctg_start', 'ctg_stop', 'ctg_orientation',
               'feature_name', 'feature_id', 'feature_type', 'group_label',
               'transcript', 'evidence_code']

    chromosomes = assembly.chromosomes.all()

    def read_records(mapview_file):
        for line in mapview_file:
            if line.startswith('#'):
                continue
            record = dict(zip(columns, line.rstrip().split('\t')))

            # Only use records from the given assembly.
            if record['group_label'] != group_label:
                continue

            # Only use records on chromosomes we know.
            try:
                record['chromosome'] = next(
                    c for c in chromosomes
                    if c.name == 'chr' + record['chromosome'])
            except StopIteration:
                continue

            record['start'] = int(record['start'])
            record['stop'] = int(record['stop'])

            yield record

    def build_mappings(records):
        # We structure the records per transcript and per record type. This is
        # generalized to a list of records for each type, but we expect only
        # one GENE record (with `-` as transcript value).
        # Note that there can be more than one RNA record per transcript if it
        # is split over different reference contigs.
        by_transcript = defaultdict(lambda: defaultdict(list))
        for r in records:
            by_transcript[r['transcript']][r['feature_type']].append(r)

        gene = by_transcript['-']['GENE'][0]['feature_name']

        for transcript, by_type in by_transcript.items():
            if transcript == '-':
                continue
            accession, version = transcript.split('.')
            version = int(version)
            chromosome = by_type['RNA'][0]['chromosome']
            orientation = ('reverse' if by_type['RNA'][0]['orientation'] == '-'
                           else 'forward')
            start = min(t['start'] for t in by_type['RNA'])
            stop = max(t['stop'] for t in by_type['RNA'])

            exon_starts = []
            exon_stops = []
            cds_positions = []
            for exon in sorted(by_type['UTR'] + by_type['CDS'],
                               key=itemgetter('start')):
                if exon_stops and exon_stops[-1] > exon['start'] - 1:
                    # This exon starts before the end of the previous exon. We
                    # have no idea what to do in this case, so we ignore it.
                    # The number of transcripts affected is very small (e.g.,
                    # NM_031860.1 and NM_001184961.1 in the GRCh37 assembly).
                    continue
                if exon['feature_type'] == 'CDS':
                    cds_positions.extend([exon['start'], exon['stop']])
                if exon_stops and exon_stops[-1] == exon['start'] - 1:
                    # This exon must be merged with the previous one because
                    # it is split over two entries (a CDS part and a UTR part
                    # or split over different reference contigs).
                    exon_stops[-1] = exon['stop']
                else:
                    exon_starts.append(exon['start'])
                    exon_stops.append(exon['stop'])

            if cds_positions:
                cds = min(cds_positions), max(cds_positions)
            else:
                cds = None

            # If no exons are annotated, we create one spanning the entire
            # transcript.
            if not exon_starts:
                exon_starts = [start]
                exon_stops = [stop]

            yield TranscriptMapping.create_or_update(
                chromosome, 'refseq', accession, gene, orientation, start,
                stop, exon_starts, exon_stops, 'ncbi', cds=cds,
                version=version)

    processed_keys = set()

    for key, records in groupby(read_records(mapview_file),
                                itemgetter('feature_id', 'chromosome')):
        if key in processed_keys:
            raise MapviewSortError('Mapview file must be sorted by feature_id '
                                   'and chromosome (try `sort -k 11,11 -k '
                                   '2,2`)')
        processed_keys.add(key)

        for mapping in build_mappings(records):
            session.add(mapping)

    session.commit()
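The adjacent-exon merge described in the docstring can be illustrated in isolation (the coordinates are made up): a UTR part ending at 199 followed directly by a CDS part starting at 200 is stitched back into one exon, exactly as in build_mappings above.

exon_starts, exon_stops = [100], [199]
exon = {'start': 200, 'stop': 350, 'feature_type': 'CDS'}
if exon_stops and exon_stops[-1] == exon['start'] - 1:
    exon_stops[-1] = exon['stop']  # merged: one exon spanning 100..350
assert (exon_starts, exon_stops) == ([100], [350])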
Example 23
    def retrieveslice(self, accno, start, stop, orientation):
        """
        Retrieve a slice of a chromosome.
        If the arguments are recognised (found in the internal database),
        we look if the associated file is still present and if so: return
        its UD number.
        If the arguments are recognised but no file was found, we download
        the new slice and update the hash (and log if the hash changes).
        If the arguments are not recognised, we download the new slice and
        make a new UD number.
        The content of the slice is placed in the cache with the UD number
        as filename.

        @arg accno: The accession number of the chromosome
        @type accno: unicode
        @arg start: Start position of the slice
        @type start: integer
        @arg stop: End position of the slice.
        @type stop: integer
        @arg orientation: Orientation of the slice:
            - 1 ; Forward
            - 2 ; Reverse complement
        @type orientation: integer

        @return: A UD number
        @rtype: unicode
        """

        # Not a valid slice.
        if start >= stop:
            return None

        # The slice cannot be too big.
        if stop - start > settings.MAX_FILE_SIZE:
            return None

        slice_orientation = ['forward', 'reverse'][orientation - 1]

        # Check whether we have seen this slice before.
        try:
            reference = Reference.query.filter_by(
                slice_accession=accno, slice_start=start, slice_stop=stop,
                slice_orientation=slice_orientation).one()
        except NoResultFound:
            reference = None
        else:
            # It's still present.
            if os.path.isfile(self._nametofile(reference.accession)):
                return reference.accession

        # It's not present, so download it.
        try:
            handle = Entrez.efetch(db='nuccore', rettype='gb', retmode='text',
                                   id=accno, seq_start=start, seq_stop=stop,
                                   strand=orientation)
            raw_data = handle.read()
            handle.close()
        except (IOError, urllib2.HTTPError, HTTPException) as e:
            self._output.addMessage(__file__, -1, 'INFO',
                                    'Error connecting to Entrez nuccore '
                                    'database: %s' % unicode(e))
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not retrieve slice.')
            return None

        # Calculate the hash of the downloaded file.
        md5sum = self._calcHash(raw_data)

        if reference is not None:  # We have seen this one before.
            currentmd5sum = reference.checksum

            if md5sum != currentmd5sum:
                self._output.addMessage(__file__, -1, "WHASH",
                    "Warning: Hash of %s changed from %s to %s." % (
                    reference.accession, currentmd5sum, md5sum))
                Reference.query.filter_by(
                    accession=reference.accession).update({'checksum': md5sum})
                session.commit()
        else:  # We haven't seen it before, so give it a name.
            UD = self._newUD()
            reference = Reference(UD, md5sum, slice_accession=accno,
                                  slice_start=start, slice_stop=stop,
                                  slice_orientation=slice_orientation)
            session.add(reference)
            session.commit()

        if self.write(raw_data, reference.accession, 0):
            return reference.accession
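A hedged usage sketch (`retriever` is an assumed instance of the class above; the coordinates are illustrative): request the forward strand of a slice of chromosome 11. Repeating the call with identical arguments returns the same UD number without contacting Entrez again, as long as the cached file is still present.

ud = retriever.retrieveslice(u'NC_000011.9', 111955524, 111966518, 1)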
Example 24
def hg19_transcript_mappings():
    """
    Fixture for some selected transcript mappings in the GRCh37/hg19 genome
    assembly. Depends on the :func:`hg19` fixture.
    """
    chromosome_1 = Chromosome.query.filter_by(accession='NC_000001.10').one()
    chromosome_3 = Chromosome.query.filter_by(accession='NC_000003.11').one()
    chromosome_6 = Chromosome.query.filter_by(accession='NC_000006.11').one()
    chromosome_7 = Chromosome.query.filter_by(accession='NC_000007.13').one()
    chromosome_8 = Chromosome.query.filter_by(accession='NC_000008.10').one()
    chromosome_11 = Chromosome.query.filter_by(accession='NC_000011.9').one()
    chromosome_20 = Chromosome.query.filter_by(accession='NC_000020.10').one()
    chromosome_22 = Chromosome.query.filter_by(accession='NC_000022.10').one()
    chromosome_x = Chromosome.query.filter_by(accession='NC_000023.10').one()
    chromosome_mt = Chromosome.query.filter_by(accession='NC_012920.1').one()

    session.add_all([chromosome_1, chromosome_6, chromosome_8, chromosome_11,
                     chromosome_20, chromosome_22, chromosome_mt])

    session.add(TranscriptMapping(
            chromosome_11,
            'refseq',
            'NM_003002',
            'SDHD',
            'forward',
            111957571,
            111966518,
            [111957571, 111958581, 111959591, 111965529],
            [111957683, 111958697, 111959735, 111966518],
            'ncbi',
            transcript=1,
            cds=(111957632, 111965694),
            select_transcript=False,
            version=2))
    session.add(TranscriptMapping(
            chromosome_11,
            'refseq',
            'NM_012459',
            'TIMM8B',
            'reverse',
            111955524,
            111957522,
            [111955524, 111957364],
            [111956186, 111957522],
            'ncbi',
            transcript=1,
            cds=(111956019, 111957492),
            select_transcript=False,
            version=2))
    session.add(TranscriptMapping(
            chromosome_11,
            'refseq',
            'NR_028383',
            'TIMM8B',
            'reverse',
            111955524,
            111957522,
            [111955524, 111956702, 111957364],
            [111956186, 111957034, 111957522],
            'ncbi',
            transcript=1,
            cds=None,
            select_transcript=False,
            version=1))
    session.add(TranscriptMapping(
            chromosome_6,
            'refseq',
            'NM_000500',
            'CYP21A2',
            'forward',
            32006082,
            32009419,
            [32006082, 32006499, 32006871, 32007133, 32007323, 32007526,
             32007782, 32008183, 32008445, 32008646],
            [32006401, 32006588, 32007025, 32007234, 32007424, 32007612,
             32007982, 32008361, 32008548, 32009419],
            'ncbi',
            transcript=1,
            cds=(32006200, 32008911),
            select_transcript=False,
            version=5))
    session.add(TranscriptMapping(
            chromosome_22,
            'refseq',
            'NM_001145134',
            'CPT1B',
            'reverse',
            51007290,
            51017096,
            [51007290, 51007765, 51008005, 51008722, 51009320, 51009587,
             51009804, 51010435, 51010632, 51011304, 51011949, 51012764,
             51012922, 51014464, 51014627, 51015286, 51015753, 51016204,
             51016978],
            [51007510, 51007850, 51008097, 51008835, 51009472, 51009721,
             51009968, 51010551, 51010737, 51011489, 51012144, 51012848,
             51013029, 51014541, 51014764, 51015463, 51015892, 51016363,
             51017096],
            'ncbi',
            transcript=1,
            cds=(51007767, 51016344),
            select_transcript=False,
            version=1))
    session.add(TranscriptMapping(
            chromosome_22,
            'refseq',
            'NR_021492',
            'LOC100144603',
            'forward',
            51021455,
            51022356,
            [51021455, 51022027],
            [51021752, 51022356],
            'ncbi',
            transcript=1,
            cds=None,
            select_transcript=False,
            version=1))
    session.add(TranscriptMapping(
            chromosome_1,
            'refseq',
            'NM_001007553',
            'CSDE1',
            'reverse',
            115259538,
            115300624,
            [115259538, 115261234, 115262200, 115263160, 115266504, 115267842,
             115268832, 115269604, 115272879, 115273129, 115275225, 115276353,
             115276610, 115277063, 115279379, 115280092, 115280584, 115282313,
             115292442, 115300546],
            [115260837, 115261366, 115262363, 115263338, 115266623, 115267954,
             115269007, 115269711, 115273043, 115273269, 115275437, 115276478,
             115276738, 115277144, 115279476, 115280184, 115280693, 115282511,
             115292828, 115300624],
            'ncbi',
            transcript=1,
            cds=(115260790, 115282511),
            select_transcript=False,
            version=1))
    session.add(TranscriptMapping(
            chromosome_1,
            'refseq',
            'NM_001130523',
            'CSDE1',
            'reverse',
            115259538,
            115300671,
            [115259538, 115261234, 115262200, 115263160, 115266504, 115267842,
             115268832, 115269604, 115272879, 115273129, 115275225, 115276353,
             115276610, 115277063, 115279379, 115280584, 115282313, 115284148,
             115292442, 115300546],
            [115260837, 115261366, 115262363, 115263338, 115266623, 115267954,
             115269007, 115269711, 115273043, 115273269, 115275437, 115276478,
             115276738, 115277144, 115279476, 115280693, 115282511, 115284294,
             115292828, 115300671],
            'ncbi',
            transcript=1,
            cds=(115260790, 115284285),
            select_transcript=False,
            version=1))
    session.add(TranscriptMapping(
            chromosome_1,
            'refseq',
            'NM_002241',
            'KCNJ10',
            'reverse',
            160007257,
            160040051,
            [160007257, 160039812],
            [160012322, 160040051],
            'ncbi',
            transcript=1,
            cds=(160011183, 160012322),
            select_transcript=False,
            version=4))
    session.add(TranscriptMapping(
            chromosome_20,
            'refseq',
            'NM_001162505',
            'TMEM189',
            'reverse',
            48740274,
            48770335,
            [48740274, 48744512, 48746083, 48747402, 48760039, 48770054],
            [48741716, 48744724, 48746227, 48747484, 48760158, 48770335],
            'ncbi',
            transcript=1,
            cds=(48741595, 48770174),
            select_transcript=False,
            version=1))
    session.add(TranscriptMapping(
            chromosome_8,
            'refseq',
            'NM_017780',
            'CHD7',
            'forward',
            61591339,
            61779465,
            [61591339, 61653818, 61693559, 61707545, 61712947, 61714087,
             61720776, 61728946, 61732566, 61734349, 61734583, 61735062,
             61736399, 61741222, 61742881, 61748632, 61749376, 61750227,
             61750635, 61754203, 61754406, 61757423, 61757809, 61761074,
             61761610, 61763052, 61763591, 61763821, 61764578, 61765057,
             61765388, 61766922, 61768534, 61769004, 61773463, 61774755,
             61775107, 61777575],
            [61591641, 61655656, 61693989, 61707686, 61713084, 61714152,
             61720831, 61729060, 61732649, 61734486, 61734704, 61735305,
             61736575, 61741365, 61743136, 61748842, 61749571, 61750394,
             61750814, 61754313, 61754611, 61757622, 61757968, 61761163,
             61761713, 61763181, 61763663, 61763878, 61764806, 61765265,
             61766059, 61767082, 61768761, 61769447, 61773684, 61774895,
             61775211, 61779465],
            'ncbi',
            transcript=1,
            cds=(61653992, 61778492),
            select_transcript=False,
            version=2))
    session.add(TranscriptMapping(
            chromosome_mt,
            'refseq',
            'NC_012920',
            'ND4',
            'forward',
            10760,
            12137,
            [10760],
            [12137],
            'reference',
            transcript=1,
            cds=(10760, 12137),
            select_transcript=True,
            version=1))
    session.add(TranscriptMapping(
            chromosome_1,
            'refseq',
            'NM_002001',
            'FCER1A',
            'forward',
            159259504,
            159278014,
            [159259504, 159272096, 159272644, 159273718, 159275778, 159277538],
            [159259543, 159272209, 159272664, 159273972, 159276035, 159278014],
            'ncbi',
            transcript=1,
            cds=(159272155, 159277722),
            select_transcript=False,
            version=2))
    session.add(TranscriptMapping(
            chromosome_7,
            'refseq',
            'XM_001715131',
            'LOC100132858',
            'reverse',
            19828,
            36378,
            [19828, 20834, 31060, 32957, 35335, 36224],
            [19895, 21029, 31437, 33107, 35541, 36378],
            'ncbi',
            transcript=1,
            cds=(19828, 36378),
            select_transcript=False,
            version=2))
    session.add(TranscriptMapping(
            chromosome_x,
            'refseq',
            'NM_004011',
            'DMD',
            'reverse',
            31137345,
            32430371,
            [31137345, 31144759, 31152219, 31164408, 31165392, 31187560,
             31190465, 31191656, 31196049, 31196786, 31198487, 31200855,
             31222078, 31224699, 31227615, 31241164, 31279072, 31341715,
             31366673, 31462598, 31496223, 31497100, 31514905, 31525398,
             31645790, 31676107, 31697492, 31747748, 31792077, 31838092,
             31854835, 31893305, 31947713, 31950197, 31986456, 32235033,
             32305646, 32328199, 32360217, 32361251, 32364060, 32366523,
             32380905, 32382699, 32383137, 32398627, 32404427, 32407618,
             32408188, 32429869, 32430279],
            [31140047, 31144790, 31152311, 31164531, 31165635, 31187718,
             31190530, 31191721, 31196087, 31196922, 31198598, 31201021,
             31222235, 31224784, 31227816, 31241238, 31279133, 31341775,
             31366751, 31462744, 31496491, 31497220, 31515061, 31525570,
             31645979, 31676261, 31697703, 31747865, 31792309, 31838200,
             31854936, 31893490, 31947862, 31950344, 31986631, 32235180,
             32305818, 32328393, 32360399, 32361403, 32364197, 32366645,
             32381075, 32382827, 32383316, 32398797, 32404582, 32407791,
             32408298, 32430030, 32430371],
            'ncbi',
            transcript=1,
            cds=(31140036, 32430326),
            select_transcript=False,
            version=3))
    session.add(TranscriptMapping(
            chromosome_x,
            'refseq',
            'NM_004019',
            'DMD',
            'reverse',
            31196312,
            31285024,
            [31196312, 31198487, 31200855, 31222078, 31224699, 31227615,
             31241164, 31279072, 31284927],
            [31196922, 31198598, 31201021, 31222235, 31224784, 31227816,
             31241238, 31279133, 31285024],
            'ncbi',
            transcript=1,
            cds=(31196782, 31284946),
            select_transcript=False,
            version=2))
    session.add(TranscriptMapping(
            chromosome_x,
            'refseq',
            'NM_004007',
            'DMD',
            'reverse',
            31137345,
            33038317,
            [31137345, 31144759, 31152219, 31164408, 31165392, 31187560,
             31190465, 31191656, 31196049, 31196786, 31198487, 31200855,
             31222078, 31224699, 31227615, 31241164, 31279072, 31341715,
             31366673, 31462598, 31496223, 31497100, 31514905, 31525398,
             31645790, 31676107, 31697492, 31747748, 31792077, 31838092,
             31854835, 31893305, 31947713, 31950197, 31986456, 32235033,
             32305646, 32328199, 32360217, 32361251, 32364060, 32366523,
             32380905, 32382699, 32383137, 32398627, 32404427, 32407618,
             32408188, 32429869, 32456358, 32459297, 32466573, 32472779,
             32481556, 32482703, 32486615, 32490281, 32503036, 32509394,
             32519872, 32536125, 32563276, 32583819, 32591647, 32591862,
             32613874, 32632420, 32662249, 32663081, 32715987, 32717229,
             32827610, 32834585, 32841412, 32862900, 32867845, 33038256],
            [31140047, 31144790, 31152311, 31164531, 31165635, 31187718,
             31190530, 31191721, 31196087, 31196922, 31198598, 31201021,
             31222235, 31224784, 31227816, 31241238, 31279133, 31341775,
             31366751, 31462744, 31496491, 31497220, 31515061, 31525570,
             31645979, 31676261, 31697703, 31747865, 31792309, 31838200,
             31854936, 31893490, 31947862, 31950344, 31986631, 32235180,
             32305818, 32328393, 32360399, 32361403, 32364197, 32366645,
             32381075, 32382827, 32383316, 32398797, 32404582, 32407791,
             32408298, 32430030, 32456507, 32459431, 32466755, 32472949,
             32481711, 32482816, 32486827, 32490426, 32503216, 32509635,
             32519959, 32536248, 32563451, 32583998, 32591754, 32591963,
             32613993, 32632570, 32662430, 32663269, 32716115, 32717410,
             32827728, 32834757, 32841504, 32862977, 32867937, 33038317],
            'ncbi',
            transcript=1,
            cds=(31140036, 32834745),
            select_transcript=False,
            version=2))
    session.add(TranscriptMapping(
            chromosome_x,
            'refseq',
            'NM_203473',
            'PORCN',
            'forward',
            48367371,
            48379202,
            [48367371, 48368172, 48369683, 48370280, 48370714, 48370977,
             48371223, 48372628, 48372913, 48374105, 48374278, 48374449,
             48375571, 48378763],
            [48367491, 48368344, 48369875, 48370323, 48370895, 48371107,
             48371240, 48372753, 48373013, 48374181, 48374341, 48374534,
             48375681, 48379202],
            'ncbi',
            transcript=1,
            cds=(48368209, 48378864),
            select_transcript=False,
            version=1))
    session.add(TranscriptMapping(
            chromosome_x,
            'refseq',
            'NM_000132',
            'F8',
            'reverse',
            154064063,
            154250998,
            [154064063, 154088707, 154089993, 154091358, 154124352, 154128141,
             154129646, 154130326, 154132181, 154132571, 154133086, 154134695,
             154156846, 154175973, 154182167, 154185232, 154189350, 154194245,
             154194701, 154197606, 154212962, 154215512, 154221211, 154225248,
             154227754, 154250685],
            [154066027, 154088883, 154090141, 154091502, 154124507, 154128226,
             154129717, 154130442, 154132363, 154132799, 154133298, 154134848,
             154159951, 154176182, 154182317, 154185446, 154189443, 154194416,
             154194962, 154197827, 154213078, 154215580, 154221423, 154225370,
             154227875, 154250998],
            'ncbi',
            transcript=1,
            cds=(154065872, 154250827),
            select_transcript=False,
            version=3))
    session.add(TranscriptMapping(
            chromosome_3,
            'refseq',
            'NM_000249',
            'MLH1',
            'forward',
            37034841,
            37092337,
            [37034841, 37038110, 37042446, 37045892, 37048482, 37050305,
             37053311, 37053502, 37055923, 37058997, 37061801, 37067128,
             37070275, 37081677, 37083759, 37089010, 37090008, 37090395,
             37091977],
            [37035154, 37038200, 37042544, 37045965, 37048554, 37050396,
             37053353, 37053590, 37056035, 37059090, 37061954, 37067498,
             37070423, 37081785, 37083822, 37089174, 37090100, 37090508,
             37092337],
            'ncbi',
            transcript=1,
            cds=(37035039, 37092144),
            select_transcript=False,
            version=3))

    session.commit()
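
A quick sanity-check sketch for the fixture above, assuming `TranscriptMapping` exposes the same Flask-SQLAlchemy-style `query` attribute used for `Reference` and `Chromosome` in these examples (the attribute names are assumptions, not confirmed API):

# Hypothetical test sketch: look one fixture row up again.
mapping = TranscriptMapping.query.filter_by(accession='NM_004011').one()
assert mapping.gene == 'DMD'
assert mapping.orientation == 'reverse'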
Example no. 28
    def downloadrecord(self, url, name=None):
        """
        Download an LRG record from a URL.

        :arg unicode url: Location of the LRG record.

        :returns: The full path to the file or None in case of failure.
        :rtype: unicode
        """
        lrg_id = name or os.path.splitext(os.path.split(url)[1])[0]
        # if not lrg_id.startswith('LRG'):
        #     return None
        filename = self._name_to_file(lrg_id)

        # TODO: Properly read the file contents to a unicode string and write
        # it utf-8 encoded.
        handle = urllib2.urlopen(url)
        info = handle.info()

        if (info['Content-Type'] == 'application/xml' and
                'Content-length' in info):
            # Looks like a valid LRG file.

            length = int(info['Content-Length'])
            if 512 < length < settings.MAX_FILE_SIZE:
                raw_data = handle.read()
                handle.close()

                # Do an md5 check.
                md5sum = self._calculate_hash(raw_data)
                try:
                    reference = Reference.query.filter_by(
                        accession=lrg_id).one()
                    md5_db = reference.checksum
                except NoResultFound:
                    md5_db = None

                if md5_db is None:
                    reference = Reference(lrg_id, md5sum, download_url=url)
                    session.add(reference)
                    session.commit()
                elif md5sum != md5_db:
                    # Hash has changed for the LRG ID.
                    self._output.addMessage(
                        __file__, -1, 'WHASH',
                        'Warning: Hash of {} changed from {} to {}.'.format(
                            lrg_id, md5_db, md5sum))
                    Reference.query.filter_by(accession=lrg_id).update(
                        {'checksum': md5sum})
                    session.commit()
                else:
                    # Hash the same as in db.
                    pass

                if not os.path.isfile(filename):
                    return self.write(raw_data, lrg_id)
                else:
                    # This can only occur if synchronous calls to mutalyzer
                    # are made to recover a file that did not exist. Still
                    # leaves a race window between the check and the write.
                    return filename
            else:
                self._output.addMessage(
                    __file__, 4, 'EFILESIZE',
                    'Filesize is not within the allowed boundaries.')
        else:
            self._output.addMessage(
                __file__, 4, 'ERECPARSE', 'This is not an LRG record.')
        handle.close()
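
A minimal usage sketch for `downloadrecord`, assuming an instance of the (unshown) retriever class is available as `retriever`; the URL points at the public EBI LRG area but is only illustrative here:

# Hypothetical usage: fetch an LRG record and report where it was cached.
url = 'ftp://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_1.xml'
filename = retriever.downloadrecord(url)
if filename is None:
    print 'Download failed (content-type or size check rejected the record).'
else:
    print 'LRG record cached at %s' % filename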
Example no. 29
def import_from_lrgmap_file(assembly, lrgmap_file):
    """
    Import transcript mappings from an EBI LRG transcripts map file.

    All positions are one-based, inclusive, and that is what we also use in
    our database.
    """
    columns = ['transcript', 'gene', 'chromosome', 'strand', 'start', 'stop',
               'exons', 'protein', 'cds_start', 'cds_stop']

    chromosomes = assembly.chromosomes.all()

    def read_mappings(lrgmap_file):
        for line in lrgmap_file:
            if line.startswith('#'):
                continue
            record = dict(zip(columns, line.rstrip('\r\n').split('\t')))

            record['start'] = int(record['start'])
            record['stop'] = int(record['stop'])
            try:
                record['cds_start'] = int(record['cds_start'])
            except ValueError:
                record['cds_start'] = None
            try:
                record['cds_stop'] = int(record['cds_stop'])
            except ValueError:
                record['cds_stop'] = None
            record['exons'] = [[int(pos) for pos in exon.split('-')]
                               for exon in record['exons'].split(',')]

            try:
                yield build_mapping(record)
            except ValueError:
                pass

    def build_mapping(record):
        # Only use records on chromosomes we know.
        try:
            chromosome = next(c for c in chromosomes if
                              c.name == 'chr' + record['chromosome'])
        except StopIteration:
            raise ValueError()

        accession, transcript = record['transcript'].split('t')
        transcript = int(transcript)

        orientation = 'reverse' if record['strand'] == '-1' else 'forward'

        if record['cds_start']:
            cds = record['cds_start'], record['cds_stop']
        else:
            cds = None

        # TODO: Also take protein into account. For example, in LRG_321 (TP53)
        # some transcripts occur twice (with different CDSs and different
        # protein numbers).
        # https://github.com/mutalyzer/mutalyzer/issues/372
        return TranscriptMapping.create_or_update(
            chromosome, 'lrg', accession, record['gene'], orientation,
            record['start'], record['stop'],
            [start for start, _ in record['exons']],
            [stop for _, stop in record['exons']],
            'ebi', transcript=transcript, cds=cds, select_transcript=True)

    for mapping in read_mappings(lrgmap_file):
        session.add(mapping)

    session.commit()
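
A sketch of the input `import_from_lrgmap_file` expects, with one made-up record (coordinates, exons and CDS are illustrative, not real EBI data); it assumes `assembly` is a fixture like the ones above and knows a chromosome named 'chr17':

import io

# Tab-separated columns: transcript, gene, chromosome, strand, start, stop,
# exons, protein, cds_start, cds_stop (all values below are made up).
line = u'LRG_1t1\tCOL1A1\t17\t-1\t1000\t9000\t1000-1400,2000-2600,8500-9000\tLRG_1p1\t1200\t8800\n'
import_from_lrgmap_file(assembly, io.StringIO(u'# comment lines are skipped\n' + line))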
Example no. 30
    def downloadrecord(self, url, name=None):
        """
        Download an LRG record from a URL.

        :arg unicode url: Location of the LRG record.

        :returns: The full path to the file or None in case of failure.
        :rtype: unicode
        """
        lrg_id = name or os.path.splitext(os.path.split(url)[1])[0]
        # if not lrg_id.startswith('LRG'):
        #     return None
        filename = self._name_to_file(lrg_id)

        # TODO: Properly read the file contents to a unicode string and write
        # it utf-8 encoded.
        handle = urllib2.urlopen(url)
        info = handle.info()

        if (info['Content-Type'] == 'application/xml'
                and 'Content-length' in info):
            # Looks like a valid LRG file.

            length = int(info['Content-Length'])
            if 512 < length < settings.MAX_FILE_SIZE:
                raw_data = handle.read()
                handle.close()

                # Do an md5 check.
                md5sum = self._calculate_hash(raw_data)
                try:
                    reference = Reference.query.filter_by(
                        accession=lrg_id).one()
                    md5_db = reference.checksum
                except NoResultFound:
                    md5_db = None

                if md5_db is None:
                    # Note: The abstraction seems a bit off here, but we
                    # prefer to set `Reference.source` to `lrg` and not to
                    # `url`, since the former is more specific.
                    reference = Reference(lrg_id, md5sum, 'lrg')
                    session.add(reference)
                    session.commit()
                elif md5sum != md5_db:
                    # Hash has changed for the LRG ID.
                    self._output.addMessage(
                        __file__, -1, 'WHASH',
                        'Warning: Hash of {} changed from {} to {}.'.format(
                            lrg_id, md5_db, md5sum))
                    Reference.query.filter_by(accession=lrg_id).update(
                        {'checksum': md5sum})
                    session.commit()
                else:
                    # Hash the same as in db.
                    pass

                if not os.path.isfile(filename):
                    return self.write(raw_data, lrg_id)
                else:
                    # This can only occur if synchronous calls to mutalyzer
                    # are made to recover a file that did not exist. Still
                    # leaves a race window between the check and the write.
                    return filename
            else:
                self._output.addMessage(
                    __file__, 4, 'EFILESIZE',
                    'Filesize is not within the allowed boundaries.')
        else:
            self._output.addMessage(__file__, 4, 'ERECPARSE',
                                    'This is not an LRG record.')
        handle.close()
Example no. 31
    def downloadrecord(self, url, name = None) :
        """
        Download an LRG record from a URL.

        @arg url: Location of the LRG record
        @type url: unicode

        @return:
            - filename    ; The full path to the file
            - None        ; in case of failure
        @rtype: unicode
        """

        lrgID = name or os.path.splitext(os.path.split(url)[1])[0]
        #if not lrgID.startswith("LRG"):
        #    return None
        filename = self._nametofile(lrgID)

        # Todo: Properly read the file contents to a unicode string and write
        #   it utf-8 encoded.
        handle = urllib2.urlopen(url)
        info = handle.info()
        if info["Content-Type"] == "application/xml" and info.has_key("Content-length"):

            length = int(info["Content-Length"])
            if 512 < length < settings.MAX_FILE_SIZE:
                raw_data = handle.read()
                handle.close()

                #Do an md5 check
                md5sum = self._calcHash(raw_data)
                try:
                    reference = Reference.query.filter_by(accession=lrgID).one()
                    md5db = reference.checksum
                except NoResultFound:
                    md5db = None

                if md5db is None:
                    reference = Reference(lrgID, md5sum, download_url=url)
                    session.add(reference)
                    session.commit()
                elif md5sum != md5db:       #hash has changed for the LRG ID
                    self._output.addMessage(__file__, -1, "WHASH",
                        "Warning: Hash of %s changed from %s to %s." % (
                        lrgID, md5db, md5sum))
                    Reference.query.filter_by(accession=lrgID).update({'checksum': md5sum})
                    session.commit()
                else:                       #hash the same as in db
                    pass

                if not os.path.isfile(filename) :
                    return self.write(raw_data, lrgID)
                else:
                    # This can only occur if synchronous calls to mutalyzer
                    # are made to recover a file that did not exist. Still
                    # leaves a race window between the check and the write.
                    return filename
            #if
            else :
                self._output.addMessage(__file__, 4, "EFILESIZE",
                    "Filesize is not within the allowed boundaries.")
        #if
        else :
            self._output.addMessage(__file__, 4, "ERECPARSE",
                                     "This is not an LRG record.")
        handle.close()
Example no. 32
def import_from_lrgmap_file(assembly, lrgmap_file):
    """
    Import transcript mappings from an EBI LRG transcripts map file.

    All positions are one-based, inclusive, and that is what we also use in
    our database.
    """
    columns = [
        'transcript', 'gene', 'chromosome', 'strand', 'start', 'stop', 'exons',
        'protein', 'cds_start', 'cds_stop'
    ]

    chromosomes = assembly.chromosomes.all()

    def read_mappings(lrgmap_file):
        for line in lrgmap_file:
            if line.startswith('#'):
                continue
            record = dict(zip(columns, line.rstrip('\r\n').split('\t')))

            record['start'] = int(record['start'])
            record['stop'] = int(record['stop'])
            try:
                record['cds_start'] = int(record['cds_start'])
            except ValueError:
                record['cds_start'] = None
            try:
                record['cds_stop'] = int(record['cds_stop'])
            except ValueError:
                record['cds_stop'] = None
            record['exons'] = [[int(pos) for pos in exon.split('-')]
                               for exon in record['exons'].split(',')]

            try:
                yield build_mapping(record)
            except ValueError:
                pass

    def build_mapping(record):
        # Only use records on chromosomes we know.
        try:
            chromosome = next(c for c in chromosomes
                              if c.name == 'chr' + record['chromosome'])
        except StopIteration:
            raise ValueError()

        accession, transcript = record['transcript'].split('t')
        transcript = int(transcript)

        orientation = 'reverse' if record['strand'] == '-1' else 'forward'

        if record['cds_start']:
            cds = record['cds_start'], record['cds_stop']
        else:
            cds = None

        # TODO: Also take protein into account. For example, in LRG_321 (TP53)
        # some transcripts occur twice (with different CDSs and different
        # protein numbers).
        # https://github.com/mutalyzer/mutalyzer/issues/372
        return TranscriptMapping.create_or_update(
            chromosome,
            'lrg',
            accession,
            record['gene'],
            orientation,
            record['start'],
            record['stop'], [start for start, _ in record['exons']],
            [stop for _, stop in record['exons']],
            'ebi',
            transcript=transcript,
            cds=cds,
            select_transcript=True)

    for mapping in read_mappings(lrgmap_file):
        session.add(mapping)

    session.commit()
Example no. 33
    def retrieveslice(self, accno, start, stop, orientation):
        """
        Retrieve a slice of a chromosome.
        If the arguments are recognised (found in the internal database),
        we look if the associated file is still present and if so: return
        its UD number.
        If the arguments are recognised but no file was found, we download
        the new slice and update the hash (and log if the hash changes).
        If the arguments are not recognised, we download the new slice and
        make a new UD number.
        The content of the slice is placed in the cache with the UD number
        as filename.

        :arg unicode accno: The accession number of the chromosome.
        :arg int start: Start position of the slice (one-based, inclusive, in
          reference orientation).
        :arg int stop: End position of the slice (one-based, inclusive, in
          reference orientation).
        :arg int orientation: Orientation of the slice:
            - 1 ; Forward.
            - 2 ; Reverse complement.

        :returns: A UD number.
        :rtype: unicode
        """
        # Not a valid slice.
        if start > stop:
            self._output.addMessage(
                __file__, 4, 'ERETR', 'Could not retrieve slice for start '
                'position greater than stop position.')
            return None

        # The slice must not be too big.
        if stop - start + 1 > settings.MAX_FILE_SIZE:
            self._output.addMessage(
                __file__, 4, 'ERETR', 'Could not retrieve slice (request '
                'exceeds maximum of %d bases)' % settings.MAX_FILE_SIZE)
            return None

        # Value of the Reference.source_data field for this slice.
        source_data = '{}:{}:{}:{}'.format(accno, start, stop,
                                           ['forward',
                                            'reverse'][orientation - 1])

        # Check whether we have seen this slice before.
        reference = Reference.query.filter_by(source='ncbi_slice',
                                              source_data=source_data).first()
        if reference and os.path.isfile(self._name_to_file(
                reference.accession)):
            # It's still present.
            return reference.accession

        # It's not present, so download it.
        try:
            # EFetch `seq_start` and `seq_stop` are one-based, inclusive, and
            # in reference orientation.
            handle = Entrez.efetch(db='nuccore',
                                   rettype='gbwithparts',
                                   retmode='text',
                                   id=accno,
                                   seq_start=start,
                                   seq_stop=stop,
                                   strand=orientation)
            raw_data = handle.read()
            handle.close()
        except (IOError, urllib2.HTTPError, HTTPException) as e:
            self._output.addMessage(
                __file__, -1, 'INFO',
                'Error connecting to Entrez nuccore database: {}'.format(
                    unicode(e)))
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not retrieve slice.')
            return None

        # Calculate the hash of the downloaded file.
        md5sum = self._calculate_hash(raw_data)

        if reference is not None:
            # We have seen this one before.
            current_md5sum = reference.checksum

            if md5sum != current_md5sum:
                self._output.addMessage(
                    __file__, -1, 'WHASH',
                    'Warning: Hash of {} changed from {} to {}.'.format(
                        reference.accession, current_md5sum, md5sum))
                Reference.query.filter_by(
                    accession=reference.accession).update({'checksum': md5sum})
                session.commit()
        else:
            # We haven't seen it before, so give it a name.
            ud = self._new_ud()
            reference = Reference(ud,
                                  md5sum,
                                  source='ncbi_slice',
                                  source_data=source_data)
            session.add(reference)
            session.commit()

        if self.write(raw_data, reference.accession, 0):
            return reference.accession
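
A hedged usage sketch for `retrieveslice`; `retriever` again stands in for the owning instance, and the coordinates simply bracket the SDHD region used in the fixtures:

# Hypothetical usage: cache a forward slice of chromosome 11 and get its
# UD accession back (orientation 1 = forward, 2 = reverse complement).
ud = retriever.retrieveslice('NC_000011.9', 111955000, 111967000, 1)
if ud is not None:
    print 'Slice cached under accession %s' % ud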
Example no. 35
def hg19_transcript_mappings():
    """
    Fixture for some selected transcript mappings in the GRCh37/hg19 genome
    assembly. Depends on the :func:`hg19` fixture.
    """
    chromosome_1 = Chromosome.query.filter_by(accession='NC_000001.10').one()
    chromosome_3 = Chromosome.query.filter_by(accession='NC_000003.11').one()
    chromosome_6 = Chromosome.query.filter_by(accession='NC_000006.11').one()
    chromosome_7 = Chromosome.query.filter_by(accession='NC_000007.13').one()
    chromosome_8 = Chromosome.query.filter_by(accession='NC_000008.10').one()
    chromosome_11 = Chromosome.query.filter_by(accession='NC_000011.9').one()
    chromosome_20 = Chromosome.query.filter_by(accession='NC_000020.10').one()
    chromosome_22 = Chromosome.query.filter_by(accession='NC_000022.10').one()
    chromosome_x = Chromosome.query.filter_by(accession='NC_000023.10').one()
    chromosome_mt = Chromosome.query.filter_by(accession='NC_012920.1').one()

    session.add_all([
        chromosome_1, chromosome_6, chromosome_8, chromosome_11, chromosome_20,
        chromosome_22, chromosome_mt
    ])

    session.add(
        TranscriptMapping(chromosome_11,
                          'refseq',
                          'NM_003002',
                          'SDHD',
                          'forward',
                          111957571,
                          111966518,
                          [111957571, 111958581, 111959591, 111965529],
                          [111957683, 111958697, 111959735, 111966518],
                          'ncbi',
                          transcript=1,
                          cds=(111957632, 111965694),
                          select_transcript=False,
                          version=2))
    session.add(
        TranscriptMapping(chromosome_11,
                          'refseq',
                          'NM_012459',
                          'TIMM8B',
                          'reverse',
                          111955524,
                          111957522, [111955524, 111957364],
                          [111956186, 111957522],
                          'ncbi',
                          transcript=1,
                          cds=(111956019, 111957492),
                          select_transcript=False,
                          version=2))
    session.add(
        TranscriptMapping(chromosome_11,
                          'refseq',
                          'NR_028383',
                          'TIMM8B',
                          'reverse',
                          111955524,
                          111957522, [111955524, 111956702, 111957364],
                          [111956186, 111957034, 111957522],
                          'ncbi',
                          transcript=1,
                          cds=None,
                          select_transcript=False,
                          version=1))
    session.add(
        TranscriptMapping(chromosome_6,
                          'refseq',
                          'NM_000500',
                          'CYP21A2',
                          'forward',
                          32006082,
                          32009419, [
                              32006082, 32006499, 32006871, 32007133, 32007323,
                              32007526, 32007782, 32008183, 32008445, 32008646
                          ], [
                              32006401, 32006588, 32007025, 32007234, 32007424,
                              32007612, 32007982, 32008361, 32008548, 32009419
                          ],
                          'ncbi',
                          transcript=1,
                          cds=(32006200, 32008911),
                          select_transcript=False,
                          version=5))
    session.add(
        TranscriptMapping(chromosome_22,
                          'refseq',
                          'NM_001145134',
                          'CPT1B',
                          'reverse',
                          51007290,
                          51017096, [
                              51007290, 51007765, 51008005, 51008722, 51009320,
                              51009587, 51009804, 51010435, 51010632, 51011304,
                              51011949, 51012764, 51012922, 51014464, 51014627,
                              51015286, 51015753, 51016204, 51016978
                          ], [
                              51007510, 51007850, 51008097, 51008835, 51009472,
                              51009721, 51009968, 51010551, 51010737, 51011489,
                              51012144, 51012848, 51013029, 51014541, 51014764,
                              51015463, 51015892, 51016363, 51017096
                          ],
                          'ncbi',
                          transcript=1,
                          cds=(51007767, 51016344),
                          select_transcript=False,
                          version=1))
    session.add(
        TranscriptMapping(chromosome_22,
                          'refseq',
                          'NR_021492',
                          'LOC100144603',
                          'forward',
                          51021455,
                          51022356, [51021455, 51022027], [51021752, 51022356],
                          'ncbi',
                          transcript=1,
                          cds=None,
                          select_transcript=False,
                          version=1))
    session.add(
        TranscriptMapping(
            chromosome_1,
            'refseq',
            'NM_001007553',
            'CSDE1',
            'reverse',
            115259538,
            115300624, [
                115259538, 115261234, 115262200, 115263160, 115266504,
                115267842, 115268832, 115269604, 115272879, 115273129,
                115275225, 115276353, 115276610, 115277063, 115279379,
                115280092, 115280584, 115282313, 115292442, 115300546
            ], [
                115260837, 115261366, 115262363, 115263338, 115266623,
                115267954, 115269007, 115269711, 115273043, 115273269,
                115275437, 115276478, 115276738, 115277144, 115279476,
                115280184, 115280693, 115282511, 115292828, 115300624
            ],
            'ncbi',
            transcript=1,
            cds=(115260790, 115282511),
            select_transcript=False,
            version=1))
    session.add(
        TranscriptMapping(
            chromosome_1,
            'refseq',
            'NM_001130523',
            'CSDE1',
            'reverse',
            115259538,
            115300671, [
                115259538, 115261234, 115262200, 115263160, 115266504,
                115267842, 115268832, 115269604, 115272879, 115273129,
                115275225, 115276353, 115276610, 115277063, 115279379,
                115280584, 115282313, 115284148, 115292442, 115300546
            ], [
                115260837, 115261366, 115262363, 115263338, 115266623,
                115267954, 115269007, 115269711, 115273043, 115273269,
                115275437, 115276478, 115276738, 115277144, 115279476,
                115280693, 115282511, 115284294, 115292828, 115300671
            ],
            'ncbi',
            transcript=1,
            cds=(115260790, 115284285),
            select_transcript=False,
            version=1))
    session.add(
        TranscriptMapping(chromosome_1,
                          'refseq',
                          'NM_002241',
                          'KCNJ10',
                          'reverse',
                          160007257,
                          160040051, [160007257, 160039812],
                          [160012322, 160040051],
                          'ncbi',
                          transcript=1,
                          cds=(160011183, 160012322),
                          select_transcript=False,
                          version=4))
    session.add(
        TranscriptMapping(
            chromosome_20,
            'refseq',
            'NM_001162505',
            'TMEM189',
            'reverse',
            48740274,
            48770335,
            [48740274, 48744512, 48746083, 48747402, 48760039, 48770054],
            [48741716, 48744724, 48746227, 48747484, 48760158, 48770335],
            'ncbi',
            transcript=1,
            cds=(48741595, 48770174),
            select_transcript=False,
            version=1))
    session.add(
        TranscriptMapping(
            chromosome_8,
            'refseq',
            'NM_017780',
            'CHD7',
            'forward',
            61591339,
            61779465, [
                61591339, 61653818, 61693559, 61707545, 61712947, 61714087,
                61720776, 61728946, 61732566, 61734349, 61734583, 61735062,
                61736399, 61741222, 61742881, 61748632, 61749376, 61750227,
                61750635, 61754203, 61754406, 61757423, 61757809, 61761074,
                61761610, 61763052, 61763591, 61763821, 61764578, 61765057,
                61765388, 61766922, 61768534, 61769004, 61773463, 61774755,
                61775107, 61777575
            ], [
                61591641, 61655656, 61693989, 61707686, 61713084, 61714152,
                61720831, 61729060, 61732649, 61734486, 61734704, 61735305,
                61736575, 61741365, 61743136, 61748842, 61749571, 61750394,
                61750814, 61754313, 61754611, 61757622, 61757968, 61761163,
                61761713, 61763181, 61763663, 61763878, 61764806, 61765265,
                61766059, 61767082, 61768761, 61769447, 61773684, 61774895,
                61775211, 61779465
            ],
            'ncbi',
            transcript=1,
            cds=(61653992, 61778492),
            select_transcript=False,
            version=2))
    session.add(
        TranscriptMapping(chromosome_mt,
                          'refseq',
                          'NC_012920',
                          'ND4',
                          'forward',
                          10760,
                          12137, [10760], [12137],
                          'reference',
                          transcript=1,
                          cds=(10760, 12137),
                          select_transcript=True,
                          version=1))
    session.add(
        TranscriptMapping(
            chromosome_1,
            'refseq',
            'NM_002001',
            'FCER1A',
            'forward',
            159259504,
            159278014,
            [159259504, 159272096, 159272644, 159273718, 159275778, 159277538],
            [159259543, 159272209, 159272664, 159273972, 159276035, 159278014],
            'ncbi',
            transcript=1,
            cds=(159272155, 159277722),
            select_transcript=False,
            version=2))
    session.add(
        TranscriptMapping(chromosome_7,
                          'refseq',
                          'XM_001715131',
                          'LOC100132858',
                          'reverse',
                          19828,
                          36378, [19828, 20834, 31060, 32957, 35335, 36224],
                          [19895, 21029, 31437, 33107, 35541, 36378],
                          'ncbi',
                          transcript=1,
                          cds=(19828, 36378),
                          select_transcript=False,
                          version=2))
    session.add(
        TranscriptMapping(
            chromosome_x,
            'refseq',
            'NM_004011',
            'DMD',
            'reverse',
            31137345,
            32430371, [
                31137345, 31144759, 31152219, 31164408, 31165392, 31187560,
                31190465, 31191656, 31196049, 31196786, 31198487, 31200855,
                31222078, 31224699, 31227615, 31241164, 31279072, 31341715,
                31366673, 31462598, 31496223, 31497100, 31514905, 31525398,
                31645790, 31676107, 31697492, 31747748, 31792077, 31838092,
                31854835, 31893305, 31947713, 31950197, 31986456, 32235033,
                32305646, 32328199, 32360217, 32361251, 32364060, 32366523,
                32380905, 32382699, 32383137, 32398627, 32404427, 32407618,
                32408188, 32429869, 32430279
            ], [
                31140047, 31144790, 31152311, 31164531, 31165635, 31187718,
                31190530, 31191721, 31196087, 31196922, 31198598, 31201021,
                31222235, 31224784, 31227816, 31241238, 31279133, 31341775,
                31366751, 31462744, 31496491, 31497220, 31515061, 31525570,
                31645979, 31676261, 31697703, 31747865, 31792309, 31838200,
                31854936, 31893490, 31947862, 31950344, 31986631, 32235180,
                32305818, 32328393, 32360399, 32361403, 32364197, 32366645,
                32381075, 32382827, 32383316, 32398797, 32404582, 32407791,
                32408298, 32430030, 32430371
            ],
            'ncbi',
            transcript=1,
            cds=(31140036, 32430326),
            select_transcript=False,
            version=3))
    session.add(
        TranscriptMapping(chromosome_x,
                          'refseq',
                          'NM_004019',
                          'DMD',
                          'reverse',
                          31196312,
                          31285024, [
                              31196312, 31198487, 31200855, 31222078, 31224699,
                              31227615, 31241164, 31279072, 31284927
                          ], [
                              31196922, 31198598, 31201021, 31222235, 31224784,
                              31227816, 31241238, 31279133, 31285024
                          ],
                          'ncbi',
                          transcript=1,
                          cds=(31196782, 31284946),
                          select_transcript=False,
                          version=2))
    session.add(
        TranscriptMapping(
            chromosome_x,
            'refseq',
            'NM_004007',
            'DMD',
            'reverse',
            31137345,
            33038317, [
                31137345, 31144759, 31152219, 31164408, 31165392, 31187560,
                31190465, 31191656, 31196049, 31196786, 31198487, 31200855,
                31222078, 31224699, 31227615, 31241164, 31279072, 31341715,
                31366673, 31462598, 31496223, 31497100, 31514905, 31525398,
                31645790, 31676107, 31697492, 31747748, 31792077, 31838092,
                31854835, 31893305, 31947713, 31950197, 31986456, 32235033,
                32305646, 32328199, 32360217, 32361251, 32364060, 32366523,
                32380905, 32382699, 32383137, 32398627, 32404427, 32407618,
                32408188, 32429869, 32456358, 32459297, 32466573, 32472779,
                32481556, 32482703, 32486615, 32490281, 32503036, 32509394,
                32519872, 32536125, 32563276, 32583819, 32591647, 32591862,
                32613874, 32632420, 32662249, 32663081, 32715987, 32717229,
                32827610, 32834585, 32841412, 32862900, 32867845, 33038256
            ], [
                31140047, 31144790, 31152311, 31164531, 31165635, 31187718,
                31190530, 31191721, 31196087, 31196922, 31198598, 31201021,
                31222235, 31224784, 31227816, 31241238, 31279133, 31341775,
                31366751, 31462744, 31496491, 31497220, 31515061, 31525570,
                31645979, 31676261, 31697703, 31747865, 31792309, 31838200,
                31854936, 31893490, 31947862, 31950344, 31986631, 32235180,
                32305818, 32328393, 32360399, 32361403, 32364197, 32366645,
                32381075, 32382827, 32383316, 32398797, 32404582, 32407791,
                32408298, 32430030, 32456507, 32459431, 32466755, 32472949,
                32481711, 32482816, 32486827, 32490426, 32503216, 32509635,
                32519959, 32536248, 32563451, 32583998, 32591754, 32591963,
                32613993, 32632570, 32662430, 32663269, 32716115, 32717410,
                32827728, 32834757, 32841504, 32862977, 32867937, 33038317
            ],
            'ncbi',
            transcript=1,
            cds=(31140036, 32834745),
            select_transcript=False,
            version=2))
    session.add(
        TranscriptMapping(chromosome_x,
                          'refseq',
                          'NM_203473',
                          'PORCN',
                          'forward',
                          48367371,
                          48379202, [
                              48367371, 48368172, 48369683, 48370280, 48370714,
                              48370977, 48371223, 48372628, 48372913, 48374105,
                              48374278, 48374449, 48375571, 48378763
                          ], [
                              48367491, 48368344, 48369875, 48370323, 48370895,
                              48371107, 48371240, 48372753, 48373013, 48374181,
                              48374341, 48374534, 48375681, 48379202
                          ],
                          'ncbi',
                          transcript=1,
                          cds=(48368209, 48378864),
                          select_transcript=False,
                          version=1))
    session.add(
        TranscriptMapping(
            chromosome_x,
            'refseq',
            'NM_000132',
            'F8',
            'reverse',
            154064063,
            154250998, [
                154064063, 154088707, 154089993, 154091358, 154124352,
                154128141, 154129646, 154130326, 154132181, 154132571,
                154133086, 154134695, 154156846, 154175973, 154182167,
                154185232, 154189350, 154194245, 154194701, 154197606,
                154212962, 154215512, 154221211, 154225248, 154227754,
                154250685
            ], [
                154066027, 154088883, 154090141, 154091502, 154124507,
                154128226, 154129717, 154130442, 154132363, 154132799,
                154133298, 154134848, 154159951, 154176182, 154182317,
                154185446, 154189443, 154194416, 154194962, 154197827,
                154213078, 154215580, 154221423, 154225370, 154227875,
                154250998
            ],
            'ncbi',
            transcript=1,
            cds=(154065872, 154250827),
            select_transcript=False,
            version=3))
    session.add(
        TranscriptMapping(chromosome_3,
                          'refseq',
                          'NM_000249',
                          'MLH1',
                          'forward',
                          37034841,
                          37092337, [
                              37034841, 37038110, 37042446, 37045892, 37048482,
                              37050305, 37053311, 37053502, 37055923, 37058997,
                              37061801, 37067128, 37070275, 37081677, 37083759,
                              37089010, 37090008, 37090395, 37091977
                          ], [
                              37035154, 37038200, 37042544, 37045965, 37048554,
                              37050396, 37053353, 37053590, 37056035, 37059090,
                              37061954, 37067498, 37070423, 37081785, 37083822,
                              37089174, 37090100, 37090508, 37092337
                          ],
                          'ncbi',
                          transcript=1,
                          cds=(37035039, 37092144),
                          select_transcript=False,
                          version=3))

    session.commit()
Example no. 36
    def retrieveslice(self, accno, start, stop, orientation) :
        """
        Retrieve a slice of a chromosome.
        If the arguments are recognised (found in the internal database),
        we look if the associated file is still present and if so: return
        its UD number.
        If the arguments are recognised but no file was found, we download
        the new slice and update the hash (and log if the hash changes).
        If the arguments are not recognised, we download the new slice and
        make a new UD number.
        The content of the slice is placed in the cache with the UD number
        as filename.

        @arg accno: The accession number of the chromosome
        @type accno: unicode
        @arg start: Start position of the slice
        @type start: integer
        @arg stop: End position of the slice.
        @type stop: integer
        @arg orientation:
        Orientation of the slice:
            - 1 ; Forward
            - 2 ; Reverse complement
        @type orientation: integer

        @return: A UD number
        @rtype: unicode
        """

        # Not a valid slice.
        if start >= stop :
            return None

        # The slice can not be too big.
        if stop - start > settings.MAX_FILE_SIZE:
            return None

        slice_orientation = ['forward', 'reverse'][orientation - 1]

        # Check whether we have seen this slice before.
        try:
            reference = Reference.query.filter_by(
                slice_accession=accno, slice_start=start, slice_stop=stop,
                slice_orientation=slice_orientation).one()
        except NoResultFound:
            reference = None
        else:
            if os.path.isfile(self._nametofile(reference.accession)) : # It's still present.
                return reference.accession

        # It's not present, so download it.
        try:
            handle = Entrez.efetch(db='nuccore', rettype='gb', retmode='text',
                                   id=accno, seq_start=start, seq_stop=stop,
                                   strand=orientation)
            raw_data = handle.read()
            handle.close()
        except (IOError, urllib2.HTTPError, HTTPException) as e:
            self._output.addMessage(__file__, -1, 'INFO',
                                    'Error connecting to Entrez nuccore database: %s' % unicode(e))
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not retrieve slice.')
            return None

        # Calculate the hash of the downloaded file.
        md5sum = self._calcHash(raw_data)

        if reference is not None: # We have seen this one before.
            currentmd5sum = reference.checksum

            if md5sum != currentmd5sum :
                self._output.addMessage(__file__, -1, "WHASH",
                    "Warning: Hash of %s changed from %s to %s." % (
                    reference.accession, currentmd5sum, md5sum))
                Reference.query.filter_by(accession=reference.accession).update({'checksum': md5sum})
                session.commit()
            #if
        else : # We haven't seen it before, so give it a name.
            UD = self._newUD()
            reference = Reference(UD, md5sum, slice_accession=accno,
                                  slice_start=start, slice_stop=stop,
                                  slice_orientation=slice_orientation)
            session.add(reference)
            session.commit()
        #else

        if self.write(raw_data, reference.accession, 0):
            return reference.accession
Example no. 37
def import_from_mapview_file(assembly, mapview_file, group_label):
    """
    Import transcript mappings from an NCBI mapview file.

    We require that this file is first sorted on the `feature_id` column
    (#11), which always contains the gene identifier, and then on the
    `chromosome` column (#2).

        sort -t $'\t' -k 11,11 -k 2,2 seq_gene.md > seq_gene.by_gene.md

    Raises :exc:`ValueError` if `mapview_file` is not sorted this way.

    The NCBI mapping file consists of entries, one per line, in order of
    their location in the genome (more specifically by start location).
    Every entry has a 'group_label' column, denoting the assembly it is
    from. We only use entries where this value is `group_label`.

    There are four types of entries (for our purposes):
    - Gene: Name, identifier, and location of a gene.
    - Transcript: Name, gene id, and location of a transcript.
    - UTR: Location and transcript of a non-coding exon (or part of it).
    - CDS: Location and transcript of a coding exon (or part of it).

    A bit troublesome for us is that exons are split in UTR exons and CDS
    exons, with exons overlapping the UTR/CDS border defined as two
    separate entries (one of type UTR and one of type CDS).

    Another minor annoyance is that some transcripts (~ 15) are split over
    two contigs (NT_*). In that case, they are defined by two entries in
    the file, where we should merge them by taking the start position of
    the first and the stop position of the second.

    To complicate this annoyance, some genes (e.g. in the PAR) are mapped
    on both the X and Y chromosomes, but stored in the file just like the
    transcripts split over two contigs. However, these ones should of
    course not be merged.

    Our strategy is to sort by gene and chromosome and process the file
    grouped by these two fields.

    For transcripts without any UTR and CDS entries (seems to happen for
    predicted genes), we generate one exon spanning the entire transcript.

    All positions are one-based, inclusive, and that is what we also use in
    our database.
    """
    columns = [
        'taxonomy', 'chromosome', 'start', 'stop', 'orientation', 'contig',
        'ctg_start', 'ctg_stop', 'ctg_orientation', 'feature_name',
        'feature_id', 'feature_type', 'group_label', 'transcript',
        'evidence_code'
    ]

    chromosomes = assembly.chromosomes.all()

    def read_records(mapview_file):
        for line in mapview_file:
            if line.startswith('#'):
                continue
            record = dict(zip(columns, line.rstrip().split('\t')))

            # Only use records from the given assembly.
            if record['group_label'] != group_label:
                continue

            # Only use records on chromosomes we know.
            try:
                record['chromosome'] = next(c for c in chromosomes
                                            if c.name == 'chr' +
                                            record['chromosome'])
            except StopIteration:
                continue

            record['start'] = int(record['start'])
            record['stop'] = int(record['stop'])

            yield record

    def build_mappings(records):
        # We structure the records per transcript and per record type. This is
        # generalized to a list of records for each type, but we expect only
        # one GENE record (with `-` as transcript value).
        # Note that there can be more than one RNA record per transcript if it
        # is split over different reference contigs.
        by_transcript = defaultdict(lambda: defaultdict(list))
        for r in records:
            by_transcript[r['transcript']][r['feature_type']].append(r)

        gene = by_transcript['-']['GENE'][0]['feature_name']

        for transcript, by_type in by_transcript.items():
            if transcript == '-':
                continue
            accession, version = transcript.split('.')
            version = int(version)
            chromosome = by_type['RNA'][0]['chromosome']
            orientation = ('reverse' if by_type['RNA'][0]['orientation'] == '-'
                           else 'forward')
            start = min(t['start'] for t in by_type['RNA'])
            stop = max(t['stop'] for t in by_type['RNA'])

            exon_starts = []
            exon_stops = []
            cds_positions = []
            for exon in sorted(by_type['UTR'] + by_type['CDS'],
                               key=itemgetter('start')):
                if exon_stops and exon_stops[-1] > exon['start'] - 1:
                    # This exon starts before the end of the previous exon. We
                    # have no idea what to do in this case, so we ignore it.
                    # The number of transcripts affected is very small (e.g.,
                    # NM_031860.1 and NM_001184961.1 in the GRCh37 assembly).
                    continue
                if exon['feature_type'] == 'CDS':
                    cds_positions.extend([exon['start'], exon['stop']])
                if exon_stops and exon_stops[-1] == exon['start'] - 1:
                    # This exon must be merged with the previous one because
                    # it is split over two entries (a CDS part and a UTR part
                    # or split over different reference contigs).
                    exon_stops[-1] = exon['stop']
                else:
                    exon_starts.append(exon['start'])
                    exon_stops.append(exon['stop'])

            if cds_positions:
                cds = min(cds_positions), max(cds_positions)
            else:
                cds = None

            # If no exons are annotated, we create one spanning the entire
            # transcript.
            if not exon_starts:
                exon_starts = [start]
                exon_stops = [stop]

            yield TranscriptMapping.create_or_update(chromosome,
                                                     'refseq',
                                                     accession,
                                                     gene,
                                                     orientation,
                                                     start,
                                                     stop,
                                                     exon_starts,
                                                     exon_stops,
                                                     'ncbi',
                                                     cds=cds,
                                                     version=version)

    processed_keys = set()

    for key, records in groupby(read_records(mapview_file),
                                itemgetter('feature_id', 'chromosome')):
        if key in processed_keys:
            raise MapviewSortError('Mapview file must be sorted by feature_id '
                                   'and chromosome (try `sort -k 11,11 -k '
                                   '2,2`)')
        processed_keys.add(key)

        for mapping in build_mappings(records):
            session.add(mapping)

    session.commit()
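
A hedged usage sketch of the routine this body belongs to: its free
variables (`assembly`, `mapview_file`, `group_label`) suggest a signature
along these lines, but the function name and the label value are
assumptions, not confirmed by the snippet:

    # `assembly` is an assembly ORM object obtained elsewhere.
    with open('seq_gene.by_gene.md') as mapview_file:
        import_from_mapview_file(assembly, mapview_file,
                                 'GRCh37.p2-Primary Assembly')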
Example n. 38
0
    def retrieveslice(self, accno, start, stop, orientation):
        """
        Retrieve a slice of a chromosome.
        If the arguments are recognised (found in the internal database),
        we check whether the associated file is still present and, if so,
        return its UD number.
        If the arguments are recognised but no file was found, we download
        the new slice and update the hash (and log if the hash changes).
        If the arguments are not recognised, we download the new slice and
        make a new UD number.
        The content of the slice is placed in the cache with the UD number
        as filename.

        :arg unicode accno: The accession number of the chromosome.
        :arg int start: Start position of the slice (one-based, inclusive, in
          reference orientation).
        :arg int stop: End position of the slice (one-based, inclusive, in
          reference orientation).
        :arg int orientation: Orientation of the slice:
            - 1 ; Forward.
            - 2 ; Reverse complement.

        :returns: A UD number.
        :rtype: unicode
        """
        # Not a valid slice.
        if start > stop:
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not retrieve slice for start '
                                    'position greater than stop position.')
            return None

        # The slice cannot be too big.
        if stop - start + 1 > settings.MAX_FILE_SIZE:
            self._output.addMessage(__file__, 4, 'ERETR',
                                    'Could not retrieve slice (request '
                                    'exceeds maximum of %d bases)' %
                                    settings.MAX_FILE_SIZE)
            return None

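        # Map the EFetch-style strand value (1 or 2) onto our
        # orientation labels.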
        slice_orientation = ['forward', 'reverse'][orientation - 1]

        # Check whether we have seen this slice before.
        try:
            reference = Reference.query.filter_by(
                slice_accession=accno, slice_start=start, slice_stop=stop,
                slice_orientation=slice_orientation).one()
        except NoResultFound:
            reference = None
        else:
            if os.path.isfile(self._name_to_file(reference.accession)):
                # It's still present.
                return reference.accession

        # It's not present, so download it.
        try:
            # EFetch `seq_start` and `seq_stop` are one-based, inclusive, and
            # in reference orientation.
            handle = Entrez.efetch(
                db='nuccore', rettype='gb', retmode='text', id=accno,
                seq_start=start, seq_stop=stop, strand=orientation)
            raw_data = handle.read()
            handle.close()
        except (IOError, urllib2.HTTPError, HTTPException) as e:
            self._output.addMessage(
                __file__, -1, 'INFO',
                'Error connecting to Entrez nuccore database: {}'.format(
                    unicode(e)))
            self._output.addMessage(
                __file__, 4, 'ERETR', 'Could not retrieve slice.')
            return None

        # Calculate the hash of the downloaded file.
        md5sum = self._calculate_hash(raw_data)

        if reference is not None:
            # We have seen this one before.
            current_md5sum = reference.checksum

            if md5sum != current_md5sum:
                self._output.addMessage(
                    __file__, -1, 'WHASH',
                    'Warning: Hash of {} changed from {} to {}.'.format(
                        reference.accession, current_md5sum, md5sum))
                Reference.query.filter_by(
                    accession=reference.accession).update({'checksum': md5sum})
                session.commit()
        else:
            # We haven't seen it before, so give it a name.
            ud = self._new_ud()
            reference = Reference(
                ud, md5sum, slice_accession=accno, slice_start=start,
                slice_stop=stop, slice_orientation=slice_orientation)
            session.add(reference)
            session.commit()

        if self.write(raw_data, reference.accession, 0):
            return reference.accession
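
A hedged usage sketch (accession, coordinates, and the `retriever`
instance are invented for illustration):

    # Forward (orientation=1) slice, one-based inclusive coordinates.
    ud = retriever.retrieveslice(u'NC_000011.9', 5000000, 5001000, 1)
    # Returns a UD number (unicode) on success, or None on failure.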