Esempio n. 1
0
    def build_mapping(record):
        """
        Create or update the transcript mapping described by one record.

        `record` is indexed by the keys `chromosome`, `transcript`, `strand`,
        `cds_start`, `cds_stop`, `gene`, `start`, `stop` and `exons` (an
        iterable of `(start, stop)` pairs).

        Returns the result of `TranscriptMapping.create_or_update`.

        Raises `ValueError` if no entry in `chromosomes` is named after the
        record's chromosome, or if the transcript value cannot be split into
        an accession and an integer transcript number.
        """
        # Only use records on chromosomes we know.
        chromosome_name = 'chr' + record['chromosome']
        try:
            chromosome = next(c for c in chromosomes
                              if c.name == chromosome_name)
        except StopIteration:
            # Carry the offending name so that whoever handles the error can
            # report which record was rejected and why.
            raise ValueError('Unknown chromosome: %s' % chromosome_name)

        # The transcript value is the accession and the transcript number
        # joined by a `t`; anything else makes the unpacking or the integer
        # conversion fail with a ValueError.
        accession, transcript = record['transcript'].split('t')
        transcript = int(transcript)

        orientation = 'reverse' if record['strand'] == '-1' else 'forward'

        # A false-ish CDS start means there is no CDS for this record.
        if record['cds_start']:
            cds = record['cds_start'], record['cds_stop']
        else:
            cds = None

        # TODO: Also take protein into account. For example, in LRG_321 (TP53)
        # some transcripts occur twice (with different CDSs and different
        # protein numbers).
        # https://github.com/mutalyzer/mutalyzer/issues/372
        return TranscriptMapping.create_or_update(
            chromosome,
            'lrg',
            accession,
            record['gene'],
            orientation,
            record['start'],
            record['stop'],
            [start for start, _ in record['exons']],
            [stop for _, stop in record['exons']],
            'ebi',
            transcript=transcript,
            cds=cds,
            select_transcript=True)
Esempio n. 2
0
    def build_mapping(record):
        """
        Create or update the transcript mapping described by one record.

        `record` is indexed by the keys `chromosome`, `transcript`, `strand`,
        `cds_start`, `cds_stop`, `gene`, `start`, `stop` and `exons` (an
        iterable of `(start, stop)` pairs).

        Returns the result of `TranscriptMapping.create_or_update`.

        Raises `ValueError` if no entry in `chromosomes` is named after the
        record's chromosome, or if the transcript value cannot be split into
        an accession and an integer transcript number.
        """
        # Only use records on chromosomes we know.
        chromosome_name = 'chr' + record['chromosome']
        try:
            chromosome = next(c for c in chromosomes
                              if c.name == chromosome_name)
        except StopIteration:
            # Carry the offending name so that whoever handles the error can
            # report which record was rejected and why.
            raise ValueError('Unknown chromosome: %s' % chromosome_name)

        # The transcript value is the accession and the transcript number
        # joined by a `t`; anything else makes the unpacking or the integer
        # conversion fail with a ValueError.
        accession, transcript = record['transcript'].split('t')
        transcript = int(transcript)

        orientation = 'reverse' if record['strand'] == '-1' else 'forward'

        # A false-ish CDS start means there is no CDS for this record.
        if record['cds_start']:
            cds = record['cds_start'], record['cds_stop']
        else:
            cds = None

        # TODO: Also take protein into account. For example, in LRG_321 (TP53)
        # some transcripts occur twice (with different CDSs and different
        # protein numbers).
        # https://github.com/mutalyzer/mutalyzer/issues/372
        return TranscriptMapping.create_or_update(
            chromosome, 'lrg', accession, record['gene'], orientation,
            record['start'], record['stop'],
            [start for start, _ in record['exons']],
            [stop for _, stop in record['exons']],
            'ebi', transcript=transcript, cds=cds, select_transcript=True)
Esempio n. 3
0
    def build_mappings(records):
        """
        Generate one transcript mapping per transcript found in `records`.

        Every record is indexed by `transcript` and `feature_type`; depending
        on its type it is also read for `feature_name`, `chromosome`,
        `orientation`, `start` and `stop`. Records with `-` as transcript
        value only supply the gene name and yield no mapping themselves.
        """
        # Group the records per transcript and, below that, per record type.
        # Every type maps to a list of records, although only one GENE record
        # (with `-` as transcript value) is expected. A transcript can have
        # several RNA records if it is split over different reference contigs.
        grouped = defaultdict(lambda: defaultdict(list))
        for record in records:
            per_type = grouped[record['transcript']]
            per_type[record['feature_type']].append(record)

        gene_name = grouped['-']['GENE'][0]['feature_name']

        for name, per_type in grouped.items():
            if name == '-':
                continue

            accession, version = name.split('.')
            version = int(version)

            rna_parts = per_type['RNA']
            first_rna = rna_parts[0]
            chromosome = first_rna['chromosome']
            if first_rna['orientation'] == '-':
                orientation = 'reverse'
            else:
                orientation = 'forward'
            start = min(part['start'] for part in rna_parts)
            stop = max(part['stop'] for part in rna_parts)

            exon_starts = []
            exon_stops = []
            cds_positions = []

            parts = per_type['UTR'] + per_type['CDS']
            parts.sort(key=itemgetter('start'))
            for part in parts:
                adjacent = False
                if exon_stops:
                    previous_stop = exon_stops[-1]
                    boundary = part['start'] - 1
                    if previous_stop > boundary:
                        # This part starts before the end of the previous
                        # exon. We have no idea what to do in this case, so
                        # we ignore it. The number of transcripts affected is
                        # very small (e.g., NM_031860.1 and NM_001184961.1 in
                        # the GRCh37 assembly).
                        continue
                    adjacent = previous_stop == boundary

                if part['feature_type'] == 'CDS':
                    cds_positions += [part['start'], part['stop']]

                if adjacent:
                    # Directly follows the previous entry, so both belong to
                    # one exon that is split over two entries (a CDS part and
                    # a UTR part or split over different reference contigs).
                    exon_stops[-1] = part['stop']
                else:
                    exon_starts.append(part['start'])
                    exon_stops.append(part['stop'])

            cds = None
            if cds_positions:
                cds = min(cds_positions), max(cds_positions)

            # Without annotated exons, use a single exon spanning the entire
            # transcript.
            if not exon_starts:
                exon_starts, exon_stops = [start], [stop]

            yield TranscriptMapping.create_or_update(
                chromosome, 'refseq', accession, gene_name, orientation,
                start, stop, exon_starts, exon_stops, 'ncbi', cds=cds,
                version=version)
Esempio n. 4
0
    def build_mappings(records):
        """
        Generate one transcript mapping per transcript found in `records`.

        Every record is indexed by `transcript` and `feature_type`; depending
        on its type it is also read for `feature_name`, `chromosome`,
        `orientation`, `start` and `stop`. Records with `-` as transcript
        value only supply the gene name and yield no mapping themselves.
        """
        # Group the records per transcript and, below that, per record type.
        # Every type maps to a list of records, although only one GENE record
        # (with `-` as transcript value) is expected. A transcript can have
        # several RNA records if it is split over different reference contigs.
        grouped = defaultdict(lambda: defaultdict(list))
        for record in records:
            per_type = grouped[record['transcript']]
            per_type[record['feature_type']].append(record)

        gene_name = grouped['-']['GENE'][0]['feature_name']

        for name, per_type in grouped.items():
            if name == '-':
                continue

            accession, version = name.split('.')
            version = int(version)

            rna_parts = per_type['RNA']
            first_rna = rna_parts[0]
            chromosome = first_rna['chromosome']
            if first_rna['orientation'] == '-':
                orientation = 'reverse'
            else:
                orientation = 'forward'
            start = min(part['start'] for part in rna_parts)
            stop = max(part['stop'] for part in rna_parts)

            exon_starts = []
            exon_stops = []
            cds_positions = []

            parts = per_type['UTR'] + per_type['CDS']
            parts.sort(key=itemgetter('start'))
            for part in parts:
                adjacent = False
                if exon_stops:
                    previous_stop = exon_stops[-1]
                    boundary = part['start'] - 1
                    if previous_stop > boundary:
                        # This part starts before the end of the previous
                        # exon. We have no idea what to do in this case, so
                        # we ignore it. The number of transcripts affected is
                        # very small (e.g., NM_031860.1 and NM_001184961.1 in
                        # the GRCh37 assembly).
                        continue
                    adjacent = previous_stop == boundary

                if part['feature_type'] == 'CDS':
                    cds_positions += [part['start'], part['stop']]

                if adjacent:
                    # Directly follows the previous entry, so both belong to
                    # one exon that is split over two entries (a CDS part and
                    # a UTR part or split over different reference contigs).
                    exon_stops[-1] = part['stop']
                else:
                    exon_starts.append(part['start'])
                    exon_stops.append(part['stop'])

            cds = None
            if cds_positions:
                cds = min(cds_positions), max(cds_positions)

            # Without annotated exons, use a single exon spanning the entire
            # transcript.
            if not exon_starts:
                exon_starts, exon_stops = [start], [stop]

            yield TranscriptMapping.create_or_update(
                chromosome, 'refseq', accession, gene_name, orientation,
                start, stop, exon_starts, exon_stops, 'ncbi', cds=cds,
                version=version)
Esempio n. 5
0
def import_from_ucsc_by_gene(assembly, gene):
    """
    Import transcript mappings for a gene from the UCSC.

    Connects to the UCSC MySQL server using `assembly.alias` as database
    name, selects the mRNA transcripts whose gene name equals `gene`, creates
    or updates a transcript mapping for each of them on the chromosome of
    `assembly` with the same name, and commits the session.

    The database connection and cursor are always closed, also when the
    query fails.
    """
    query = """
        SELECT DISTINCT
          acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts,
          exonEnds, name2 AS geneName, chrom, strand, protAcc
        FROM gbStatus, refGene, refLink
        WHERE type = "mRNA"
        AND refGene.name = acc
        AND acc = mrnaAcc
        AND name2 = %s
    """
    parameters = gene,

    connection = MySQLdb.connect(user='******',
                                 host='genome-mysql.cse.ucsc.edu',
                                 db=assembly.alias,
                                 charset='utf8',
                                 use_unicode=True)
    # Release the remote connection (and the cursor) no matter how the query
    # ends; all rows are fetched before closing so nothing below needs them.
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(query, parameters)
            result = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        connection.close()

    # All ranges in the UCSC tables are zero-based and open-ended. We convert
    # this to one-based, inclusive for our database.

    for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds,
         geneName, chrom, strand, protAcc) in result:
        chromosome = assembly.chromosomes.filter_by(name=chrom).one()
        orientation = 'reverse' if strand == '-' else 'forward'
        # Exon boundaries are comma-separated lists; the `if i` filter drops
        # empty items such as the one after a trailing comma.
        exon_starts = [int(i) + 1 for i in exonStarts.split(',') if i]
        exon_stops = [int(i) for i in exonEnds.split(',') if i]
        # NOTE(review): this truthiness test also treats a cdsStart of 0 as
        # "no CDS" -- confirm that this is intended.
        if cdsStart and cdsEnd:
            cds = cdsStart + 1, cdsEnd
        else:
            cds = None
        mapping = TranscriptMapping.create_or_update(
            chromosome, 'refseq', acc, geneName, orientation, txStart + 1,
            txEnd, exon_starts, exon_stops, 'ucsc', cds=cds,
            version=int(version))
        session.add(mapping)

    session.commit()
Esempio n. 6
0
def import_from_reference(assembly, reference):
    """
    Import transcript mappings from a genomic reference.

    Loads `reference` with the GenBank retriever and, for every gene in the
    record that has at least one transcript, creates or updates a transcript
    mapping on the `chrM` chromosome of `assembly`. The session is committed
    once all genes have been processed.

    Raises `ValueError` if the molecule type of the loaded record is not
    `m`.

    .. todo: Also report how much was added/updated.

    .. note: Currently no exon locations are supported, this has only been
       tested on mtDNA.
    """
    chromosome = assembly.chromosomes.filter_by(name='chrM').one()

    output = Output(__file__)
    retriever = Retriever.GenBankRetriever(output)
    record = retriever.loadrecord(reference)

    if record.molType != 'm':
        raise ValueError('Only mitochondrial references are supported')

    # Transcript selection is only requested when the record has more than
    # one gene.
    select_transcript = len(record.geneList) > 1

    for gene in record.geneList:
        # We support exactly one transcript per gene: the first one when
        # ordered by name. Genes without any transcript are skipped.
        try:
            transcript = sorted(gene.transcriptList, key=attrgetter('name'))[0]
        except IndexError:
            continue

        # We use gene.location for now (instead of the mRNA location of the
        # transcript), it is always present and the same for our purposes.
        start, stop = gene.location

        orientation = 'reverse' if gene.orientation == -1 else 'forward'

        # Not every transcript has a CDS to read a location from.
        try:
            cds = transcript.CDS.location
        except AttributeError:
            cds = None

        # No exon locations are supported, so a single exon spanning the
        # entire gene is stored.
        mapping = TranscriptMapping.create_or_update(
            chromosome,
            'refseq',
            record.source_accession,
            gene.name,
            orientation,
            start,
            stop,
            [start],
            [stop],
            'reference',
            cds=cds,
            select_transcript=select_transcript,
            version=int(record.source_version))
        session.add(mapping)

    session.commit()
Esempio n. 7
0
def import_from_reference(assembly, reference):
    """
    Import transcript mappings from a genomic reference.

    Loads `reference` with the GenBank retriever and, for every gene in the
    record that has at least one transcript, creates or updates a transcript
    mapping on the `chrM` chromosome of `assembly`. The session is committed
    once all genes have been processed.

    Raises `ValueError` if the molecule type of the loaded record is not
    `m`.

    .. todo: Also report how much was added/updated.

    .. note: Currently no exon locations are supported, this has only been
       tested on mtDNA.
    """
    chromosome = assembly.chromosomes.filter_by(name='chrM').one()

    output = Output(__file__)
    retriever = Retriever.GenBankRetriever(output)
    record = retriever.loadrecord(reference)

    if record.molType != 'm':
        raise ValueError('Only mitochondrial references are supported')

    # Transcript selection is only requested when the record has more than
    # one gene.
    select_transcript = len(record.geneList) > 1

    for gene in record.geneList:
        # We support exactly one transcript per gene: the first one when
        # ordered by name. Genes without any transcript are skipped.
        try:
            transcript = sorted(gene.transcriptList, key=attrgetter('name'))[0]
        except IndexError:
            continue

        # We use gene.location for now (instead of the mRNA location of the
        # transcript), it is always present and the same for our purposes.
        start, stop = gene.location

        orientation = 'reverse' if gene.orientation == -1 else 'forward'

        # Not every transcript has a CDS to read a location from.
        try:
            cds = transcript.CDS.location
        except AttributeError:
            cds = None

        # No exon locations are supported, so a single exon spanning the
        # entire gene is stored.
        mapping = TranscriptMapping.create_or_update(
            chromosome, 'refseq', record.source_accession, gene.name,
            orientation, start, stop, [start], [stop], 'reference', cds=cds,
            select_transcript=select_transcript,
            version=int(record.source_version))
        session.add(mapping)

    session.commit()
Esempio n. 8
0
def import_from_ucsc_by_gene(assembly, gene):
    """
    Import transcript mappings for a gene from the UCSC.

    Connects to the UCSC MySQL server using `assembly.alias` as database
    name, selects the mRNA transcripts whose gene name equals `gene`, creates
    or updates a transcript mapping for each of them on the chromosome of
    `assembly` with the same name, and commits the session.

    The database connection and cursor are always closed, also when the
    query fails.
    """
    query = """
        SELECT DISTINCT
          acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts,
          exonEnds, name2 AS geneName, chrom, strand, protAcc
        FROM gbStatus, refGene, refLink
        WHERE type = "mRNA"
        AND refGene.name = acc
        AND acc = mrnaAcc
        AND name2 = %s
    """
    parameters = gene,

    connection = MySQLdb.connect(user='******',
                                 host='genome-mysql.cse.ucsc.edu',
                                 db=assembly.alias,
                                 charset='utf8',
                                 use_unicode=True)
    # Release the remote connection (and the cursor) no matter how the query
    # ends; all rows are fetched before closing so nothing below needs them.
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(query, parameters)
            result = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        connection.close()

    # All ranges in the UCSC tables are zero-based and open-ended. We convert
    # this to one-based, inclusive for our database.

    for (acc, version, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds,
         geneName, chrom, strand, protAcc) in result:
        chromosome = assembly.chromosomes.filter_by(name=chrom).one()
        orientation = 'reverse' if strand == '-' else 'forward'
        # Exon boundaries are comma-separated lists; the `if i` filter drops
        # empty items such as the one after a trailing comma.
        exon_starts = [int(i) + 1 for i in exonStarts.split(',') if i]
        exon_stops = [int(i) for i in exonEnds.split(',') if i]
        # NOTE(review): this truthiness test also treats a cdsStart of 0 as
        # "no CDS" -- confirm that this is intended.
        if cdsStart and cdsEnd:
            cds = cdsStart + 1, cdsEnd
        else:
            cds = None
        mapping = TranscriptMapping.create_or_update(
            chromosome, 'refseq', acc, geneName, orientation, txStart + 1,
            txEnd, exon_starts, exon_stops, 'ucsc', cds=cds,
            version=int(version))
        session.add(mapping)

    session.commit()