def td_contents(selfself, item, attr_list):
        """ Returns the HTML for the allele-name cell of a genotype row.

        The 'Totals' row is rendered in bold. For any other row the name is
        italicised when it is absent from the IMGT reference set for the
        submission's species, and shown in bold when the row has at least one
        inferred sequence attached.

        The allele name may have been assigned by a submitter, so it is
        HTML-escaped before being wrapped in markup.

        :param item: the row object; reads sequence_id, inferred_sequences and
                     genotype_description.submission.species
        :param attr_list: unused here
        :return: a Markup object holding the cell contents
        """
        text = item.sequence_id

        if text == 'Totals':
            return Markup('<strong>Totals</strong>')

        imgt_ref = get_imgt_reference_genes()

        # Markup.escape() neutralises any HTML in the name; the %-formatting
        # on a Markup template leaves already-escaped Markup untouched.
        label = Markup.escape(text)
        if text not in imgt_ref[
                item.genotype_description.submission.species]:
            label = Markup('<em>%s</em>') % label
        if len(item.inferred_sequences) > 0:
            label = Markup('<strong>%s</strong>') % label
        return label
Exemple #2
0
    def get(self, species, name):
        """ Returns the ungapped and gapped sequence for the given IMGT name,
        or a 404 response if the species or name is not in the reference set """
        ungapped_ref = get_imgt_reference_genes()
        gapped_ref = get_imgt_gapped_reference_genes()

        # Guard clause: bail out early when the lookup cannot succeed
        if species not in ungapped_ref or name not in ungapped_ref[species]:
            return 'Not found', 404

        return {
            'species': species,
            'imgt_name': name,
            'sequence': str(ungapped_ref[species][name]),
            'coding_seq_imgt': str(gapped_ref[species][name])
        }
Exemple #3
0
    def get(self, species):
        """ Returns the published, affirmed sequences for the species,
        converted by descs_to_airr. Responds 404 if the species is not in the
        IMGT reference set, and with an empty list if no committee covers it """
        if species not in get_imgt_reference_genes():
            return 'Not found', 404

        # Each row of the query result is a 1-tuple holding the species name
        committee_rows = db.session.query(Committee.species).all()
        committee_species = [row[0] for row in committee_rows]
        if species not in committee_species:
            return []

        affirmed = db.session.query(GeneDescription).filter(
            GeneDescription.status == 'published',
            GeneDescription.affirmation_level != '0',
            GeneDescription.species == species).all()

        return self.descs_to_airr(affirmed)
def download_sequences(species, format, exc):
    """ Builds a file download of the published, affirmed sequences for a species.

    :param species: species name; must match a Committee.species value
    :param format: 'gapped', 'ungapped' (FASTA output) or 'airr' (JSON output)
    :param exc: when 'non' and the species is in the IMGT reference set, only
                sequences with an empty imgt_name are included
    :return: an attachment Response, or a redirect to '/' (with a flashed
             message) on invalid input or when there is nothing to download
    """
    if format not in ('gapped', 'ungapped', 'airr'):
        flash('Invalid format')
        return redirect('/')

    # Each row of the query result is a 1-tuple holding the species name
    known_species = [row[0] for row in db.session.query(Committee.species).all()]
    if species not in known_species:
        flash('Invalid species')
        return redirect('/')

    selected = db.session.query(GeneDescription).filter(
        GeneDescription.status == 'published',
        GeneDescription.affirmation_level != '0',
        GeneDescription.species == species,
    ).all()

    imgt_ref = get_imgt_reference_genes()
    if species in imgt_ref and exc == 'non':
        # Keep only the descriptions that carry no IMGT name
        selected = [desc for desc in selected if desc.imgt_name == '']

    if not selected:
        flash('No sequences to download')
        return redirect('/')

    if format == 'airr':
        airr_records = [vars(AIRRAlleleDescription(desc)) for desc in selected]
        dl = json.dumps(airr_records, default=str, indent=4)
        ext = 'json'
    else:
        dl = descs_to_fasta(selected, format)
        ext = 'fa'

    filename = 'affirmed_germlines_%s_%s.%s' % (species, format, ext)
    disposition = "attachment; filename=%s" % filename
    return Response(dl,
                    mimetype="application/octet-stream",
                    headers={"Content-disposition": disposition})
Exemple #5
0
    def get(self):
        """ Returns the species names present in the IMGT reference set """
        species_names = list(get_imgt_reference_genes().keys())
        return {'species': species_names}
Exemple #6
0
def generate_stats(form):
    """ Computes per-gene usage statistics across the genotypes that underlie
    published, affirmed inferences for the species/locus/sequence type chosen
    in the form.

    Validation errors for unknown rare / very-rare gene names are written to
    form.rare_genes.errors and form.very_rare_genes.errors.

    :param form: form object; reads species, locus, sequence_type, rare_genes,
                 very_rare_genes, freq_threshold, rare_threshold,
                 very_rare_threshold, allelic_threshold and
                 assigned_unmutated_threshold (each via .data)
    :return: tuple (number of genotype descriptions, list of per-gene stat
             dicts, StringIO holding a CSV of raw unmutated frequencies), or
             (0, None, None) when the species is unknown, a listed gene is not
             in the reference set, or no genotype descriptions are found
    """
    species = form.species.data
    locus = form.locus.data
    sequence_type = form.sequence_type.data

    imgt_ref = get_imgt_reference_genes()
    if species not in imgt_ref:
        return (0, None, None)

    species_ref = imgt_ref[species]

    def split_genes(field_value):
        # Drop empty tokens (blank field, trailing or doubled commas). An empty
        # string is a substring of every gene name, so keeping it would match
        # every reference gene and apply the rare threshold across the board.
        return [g for g in (field_value or '').replace(' ', '').split(',') if g]

    def missing_from_reference(genes):
        # A gene counts as present if it is a substring of any reference name
        return [g for g in genes
                if not any(g in ref_name for ref_name in species_ref.keys())]

    rare_genes = split_genes(form.rare_genes.data)
    rare_missing = missing_from_reference(rare_genes)
    if rare_missing:
        form.rare_genes.errors = ['Gene(s) %s not found in IMGT reference database' % ', '.join(rare_missing)]

    very_rare_genes = split_genes(form.very_rare_genes.data)
    very_rare_missing = missing_from_reference(very_rare_genes)
    if very_rare_missing:
        form.very_rare_genes.errors = ['Gene(s) %s not found in IMGT reference database' % ', '.join(very_rare_missing)]

    if rare_missing or very_rare_missing:
        return (0, None, None)

    # Reference genes for this locus and sequence type. The slice (rather
    # than gene[3]) avoids an IndexError on names shorter than four characters.
    ref = [gene for gene in species_ref.keys()
           if locus in gene and gene[3:4] == sequence_type]

    # Calculate thresholds for each reference gene; a very-rare match takes
    # precedence over a rare match, which takes precedence over the default

    gene_thresh = {}

    for gene in species_ref.keys():
        thresh = form.freq_threshold.data
        if any(rg in gene for rg in rare_genes):
            thresh = form.rare_threshold.data
        if any(rg in gene for rg in very_rare_genes):
            thresh = form.very_rare_threshold.data
        gene_thresh[gene] = thresh

    # Get unique list of genotype descriptions that underlie affirmed inferences

    genotype_descriptions = []
    seqs = db.session.query(GeneDescription).filter(GeneDescription.status == 'published',
                                                    GeneDescription.species == species,
                                                    GeneDescription.locus == locus,
                                                    GeneDescription.sequence_type == sequence_type,
                                                    GeneDescription.affirmation_level != '0'
                                                    ).all()

    for seq in seqs:
        for genotype in seq.inferred_sequences:
            if genotype.genotype_description not in genotype_descriptions:
                genotype_descriptions.append(genotype.genotype_description)

    if len(genotype_descriptions) == 0:
        return (0, None, None)

    # Initialise stats and the raw table. Both are keyed by the same names
    # (names containing '/OR' are excluded), ordered by the first three
    # components of parse_name. One sort on the tuple gives the same order as
    # three successive stable sorts on components 2, 1 and 0.

    def name_order(name):
        parsed = parse_name(name)
        return (parsed[0], parsed[1], parsed[2])

    ordered_names = sorted((name for name in ref if '/OR' not in name), key=name_order)

    stats = OrderedDict(
        (name, {'occurrences': 0, 'unmutated_freq': [], 'gene': name})
        for name in ordered_names)
    raw = OrderedDict((name, {'gene': name}) for name in ordered_names)

    # Compose stats

    gen_names = ['gene']

    for desc in genotype_descriptions:
        gen_name = "%s/%s" % (desc.submission.submission_id, desc.genotype_name)
        gen_names.append(gen_name)
        for gen in desc.genotypes:
            if gen.sequence_id in stats \
              and (gen.allelic_percentage is None or gen.allelic_percentage==0 or gen.allelic_percentage >= form.allelic_threshold.data) \
              and (gen.assigned_unmutated_frequency is None or gen.assigned_unmutated_frequency >= form.assigned_unmutated_threshold.data) \
              and (gen.unmutated_frequency is not None and gen.unmutated_frequency >= gene_thresh[gen.sequence_id]):
                stats[gen.sequence_id]['occurrences'] += 1
                stats[gen.sequence_id]['unmutated_freq'].append(gen.unmutated_frequency)
            if gen.sequence_id in raw and (gen.allelic_percentage is None or gen.allelic_percentage >= form.allelic_threshold.data):
                raw[gen.sequence_id][gen_name] = gen.unmutated_frequency

    # Replace each list of frequencies by its mean (0 when the list is empty)
    for stat in stats.values():
        freqs = stat['unmutated_freq']
        stat['unmutated_freq'] = round(sum(freqs) / max(len(freqs), 1), 2)

    ret = list(stats.values())

    # One CSV row per gene, one column per genotype
    ro = StringIO()
    writer = csv.DictWriter(ro, fieldnames=gen_names)
    writer.writeheader()
    for gene in raw:
        writer.writerow(raw[gene])

    return (len(genotype_descriptions), ret, ro)
def setup_gv_table(desc):
    """ Builds the two tables shown on the genotype view.

    The full table lists every genotype row of desc plus a synthetic 'Totals'
    row. The second table lists only rows whose allele is either absent from
    the IMGT reference set for the submission's species or is one of the
    description's inferred sequences.

    :param desc: genotype description; reads genotypes, locus and
                 inferred_sequences
    :return: tuple (full table, inferred/novel table)
    """
    allele_tooltip = 'Identifier of the allele (either IMGT, or the name assigned by the submitter to an inferred gene)'

    table = make_Genotype_full_table(desc.genotypes,
                                     desc.locus,
                                     False,
                                     classes=['table-bordered'])
    table._cols['sequence_id'] = GenTitleCol('Allele name',
                                             tooltip=allele_tooltip)
    table.rotate_header = True
    table.add_column(
        'nt_sequence',
        SeqCol('Sequence', tooltip="Click to view or download sequence"))
    table.table_id = 'genotype_table'

    # Add totals row

    totals = Genotype()
    totals.sequence_id = 'Totals'
    totals.sequences = 0
    totals.unmutated_sequences = 0
    lh_seqs = 0    # sequences attributed to the left-hand haplotype
    hap_seqs = 0   # sequences in rows that carry a haplotyping ratio

    for gen in desc.genotypes:
        # Missing counts are treated as zero rather than aborting the row
        seq_count = gen.sequences or 0
        totals.sequences += seq_count
        totals.unmutated_sequences += gen.unmutated_sequences or 0

        ratio = gen.haplotyping_ratio
        if ratio and ':' in ratio:
            try:
                lh = int(ratio.split(':')[0])
            except ValueError:
                # Unparseable ratio: leave this row out of the haplotype total
                continue
            # Weight the row's left-hand percentage by the row's own sequence
            # count, and only for rows that actually report a ratio
            lh_seqs += lh * seq_count / 100
            hap_seqs += seq_count

    if lh_seqs > 0:
        lh_prop = round(100 * lh_seqs / hap_seqs)
        totals.haplotyping_ratio = "%d:%d" % (lh_prop, (100 - lh_prop))

    # Guard against division by zero when there are no sequences at all
    if totals.sequences > 0:
        totals.assigned_unmutated_frequency = round(
            100 * totals.unmutated_sequences / totals.sequences, 2)
    table.items.append(totals)

    inferred_seqs = [
        inf.sequence_details.sequence_id for inf in desc.inferred_sequences
    ]

    imgt_ref = get_imgt_reference_genes()
    # The 'Totals' test comes first so the species lookup is never attempted
    # on the synthetic totals row
    novel = [
        item for item in desc.genotypes
        if item.sequence_id != 'Totals' and (
            item.sequence_id
            not in imgt_ref[item.genotype_description.submission.species]
            or item.sequence_id in inferred_seqs)
    ]

    inferred_table = make_Genotype_novel_table(novel,
                                               False,
                                               classes=['table-bordered'])
    inferred_table._cols['sequence_id'] = GenTitleCol('Allele name',
                                                      tooltip=allele_tooltip)
    inferred_table.add_column(
        'nt_sequence',
        SeqCol('Sequence', tooltip="Click to view or download sequence"))
    inferred_table.rotate_header = True

    return (table, inferred_table)
    def td_contents(self, item, attr_list):
        """ Returns the HTML for the sequence cell of a genotype row: a
        view-sequence button followed by zero or more check / information
        buttons.

        Buttons produced, depending on the row:
          - agreement with, or difference from, the reference sequence of the
            same name (with an alignment when different)
          - a match against an IGPDB gene (species 'Human' only)
          - a match against a VDJbase gene (species 'Human', name not in the
            reference set)
          - for names not in the reference set: nearest reference missing,
            exact match to another reference gene, low match / indels,
            previously unreported amino acids, possible repeated read errors
            and G/C SNPs in RGYW/WRCY hotspots

        :param item: the row object; reads nt_sequence, nt_sequence_gapped,
                     sequence_id, closest_reference and genotype_description
        :param attr_list: unused here
        :return: the concatenated button HTML, or '' when the row has no
                 nucleotide sequence
        """
        if not item.nt_sequence:  # e.g. for totals column
            return ''

        imgt_ref = get_imgt_reference_genes()
        imgt_ref_gapped = get_imgt_gapped_reference_genes()
        ref_codon_usage = get_reference_v_codon_usage()

        species = item.genotype_description.submission.species
        species_ref = imgt_ref[species]
        seq_lower = item.nt_sequence.lower()
        in_reference = item.sequence_id in species_ref

        # Comparison against the reference sequence of the same name

        if in_reference:
            named_ref_seq = species_ref[item.sequence_id]
            if seq_lower == named_ref_seq:
                icon = 'glyphicon-ok'
                colour = 'text-info'
                aln_text = ' data-toggle="tooltip" title="Agrees with Reference"'
            else:
                icon = 'glyphicon-remove'
                colour = 'text-danger'
                alignments = pairwise2.align.globalms(
                    seq_lower,
                    named_ref_seq,
                    2,
                    -1,
                    -2,
                    -1,
                    one_alignment_only=True)
                alignment = format_aln(format_alignment(*alignments[0]),
                                       item.sequence_id, 'Reference', 50)
                fasta_seqs = format_fasta_sequence(
                    item.sequence_id, seq_lower,
                    50) + format_fasta_sequence('Reference', named_ref_seq, 50)
                aln_text = Markup(
                    ' id="btn_view_check" data-target="#seqModal" data-sequence="%s" data-name="%s" data-fa="%s" data-toggle="modal" title="Differs from Reference (click to view)"'
                    % (alignment, item.sequence_id, fasta_seqs))

            bt_check = '<button type="button" class="btn btn-xs %s icon_back" %s><span class="glyphicon %s"></span>&nbsp;</button>' \
                        % (colour, aln_text, icon)
        else:
            bt_check = ''

        # IGPDB match: either sequence contained in the other

        bt_igpdb = ''

        if species == 'Human':
            igpdb_genes = get_igpdb_ref()
            for k, v in igpdb_genes.items():
                if seq_lower in v or v in seq_lower:
                    bt_igpdb = '<button type="button" class="btn btn-xs text-info icon_back" data-toggle="tooltip" title="Sequence matches IGPDB gene %s"><span class="glyphicon glyphicon-info-sign"></span>&nbsp;</button>' % k
                    break

        # VDJbase match: either sequence contained in the other

        bt_vdjbase = ''

        if species == 'Human' and not in_reference:
            vdjbase_ref = get_vdjbase_ref()
            vdjbase_species = species.replace('Human_TCR', 'Human')
            locus = item.genotype_description.locus
            if vdjbase_species in vdjbase_ref and locus in vdjbase_ref[
                    vdjbase_species]:
                vdjbase_genes = vdjbase_ref[vdjbase_species][locus]
                for vdjbase_name, (vdjbase_seq,
                                   vdjbase_count) in vdjbase_genes.items():
                    if seq_lower in vdjbase_seq or vdjbase_seq in seq_lower:
                        bt_vdjbase = '<button type="button" name="vdjbasebtn" id="vdjbasebtn" class="btn btn-xs text-info icon_back"  onclick="window.open(%s)" data-toggle="tooltip" title="Sequence matches VDJbase gene %s (found in %s subjects). Click to view in VDJbase."><span class="glyphicon glyphicon-info-sign"></span>&nbsp;</button>' % \
                                     (Markup("'%sgenerep/%s/%s/%s'" % (app.config['VDJBASE_URL'], vdjbase_species, locus, vdjbase_name)), vdjbase_name, vdjbase_count)
                        break

        bt_indels = ''
        bt_imgt = ''
        bt_codon_usage = ''
        bt_runs = ''
        bt_hotspots = ''
        bt_ref_found = ''
        annots = []

        if not in_reference:
            if item.closest_reference not in species_ref:
                bt_ref_found = '<button type="button" class="btn btn-xs text-info icon_back" data-toggle="tooltip" title="Nearest reference not found in IMGT reference set"><span class="glyphicon glyphicon-info-sign"></span>&nbsp;</button>'
            else:
                for k, v in species_ref.items():
                    if seq_lower == v:
                        bt_imgt = '<button type="button" class="btn btn-xs text-info icon_back" data-toggle="tooltip" title="Sequence matches IMGT gene %s"><span class="glyphicon glyphicon-info-sign"></span>&nbsp;</button>' % k
                        break

                # QA Checks

                # Alignment issues

                ref_nt = species_ref[item.closest_reference].upper()
                seq_nt = item.nt_sequence.upper()

                # Position-by-position comparison over the shared length only
                mismatch = 0

                for (r, s) in zip(ref_nt, seq_nt):
                    if r != s:
                        mismatch += 1

                aligned = mismatch <= 20

                if not aligned:
                    bt_indels = '<button type="button" class="btn btn-xs text-info icon_back" data-toggle="tooltip" title="Sequence has indels/low match when compared to reference sequence"><span class="glyphicon glyphicon-info-sign"></span>&nbsp;</button>'

                if aligned:
                    # Check for unusual AAs at each position

                    if item.genotype_description.sequence_type == 'V' and item.genotype_description.locus + 'V' in ref_codon_usage[
                            species]:
                        try:
                            q_codons = []
                            ref_aa_gapped = imgt_ref_gapped[species][
                                item.closest_reference].upper().translate(
                                    gap='.')
                            seq_aa = Seq(item.nt_sequence.upper()).translate()

                            seq_aa_gapped = gap_sequence(seq_aa, ref_aa_gapped)
                            family = find_family(item.closest_reference)

                            for i in range(
                                    min(len(ref_aa_gapped),
                                        len(seq_aa_gapped))):
                                ref_aa = ref_aa_gapped[i]
                                query_aa = seq_aa_gapped[i]
                                # Only substitutions between two real residues
                                # are of interest: skip stops and gaps
                                if ref_aa != query_aa \
                                        and '*' not in (ref_aa, query_aa) \
                                        and '.' not in (ref_aa, query_aa):
                                    if query_aa not in ref_codon_usage[
                                            species][
                                                item.genotype_description.locus
                                                + 'V'][family][i + 1]:
                                        q_codons.append(
                                            "%s%d" % (query_aa, i + 1))
                                        # Ungapped residue index, converted to
                                        # a nucleotide offset below
                                        j = len(seq_aa_gapped[:i].replace(
                                            '.', ''))
                                        annots.append((
                                            3 * j, 3,
                                            '%s%d previously unreported in this family'
                                            % (query_aa, i + 1)))

                            if len(q_codons) > 0:
                                bt_codon_usage = '<button type="button" class="btn btn-xs text-info icon_back" data-toggle="tooltip" title="Amino Acid(s) previously unreported in this family: %s"><span class="glyphicon glyphicon-info-sign"></span>&nbsp;</button>' % ", ".join(
                                    q_codons)

                        except Exception:
                            # Best effort: report the failure in the cell
                            # rather than failing the whole table
                            bt_codon_usage = '<button type="button" class="btn btn-xs text-info icon_back" data-toggle="tooltip" title="Error translating sequence: %s"><span class="glyphicon glyphicon-info-sign"></span>&nbsp;</button>' % sys.exc_info(
                            )[0]

                    # Check for lengthened strings of the same base

                    seq_qpos = [
                        m.start()
                        for m in re.finditer('(.)\\1+\\1+\\1+', str(seq_nt))
                    ]
                    q_runs = []

                    # walk up each identified repeat of 4nt or more, flag any differences
                    for p in seq_qpos:
                        rep_c = seq_nt[p]
                        i = p
                        while i < len(seq_nt) and i < len(
                                ref_nt) and seq_nt[i] == rep_c:
                            if ref_nt[i] != rep_c:
                                q_runs.append("%d" % find_gapped_index(
                                    i, species, item.closest_reference))
                                annots.append(
                                    (i, 1, 'Possible repeated read error'))
                                break
                            i += 1

                    if len(q_runs) > 0:
                        bt_runs = '<button type="button" class="btn btn-xs text-info icon_back" data-toggle="tooltip" title="Possible repeated read errors at IMGT position(s) %s"><span class="glyphicon glyphicon-info-sign"></span>&nbsp;</button>' % ", ".join(
                            q_runs)

                    # Check for RGYW/WRCY hotspot change

                    ref_qpos = [
                        m.start()
                        for m in re.finditer('[AG][G][CT][AT]', str(ref_nt))
                    ]

                    q_hotspots = []

                    for p in ref_qpos:
                        # The submitted sequence may be shorter than the
                        # reference, so check the bound before indexing
                        if p + 1 < len(seq_nt) and seq_nt[p + 1] == 'C':
                            q_hotspots.append("%d" % find_gapped_index(
                                p + 1, species, item.closest_reference))
                            annots.append(
                                (p + 1, 1, 'G/C SNP in RGYW hotspot'))

                    ref_qpos = [
                        m.start()
                        for m in re.finditer('[AT][AG][C][CT]', str(ref_nt))
                    ]

                    for p in ref_qpos:
                        # Same bound check as for the RGYW scan above
                        if p + 2 < len(seq_nt) and seq_nt[p + 2] == 'G':
                            q_hotspots.append("%d" % find_gapped_index(
                                p + 2, species, item.closest_reference))
                            annots.append(
                                (p + 2, 1, 'C/G SNP in WRCY hotspot'))

                    if len(q_hotspots) > 0:
                        bt_hotspots = '<button type="button" class="btn btn-xs text-info icon_back" data-toggle="tooltip" title="G/C SNP in RGYW/WRCY hotspot at IMGT position(s) %s"><span class="glyphicon glyphicon-info-sign"></span>&nbsp;</button>' % ", ".join(
                            q_hotspots)

        bt_view = popup_seq_button(item.sequence_id,
                                   item.nt_sequence,
                                   item.nt_sequence_gapped,
                                   annots=annots)

        return bt_view + bt_check + bt_imgt + bt_igpdb + bt_vdjbase + bt_indels + bt_codon_usage + bt_runs + bt_hotspots + bt_ref_found