コード例 #1
0
def run(format, species, genomic_datasets, genomic_samples, rep_datasets,
        rep_samples, params):
    """Produce the personal genotype report for one repertoire sample.

    Args:
        format: 'pdf' or 'html'.
        species: key into ``vdjbase_dbs``.
        genomic_datasets, genomic_samples, rep_datasets: not used here.
        rep_samples: list holding exactly one dict; its 'dataset',
            'sample_name' and 'pcr_target_locus' entries are read.
        params: report parameters; only the optional 'sort_order' entry
            is consulted.

    Returns:
        Whatever ``send_report`` returns for the generated report.

    Raises:
        BadRequest: if the selection is not a single sample, the format is
            unsupported, or no genotype file can be located for the sample.
    """
    if len(rep_samples) != 1:
        raise BadRequest(
            'This report processes a single repertoire-derived genotype')

    if format not in ['pdf', 'html']:
        raise BadRequest('Invalid format requested')

    rep_sample = rep_samples[0]
    html = (format == 'html')

    session = vdjbase_dbs[species][rep_sample['dataset']].session
    primer_trans, gene_subs = find_primer_translations(session)

    found = session.query(Sample.genotype).filter(
        Sample.sample_name == rep_sample['sample_name']).one_or_none()

    # one_or_none() yields None when the sample is unknown; without this
    # guard the subscript below would fail with a TypeError instead of a
    # clean client error.
    if found is None or not found[0]:
        raise BadRequest('Genotype file for sample %s/%s is missing' %
                         (rep_sample['dataset'], rep_sample['sample_name']))

    relative_path = found[0].replace('samples/', '')
    sample_path = os.path.join(VDJBASE_SAMPLE_PATH, species,
                               rep_sample['dataset'], relative_path)

    if not os.path.isfile(sample_path):
        raise BadRequest('Genotype file for sample %s/%s is missing' %
                         (rep_sample['dataset'], rep_sample['sample_name']))

    sample_path = check_tab_file(sample_path)

    # translate pipeline allele names to VDJbase allele names
    genotype = pd.read_csv(sample_path, sep='\t', dtype=str)

    for col in ['alleles', 'GENOTYPED_ALLELES']:
        genotype[col] = [
            translate_primer_alleles(x, y, primer_trans)
            for x, y in zip(genotype['gene'], genotype[col])
        ]

    genotype['gene'] = [
        translate_primer_genes(x, gene_subs) for x in genotype['gene']
    ]

    # The translated table is written to a fresh file, which is what the
    # report generator reads.
    sample_path = make_output_file('tsv')
    genotype.to_csv(sample_path, sep='\t', index=False)

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_order_file(species,
                                     rep_sample['dataset'],
                                     locus_order=locus_order)

    report_path = personal_genotype(rep_sample['sample_name'], sample_path,
                                    rep_sample['pcr_target_locus'],
                                    gene_order_file, html)

    if format == 'pdf':
        attachment_filename = '%s_%s_%s_genotype.pdf' % (
            species, rep_sample['dataset'], rep_sample['sample_name'])
    else:
        attachment_filename = None

    return send_report(report_path, format, attachment_filename)
コード例 #2
0
def run(format, species, genomic_datasets, genomic_samples, rep_datasets,
        rep_samples, params):
    """Produce the gene frequency plot for the selected repertoire samples.

    Args:
        format: 'pdf' or 'html'.
        species: key into ``vdjbase_dbs``.
        genomic_datasets, genomic_samples, rep_datasets: not used here.
        rep_samples: selected repertoire samples, grouped by
            ``collate_samples``.
        params: report parameters; 'single_sample' and 'calculate_by' are
            required, 'sort_order' is optional, and the whole dict is also
            handed to ``apply_rep_filter_params``.

    Returns:
        Whatever ``send_report`` returns for the rendered plot.

    Raises:
        BadRequest: if no samples were selected, the format is unsupported,
            or the R script produced no output.
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format not in ('pdf', 'html'):
        raise BadRequest('Invalid format requested')

    single_sample_filter = 1 if params[
        'single_sample'] == 'One Selected Sample' else 0
    calc_by_clone = 1 if params['calculate_by'] == 'Number of Clones' else 0
    chain, samples_by_dataset = collate_samples(rep_samples)

    # gene name -> list of per-sample usage frequencies (rounded to 2 dp)
    genes_frequencies = defaultdict(list)

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session

        for sample_chunk in chunk_list(samples_by_dataset[dataset],
                                       SAMPLE_CHUNKS):
            sample_list = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id)\
                .filter(Sample.sample_name.in_(sample_chunk))\
                .filter(Sample.sample_group >= single_sample_filter)\
                .all()
            sample_list, wanted_genes = apply_rep_filter_params(
                params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            frequencies = session.query(GenesDistribution.sample_id, Gene.name, GenesDistribution.frequency)\
                .join(Gene)\
                .join(Sample)\
                .filter(GenesDistribution.count_by_clones == calc_by_clone)\
                .filter(Gene.name.in_(wanted_genes)) \
                .filter(Sample.sample_name.in_(sample_list)) \
                .all()

            for _, gene_name, frequency in frequencies:
                genes_frequencies[gene_name].append(
                    round(float(frequency), 2))

    # Build the frame in one step: DataFrame.append was removed in pandas
    # 2.0 and copied the whole frame on every call.
    records = [{
        'GENE': gene,
        'FREQ': ",".join(str(x) for x in usages)
    } for gene, usages in genes_frequencies.items()]
    genes_frequencies_df = pd.DataFrame(records, columns=['GENE', 'FREQ'])

    input_path = make_output_file('tab')
    genes_frequencies_df.to_csv(input_path, sep='\t', index=False)

    output_path = make_output_file(format)
    attachment_filename = '%s_gene_frequency.pdf' % species

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_multiple_order_file(species,
                                              samples_by_dataset.keys(),
                                              locus_order=locus_order)

    cmd_line = [
        "-i", input_path, "-o", output_path, "-t",
        'T' if format == 'html' else 'F', "-c", chain, "-g", gene_order_file
    ]

    # Treat a missing or empty output file as a failed run.
    if run_rscript(GENE_FREQUENCY_PLOT, cmd_line) and os.path.isfile(
            output_path) and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, attachment_filename)
    else:
        raise BadRequest('No output from report')
コード例 #3
0
def run(format, species, genomic_datasets, genomic_subjects, rep_datasets,
        rep_samples, params):
    """Export genomic data in the form selected by ``params['type']``.

    Depending on which marker text appears in ``params['type']`` this sends:
    'Sample info' -> a CSV of subject attributes; 'Sample files' -> a zip of
    the subjects' annotation directories; 'Ungapped' / 'Gapped' -> a FASTA
    of sequences; 'Gene info' -> a CSV of sequence attributes.

    Args:
        format, rep_datasets, rep_samples: not used here.
        species: species identifier passed to the lookup helpers.
        genomic_datasets: datasets passed to the lookup helpers.
        genomic_subjects: selected subjects; only checked for emptiness.
        params: must provide 'type' and 'filters'.

    Returns:
        Whatever ``send_report`` returns for the generated file.

    Raises:
        BadRequest: if nothing is selected or no export type matches.
    """
    if len(genomic_subjects) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if 'Sample info' in params['type']:
        headers = genomic_subject_filters.keys()

        # Only filters backed by a model contribute a queryable field.
        attribute_query = [
            spec['field'] for spec in genomic_subject_filters.values()
            if spec['model'] is not None
        ]

        rows = find_genomic_subjects(attribute_query, species,
                                     genomic_datasets, params['filters'])

        outfile = make_output_file('csv')
        with open(outfile, 'w', newline='') as handle:
            writer = csv.DictWriter(handle,
                                    dialect='excel',
                                    fieldnames=headers)
            writer.writeheader()
            writer.writerows(rows)

        return send_report(outfile,
                           'csv',
                           attachment_filename='sample_info.csv')

    if 'Sample files' in params['type']:
        outfile = make_output_file('zip')
        with zipfile.ZipFile(outfile, 'w', zipfile.ZIP_DEFLATED) as archive:
            subjects = find_genomic_subjects([Subject.annotation_path],
                                             species, genomic_datasets,
                                             params['filters'])

            # Re-root each annotation path (the part after 'Genomic') under
            # the static study_data tree.
            absolute_paths = [
                os.path.join(
                    app.config['STATIC_PATH'], '/'.join([
                        'study_data/Genomic',
                        subject['annotation_path'].split('Genomic')[1]
                    ])) for subject in subjects
            ]

            # Each directory goes into the archive once, however many
            # subjects point into it.
            zipped_dirs = set()
            for path in absolute_paths:
                directory = os.path.dirname(path)
                if directory in zipped_dirs:
                    continue
                zipdir(directory, archive,
                       app.config['STATIC_PATH'])  # sample files
                zipped_dirs.add(directory)

        return send_report(outfile,
                           'zip',
                           attachment_filename='sample_data.zip')

    if 'Ungapped' in params['type'] or 'Gapped' in params['type']:
        if 'Ungapped' in params['type']:
            seq_name = 'sequence'
        else:
            seq_name = 'gapped_sequence'

        required_cols = ['name', seq_name, 'dataset']
        seqs = find_genomic_sequences(required_cols, genomic_datasets, species,
                                      params['filters'])

        # Entries with an empty sequence are left out of the FASTA.
        recs = [
            SeqRecord(Seq(entry[seq_name]),
                      id='%s|%s|%s' %
                      (entry['name'], species, entry['dataset']),
                      description='') for entry in seqs
            if len(entry[seq_name]) > 0
        ]

        outfile = make_output_file('fasta')
        SeqIO.write(recs, outfile, "fasta")
        return send_report(outfile,
                           'fasta',
                           attachment_filename='%s_sequences.fasta' % species)

    if 'Gene info' in params['type']:
        headers = genomic_sequence_filters.keys()
        rows = find_genomic_sequences(headers, genomic_datasets, species,
                                      params['filters'])

        outfile = make_output_file('csv')
        with open(outfile, 'w', newline='') as handle:
            writer = csv.DictWriter(handle,
                                    dialect='excel',
                                    fieldnames=headers)
            writer.writeheader()
            writer.writerows(rows)

        return send_report(outfile,
                           'csv',
                           attachment_filename='sequence_info.csv')

    raise BadRequest('No output from report')
コード例 #4
0
def run(format, species, genomic_datasets, genomic_samples, rep_datasets,
        rep_samples, params):
    """Produce the allele appearance report (xls workbook or pdf plot).

    For every gene, counts the distinct subjects in which each allele
    appears, across the selected repertoire samples and genomic subjects.
    Genes with more than one allele get their own entry; genes with a single
    allele are pooled into one 'Single allele genes' entry.

    Args:
        format: 'pdf' or 'xls'.
        species: key into ``vdjbase_dbs`` and ``genomic_dbs``.
        genomic_datasets, rep_datasets: not used here.
        genomic_samples: dicts providing 'dataset' and 'identifier'.
        rep_samples: dicts providing 'dataset' and 'sample_name'.
        params: report parameters; 'novel_alleles' and 'sort_order' are read
            directly, 'ambiguous_alleles' for repertoire data, and the whole
            dict is handed to ``apply_rep_filter_params``.

    Returns:
        Whatever ``send_report`` returns for the workbook or the pdf.

    Raises:
        BadRequest: if the format is unsupported or the R script produced no
            output.
    """
    if format != 'pdf' and format != 'xls':
        raise BadRequest('Invalid format requested')

    # gene -> [{allele: set of subjects}, set of subjects, sort key]
    counts = {}

    def record_appearance(subject, gene, allele_name, locus_order,
                          alpha_order):
        """Register that `subject` carries `allele_name` of `gene`."""
        allele = allele_name.split('*', 1)[1].upper()
        if gene not in counts:
            if params['sort_order'] == 'Alphabetic':
                order = alpha_order
            else:
                order = locus_order
            counts[gene] = [{}, set(), order]
        # Sets make the distinct-subject check O(1); only sizes are reported.
        counts[gene][0].setdefault(allele, set()).add(subject)
        counts[gene][1].add(subject)

    rep_samples_by_dataset = {}
    for rep_sample in rep_samples:
        rep_samples_by_dataset.setdefault(rep_sample['dataset'], []).append(
            rep_sample['sample_name'])

    for dataset, sample_names in rep_samples_by_dataset.items():
        session = vdjbase_dbs[species][dataset].session

        for sample_chunk in chunk_list(sample_names, SAMPLE_CHUNKS):
            sample_list = session.query(
                Sample.sample_name, Sample.genotype, Sample.patient_id).filter(
                    Sample.sample_name.in_(sample_chunk)).all()
            sample_list, wanted_genes = apply_rep_filter_params(
                params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            app_query = session.query(AllelesSample.patient_id, Patient.patient_name, Gene.name, Allele.name, Sample.sample_name, Gene.locus_order, Gene.alpha_order)\
                                .filter(Sample.id == AllelesSample.sample_id)\
                                .filter(Allele.id == AllelesSample.allele_id)\
                                .filter(Gene.id == Allele.gene_id)\
                                .filter(Patient.id == AllelesSample.patient_id)\
                                .filter(Sample.sample_name.in_(sample_list))\
                                .filter(Gene.name.in_(wanted_genes))

            if params['novel_alleles'] == 'Exclude':
                app_query = app_query.filter(Allele.novel == 0)

            if params['ambiguous_alleles'] == 'Exclude':
                app_query = app_query.filter(Allele.is_single_allele == 1)

            for (_, patient_name, gene, allele, _, locus_order,
                 alpha_order) in app_query.all():
                record_appearance(patient_name, gene, allele, locus_order,
                                  alpha_order)

    gen_samples_by_dataset = {}
    for gen_sample in genomic_samples:
        gen_samples_by_dataset.setdefault(gen_sample['dataset'], []).append(
            gen_sample['identifier'])

    for dataset, identifiers in gen_samples_by_dataset.items():
        session = genomic_dbs[species][dataset].session

        for sample_chunk in chunk_list(identifiers, SAMPLE_CHUNKS):
            sample_list = session.query(GenomicSubject.identifier).filter(
                GenomicSubject.identifier.in_(sample_chunk)).all()
            sample_list, wanted_genes = apply_rep_filter_params(
                params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            app_query = session.query(GenomicSubject.identifier,
                                      GenomicGene.name,
                                      GenomicSequence.name,
                                      GenomicGene.locus_order,
                                      GenomicGene.alpha_order)\
                .filter(GenomicSubject.id == GenomicSubjectSequence.subject_id) \
                .filter(GenomicSequence.id == GenomicSubjectSequence.sequence_id) \
                .filter(GenomicGene.id == GenomicSequence.gene_id) \
                .filter(GenomicSequence.type.in_(['V-REGION', 'D-REGION', 'J-REGION'])) \
                .filter(GenomicSubject.identifier.in_(sample_list))\
                .filter(GenomicGene.name.in_(wanted_genes))

            if params['novel_alleles'] == 'Exclude':
                app_query = app_query.filter(GenomicSequence.novel == 0)

            for (identifier, gene, allele, locus_order,
                 alpha_order) in app_query.all():
                record_appearance(identifier, gene, allele, locus_order,
                                  alpha_order)

    # Each row is [gene, [allele names], [allele appearances],
    # gene appearances, sort key].
    single_alleles = []
    multi_alleles = []

    for gene, (alleles, total, order) in counts.items():
        allele_names = sorted(alleles)
        row = [
            gene,
            allele_names,
            [len(alleles[a]) for a in allele_names],
            len(total), order
        ]
        if len(alleles) > 1:
            multi_alleles.append(row)
        else:
            single_alleles.append(row)

    multi_alleles.sort(key=lambda row: row[4])
    multi_alleles = [m[:4] for m in multi_alleles]  # drop the sort key
    single_alleles.sort(key=lambda row: row[4])

    # Pool all single-allele genes into one pseudo-gene entry, labelling each
    # bar with "gene\nallele".
    pooled = ['Single allele genes', [], []]
    for gene, allele_names, allele_counts, _, _ in single_alleles:
        pooled[1].append(gene + '\n' + allele_names[0])
        pooled[2].append(allele_counts[0])

    multi_alleles.append(pooled)

    input_path = make_output_file('xls')
    book = xlwt.Workbook()

    for row in multi_alleles:
        if len(row[1]) > 0:
            write_gene(book, row)

    book.save(input_path)

    if format == 'xls':
        return send_report(input_path, format,
                           '%s_allele_appearance.xls' % species)

    # The pdf path is only needed once we know a pdf was requested.
    output_path = make_output_file('pdf')
    cmd_line = ["-i", input_path, "-o", output_path]

    if run_rscript(APPEARANCE_SCRIPT, cmd_line) and os.path.isfile(
            output_path) and os.path.getsize(output_path) != 0:
        return send_report(output_path, format,
                           '%s_allele_appearance.pdf' % species)
    else:
        raise BadRequest('No output from report')
コード例 #5
0
def run(format, species, genomic_datasets, genomic_samples, rep_datasets,
        rep_samples, params):
    """Produce the allele usage table as a CSV.

    One row per allele of every known gene, giving: whether it is a
    non-novel reference allele of a wanted gene ('IMGT', 0 or 1), the number
    of distinct subjects carrying it in the repertoire data ('AIRR-Seq') and
    in the genomic data ('Genomic'), the highest-ranked sequencing platform
    and capture probe set seen for it in the genomic data, and its ungapped
    sequence.

    Args:
        format: must be 'xls' (the file that is sent is nevertheless a CSV).
        species: key into ``vdjbase_dbs`` and ``genomic_dbs``.
        genomic_datasets, rep_datasets: not used here.
        genomic_samples: dicts providing 'dataset' and 'identifier'.
        rep_samples: dicts providing 'dataset' and 'sample_name'.
        params: report parameters; 'sort_order' and 'novel_alleles' are read
            directly, 'ambiguous_alleles' for repertoire data, and the whole
            dict is handed to ``apply_rep_filter_params``.

    Returns:
        Whatever ``send_report`` returns for the generated CSV.

    Raises:
        BadRequest: if the requested format is not 'xls'.
    """
    if format != 'xls':
        raise BadRequest('Invalid format requested')

    imgt_refs = {}    # non-novel allele name -> ungapped sequence
    gene_order = {}   # gene name -> sort key
    sequences = {}    # upper-cased allele name -> lower-case ungapped seq
    all_wanted_genes = set()

    def note_reference(name, novel, seq):
        """Remember an allele's sequence; the first dataset seen wins."""
        ungapped = seq.replace('.', '')
        if novel == 0 and name not in imgt_refs:
            imgt_refs[name] = ungapped
        # Test the same (upper-cased) key that is stored, so an allele seen
        # earlier is not silently overwritten when its name is not already
        # upper case.
        key = name.upper()
        if key not in sequences:
            sequences[key] = ungapped.lower()

    def note_gene_order(gene):
        """Remember a gene's sort key according to params['sort_order']."""
        if gene.name not in gene_order:
            if params['sort_order'] == 'Alphabetic':
                gene_order[gene.name] = gene.alpha_order
            else:
                gene_order[gene.name] = gene.locus_order

    def allele_count(gene, allele, counts):
        """Number of entries recorded for gene/allele in `counts`."""
        if gene not in counts:
            return 0
        if allele not in counts[gene][0]:
            return 0
        return len(counts[gene][0][allele])

    def best_ranked(gene, allele, slot, ranking):
        """Last entry of `ranking` recorded for gene/allele in gen_counts."""
        if gene not in gen_counts:
            return ''
        if allele not in gen_counts[gene][slot]:
            return ''
        best = ''
        for candidate in ranking:
            if candidate in gen_counts[gene][slot][allele]:
                best = candidate
        return best

    rep_samples_by_dataset = {}
    for rep_sample in rep_samples:
        rep_samples_by_dataset.setdefault(rep_sample['dataset'], []).append(
            rep_sample['sample_name'])

    for dataset in rep_samples_by_dataset:
        session = vdjbase_dbs[species][dataset].session

        for ref in session.query(Allele).all():
            note_reference(ref.name, ref.novel, ref.seq)

        for gene in session.query(Gene).all():
            note_gene_order(gene)

    # gene -> [{allele: set of subjects}, set of subjects]
    rep_counts = {}

    for dataset, sample_names in rep_samples_by_dataset.items():
        session = vdjbase_dbs[species][dataset].session

        for sample_chunk in chunk_list(sample_names, SAMPLE_CHUNKS):
            sample_list = session.query(Sample.sample_name).filter(
                Sample.sample_name.in_(sample_chunk)).all()
            sample_list, wanted_genes = apply_rep_filter_params(
                params, sample_list, session)
            all_wanted_genes.update(wanted_genes)
            sample_list = [s[0] for s in sample_list]

            app_query = session.query(AllelesSample.patient_id, Gene.name, Allele.name, Sample.sample_name, Patient.patient_name)\
                                .filter(Sample.id == AllelesSample.sample_id)\
                                .filter(Allele.id == AllelesSample.allele_id)\
                                .filter(Gene.id == Allele.gene_id)\
                                .filter(Patient.id == AllelesSample.patient_id)\
                                .filter(Sample.sample_name.in_(sample_list))\
                                .filter(Gene.name.in_(wanted_genes))

            if params['novel_alleles'] == 'Exclude':
                app_query = app_query.filter(Allele.novel == 0)

            if params['ambiguous_alleles'] == 'Exclude':
                app_query = app_query.filter(Allele.is_single_allele == 1)

            for _, gene, allele, _, patient_name in app_query.all():
                allele = allele.split('*', 1)[1].upper()
                if gene not in rep_counts:
                    rep_counts[gene] = [{}, set()]
                rep_counts[gene][0].setdefault(allele,
                                               set()).add(patient_name)
                rep_counts[gene][1].add(patient_name)

    gen_samples_by_dataset = {}
    for gen_sample in genomic_samples:
        gen_samples_by_dataset.setdefault(gen_sample['dataset'], []).append(
            gen_sample['identifier'])

    for dataset in gen_samples_by_dataset:
        session = genomic_dbs[species][dataset].session

        for ref in session.query(GenomicSequence).all():
            note_reference(ref.name, ref.novel, ref.sequence)

        for gene in session.query(GenomicGene).all():
            note_gene_order(gene)

    # gene -> [{allele: subjects}, subjects, {allele: platforms},
    #          {allele: probes}]
    gen_counts = {}

    for dataset, identifiers in gen_samples_by_dataset.items():
        session = genomic_dbs[species][dataset].session

        for sample_chunk in chunk_list(identifiers, SAMPLE_CHUNKS):
            sample_list = session.query(GenomicSubject.identifier).filter(
                GenomicSubject.identifier.in_(sample_chunk)).all()
            sample_list, wanted_genes = apply_rep_filter_params(
                params, sample_list, session)
            all_wanted_genes.update(wanted_genes)
            sample_list = [s[0] for s in sample_list]

            app_query = session.query(GenomicSubject.id,
                                      GenomicSubject.identifier,
                                      GenomicSubject.sequencing_platform,
                                      GenomicSubject.capture_probes,
                                      GenomicSequence.name,
                                      GenomicGene.name) \
                .filter(GenomicSubject.id == GenomicSubjectSequence.subject_id) \
                .filter(GenomicSequence.id == GenomicSubjectSequence.sequence_id) \
                .filter(GenomicSequence.type.in_(['V-REGION', 'D-REGION', 'J-REGION'])) \
                .filter(GenomicGene.id == GenomicSequence.gene_id)\
                .filter(GenomicSubject.identifier.in_(sample_list))\
                .filter(GenomicGene.name.in_(wanted_genes))

            if params['novel_alleles'] == 'Exclude':
                app_query = app_query.filter(GenomicSequence.novel == 0)

            for (_, identifier, platform, probes, allele,
                 gene) in app_query.all():
                allele = allele.split('*', 1)[1].upper()
                if gene not in gen_counts:
                    gen_counts[gene] = [{}, set(), {}, {}]
                entry = gen_counts[gene]
                if allele not in entry[0]:
                    entry[0][allele] = set()
                    entry[2][allele] = set()
                    entry[3][allele] = set()
                entry[0][allele].add(identifier)
                entry[1].add(identifier)
                if platform:
                    entry[2][allele].add(platform)
                if probes:
                    entry[3][allele].add(probes)

    # Every non-novel reference allele of a wanted gene counts once, except
    # the 'DEL' pseudo-allele.
    imgt_counts = {}

    for ref in imgt_refs:
        gene, allele = ref.upper().split('*')
        if gene in all_wanted_genes:
            if gene not in imgt_counts:
                imgt_counts[gene] = [{}, [1]]
            if allele not in imgt_counts[gene][0] and allele != 'DEL':
                imgt_counts[gene][0][allele] = [1]

    genes_in_order = [
        name for name, _ in sorted(gene_order.items(), key=lambda x: x[1])
    ]

    results = []

    for gene in genes_in_order:
        # Assemble the set of alleles to list for this gene; an underscore in
        # the allele name marks it as novel.
        ref_alleles = set()
        novel_alleles = set()

        for counts in (imgt_counts, rep_counts, gen_counts):
            if gene in counts:
                for allele in counts[gene][0]:
                    if '_' in allele:
                        novel_alleles.add(allele)
                    else:
                        ref_alleles.add(allele)

        # Reference alleles first, then novel ones, each group sorted.
        for allele in sorted(ref_alleles) + sorted(novel_alleles):
            full_name = f'{gene}*{allele}'
            results.append({
                'Allele': full_name,
                'IMGT': allele_count(gene, allele, imgt_counts),
                'AIRR-Seq': allele_count(gene, allele, rep_counts),
                'Genomic': allele_count(gene, allele, gen_counts),
                'Best platform': best_ranked(gene, allele, 2,
                                             ['RS', 'SEQUEL', 'SEQUELII']),
                'Best probes': best_ranked(gene, allele, 3, ['V2', 'V3']),
                # An allele with no stored sequence yields a blank cell
                # rather than aborting the whole report.
                'Sequence': sequences.get(full_name.upper(), ''),
            })

    output_path = make_output_file('csv')
    write_csv(output_path, results)
    return send_report(output_path, 'csv', f'{species}_allele_usage.csv')
コード例 #6
0
def run(format, species, genomic_datasets, genomic_samples, rep_datasets,
        rep_samples, params):
    """Produce the multi-sample genotype heatmap report.

    Reads the genotype file of every selected sample, translates pipeline
    gene/allele names, keeps the wanted genes, stacks the tables into one
    file and hands it to the heatmap R script.

    Args:
        format: 'pdf' or 'html'.
        species: key into ``vdjbase_dbs``.
        genomic_datasets, genomic_samples, rep_datasets: not used here.
        rep_samples: selected repertoire samples, grouped by
            ``collate_samples``.
        params: report parameters; 'f_kdiff' is required, 'sort_order' is
            optional, and the whole dict is handed to
            ``apply_rep_filter_params``.

    Returns:
        Whatever ``send_report`` returns for the rendered heatmap.

    Raises:
        BadRequest: if no samples were selected, the format is unsupported,
            no genotype rows survive the filters, or the R script produced
            no output.
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format not in ['pdf', 'html']:
        raise BadRequest('Invalid format requested')

    html = (format == 'html')
    chain, samples_by_dataset = collate_samples(rep_samples)
    genotypes = pd.DataFrame()

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        primer_trans, gene_subs = find_primer_translations(session)

        sample_list = []
        for sample_chunk in chunk_list(samples_by_dataset[dataset],
                                       SAMPLE_CHUNKS):
            sample_list.extend(
                session.query(Sample.sample_name, Sample.genotype,
                              Sample.patient_id).filter(
                                  Sample.sample_name.in_(sample_chunk)).all())

        sample_list, wanted_genes = apply_rep_filter_params(
            params, sample_list, session)

        if len(wanted_genes) == 0:
            continue

        for (name, genotype_file, _patient_id) in sample_list:
            sample_path = os.path.join(VDJBASE_SAMPLE_PATH, species,
                                       dataset,
                                       genotype_file.replace('samples/', ''))

            # Samples whose genotype file is absent are skipped silently.
            if not os.path.isfile(sample_path):
                continue

            genotype = pd.read_csv(sample_path, sep='\t', dtype=str)

            genotype = trans_df(genotype)

            # translate pipeline allele names to VDJbase allele names
            for col in ['alleles', 'GENOTYPED_ALLELES']:
                genotype[col] = [
                    translate_primer_alleles(x, y, primer_trans)
                    for x, y in zip(genotype['gene'], genotype[col])
                ]

            genotype['gene'] = [
                translate_primer_genes(x, gene_subs)
                for x in genotype['gene']
            ]

            # Copy so the column assignment below acts on an independent
            # frame rather than a slice of the unfiltered one.
            genotype = genotype[genotype.gene.isin(wanted_genes)].copy()

            # Prefix with the dataset only when several datasets are mixed.
            subject_name = name if len(
                samples_by_dataset) == 1 else dataset + '_' + name

            if 'subject' not in genotype.columns.values:
                genotype.insert(0, 'subject', subject_name)
            else:
                genotype['subject'] = subject_name

            # DataFrame.append was removed in pandas 2.0; concat is the
            # supported equivalent. As before, the accumulated table is
            # narrowed to the columns of the frame just added.
            if genotypes.empty:
                genotypes = genotype
            else:
                genotypes = pd.concat([genotypes, genotype
                                       ])[genotype.columns.tolist()]

    if len(genotypes) == 0:
        raise BadRequest('No records matching the filter criteria were found.')

    geno_path = make_output_file('csv')
    genotypes.to_csv(geno_path, sep='\t')

    if format == 'pdf':
        attachment_filename = '%s_genotype.pdf' % species
    else:
        attachment_filename = None

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_multiple_order_file(species,
                                              samples_by_dataset.keys(),
                                              locus_order=locus_order)

    output_path = make_output_file('html' if html else 'pdf')
    file_type = 'T' if html else 'F'
    cmd_line = [
        "-i", geno_path, "-o", output_path, "-t", file_type, "-k",
        str(params['f_kdiff']), "-c", chain, "-g", gene_order_file
    ]

    # Treat a missing or empty output file as a failed run.
    if run_rscript(HEATMAP_GENOTYPE_SCRIPT, cmd_line) and os.path.isfile(
            output_path) and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, attachment_filename)
    else:
        raise BadRequest('No output from report')
コード例 #7
0
def run(format, species, genomic_datasets, genomic_samples, rep_datasets,
        rep_samples, params):
    """Produce the haplotype heatmap report (PDF) for the selected samples.

    For each dataset represented in ``rep_samples`` the haplotype files whose
    ``by_gene`` matches ``params['haplo_gene']`` are read, pipeline allele and
    gene names are translated to VDJbase names, rows are restricted to the
    wanted genes, and the combined table is passed to the heatmap R script.

    Args:
        format: Requested output format; only ``'pdf'`` is accepted.
        species: Key into ``vdjbase_dbs`` and name of the sample directory.
        genomic_datasets: Not used by this report.
        genomic_samples: Not used by this report.
        rep_datasets: Not used by this report.
        rep_samples: Selected samples, grouped via ``collate_samples``.
        params: Report parameters. ``haplo_gene`` is required; ``f_kdiff``
            and ``sort_order`` are optional. This function does not write
            the normalised ``f_kdiff`` back into ``params``.

    Returns:
        The value returned by ``send_report`` for the generated PDF.

    Raises:
        BadRequest: If no samples were selected, the format is not ``'pdf'``,
            a haplotype file is missing, no rows survive filtering, or the
            R script produced no output.
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    # Only PDF passes this check, so the output below is always a PDF file.
    if format != 'pdf':
        raise BadRequest('Invalid format requested')

    chain, samples_by_dataset = collate_samples(rep_samples)
    single_dataset = len(samples_by_dataset) == 1

    # Per-file frames are collected and concatenated once at the end rather
    # than growing one DataFrame inside the loop (quadratic copying).
    frames = []

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        primer_trans, gene_subs = find_primer_translations(session)

        haplos = []
        for sample_chunk in chunk_list(samples_by_dataset[dataset],
                                       SAMPLE_CHUNKS):
            sample_list = session.query(
                Sample.sample_name, Sample.genotype, Sample.patient_id).filter(
                    Sample.sample_name.in_(sample_chunk)).all()
            sample_list, wanted_genes = apply_rep_filter_params(
                params, sample_list, session)
            sample_list = [s[0] for s in sample_list]
            haplo_query = session.query(Sample.sample_name, HaplotypesFile.file)\
                .filter(Sample.sample_name.in_(sample_list))\
                .join(SamplesHaplotype, Sample.id == SamplesHaplotype.samples_id)\
                .filter(SamplesHaplotype.haplotypes_file_id == HaplotypesFile.id)\
                .filter(HaplotypesFile.by_gene == params['haplo_gene'])
            haplos.extend(haplo_query.all())

        for name, filename in haplos:
            sample_path = os.path.join(VDJBASE_SAMPLE_PATH, species, dataset,
                                       filename.replace('samples/', ''))

            if not os.path.isfile(sample_path):
                raise BadRequest('Haplotype file %s is missing.' %
                                 (sample_path))

            haplotype = pd.read_csv(sample_path, sep='\t', dtype=str)
            haplotype = trans_df(haplotype)

            # Prefix the subject with the dataset only when several datasets
            # are combined in one report.
            haplotype['subject'] = name if single_dataset \
                else dataset + '_' + name

            # translate pipeline allele names to VDJbase allele names
            # (the allele columns are addressed by position: 2, 3 and 4)
            col_names = list(haplotype.columns.values)
            for i in (2, 3, 4):
                haplotype[col_names[i]] = [
                    translate_primer_alleles(x, y, primer_trans)
                    for x, y in zip(haplotype['gene'], haplotype[col_names[i]])
                ]

            haplotype['gene'] = [
                translate_primer_genes(x, gene_subs) for x in haplotype['gene']
            ]
            frames.append(haplotype[haplotype.gene.isin(wanted_genes)])

    if frames:
        # As before, the output columns are those of the last file read.
        haplotypes = pd.concat(
            frames, ignore_index=True)[frames[-1].columns.tolist()]
    else:
        haplotypes = pd.DataFrame()

    if len(haplotypes) == 0:
        raise BadRequest('No records matching the filter criteria were found.')

    haplo_path = make_output_file('tsv')
    haplotypes.to_csv(haplo_path, sep='\t', index=False)
    attachment_filename = '%s_haplotype_heatmap.pdf' % species

    # A missing, empty or otherwise falsy threshold is treated as 0. A local
    # is used so the caller's params dict is left untouched.
    kdiff = params.get('f_kdiff') or 0

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_multiple_order_file(species,
                                              samples_by_dataset.keys(),
                                              locus_order=locus_order)
    output_path = make_output_file('pdf')
    cmd_line = [
        "-i", haplo_path, "-o", output_path, "-k",
        str(kdiff), "-c", chain, "-g", gene_order_file
    ]

    if run_rscript(HEATMAP_HAPLOTYPE_SCRIPT, cmd_line) and os.path.isfile(
            output_path) and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, attachment_filename)
    else:
        raise BadRequest('No output from report')
コード例 #8
0
def run(format, species, genomic_datasets, genomic_samples, rep_datasets,
        rep_samples, params):
    """Produce the allele usage report (HTML): distinct alleles per gene.

    For each dataset the alleles observed in the selected samples are
    queried (names containing ``Del`` or ``OR`` are excluded, as are alleles
    below the ``f_kdiff`` threshold), grouped by gene, and the number of
    distinct allele ids per gene is written as a GENE/COUNT table that is
    passed to the allele usage R script.

    Args:
        format: Requested output format; only ``'html'`` is accepted.
        species: Key into ``vdjbase_dbs``.
        genomic_datasets: Not used by this report.
        genomic_samples: Not used by this report.
        rep_datasets: Not used by this report.
        rep_samples: Selected samples, grouped via ``collate_samples``.
        params: Report parameters; ``novel_alleles`` and
            ``ambiguous_alleles`` are required, ``f_kdiff`` and
            ``sort_order`` are optional.

    Returns:
        The value returned by ``send_report`` for the generated HTML file.

    Raises:
        BadRequest: If no samples were selected, the format is not
            ``'html'``, ambiguous alleles are requested across several
            datasets, or the R script produced no output.
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format != 'html':
        raise BadRequest('Invalid format requested')

    if 'f_kdiff' in params and params['f_kdiff'] != '':
        kdiff = float(params['f_kdiff'])
    else:
        kdiff = 0

    chain, samples_by_dataset = collate_samples(rep_samples)

    if params['ambiguous_alleles'] != 'Exclude' \
            and len(samples_by_dataset) > 1:
        raise BadRequest(
            'Ambiguous alleles cannot be processed across multiple datasets')

    # Maps gene name -> set of distinct allele ids seen for that gene.
    gene_allele_counts = {}

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        allele_recs = []

        for sample_chunk in chunk_list(samples_by_dataset[dataset],
                                       SAMPLE_CHUNKS):
            found = session.query(
                Sample.sample_name, Sample.genotype, Sample.patient_id).filter(
                    Sample.sample_name.in_(sample_chunk)).all()
            found, wanted_genes = apply_rep_filter_params(
                params, found, session)
            sample_names = [row[0] for row in found]

            query = (
                session.query(Gene.name, Allele.id, Gene.type)
                .join(Allele)
                .join(AllelesSample)
                .join(Sample)
                .join(Patient, Patient.id == Sample.patient_id)
                .filter(Gene.name.in_(wanted_genes))
                .filter(Allele.name.notlike('%Del%'))
                .filter(Allele.name.notlike('%OR%'))
                .filter(Sample.sample_name.in_(sample_names))
                .filter(AllelesSample.kdiff >= kdiff)
            )

            by_locus = 'sort_order' in params \
                and params['sort_order'] == 'Locus'
            gene_order = Gene.locus_order if by_locus else Gene.alpha_order
            query = query.order_by(gene_order, Patient.id, Allele.id)

            if params['novel_alleles'] == 'Exclude':
                query = query.filter(Allele.novel == 0)

            if params['ambiguous_alleles'] == 'Exclude':
                query = query.filter(Allele.is_single_allele == 1)

            allele_recs.extend(query.all())

        # Split the records into runs of consecutive rows that share a gene
        # name; each run carries the set of allele ids seen in it.
        gene_runs = []
        for gene_name, allele_id, _gene_type in allele_recs:
            if gene_runs and gene_runs[-1][0] == gene_name:
                gene_runs[-1][1].add(allele_id)
            else:
                gene_runs.append((gene_name, {allele_id}))

        for gene_name, run_ids in gene_runs:
            # If we have both an unambiguous allele and an ambiguous allele
            # containing that unambiguous one, drop the duplicate so it is
            # only counted once: the ids removed are the pattern_id values of
            # AllelesPattern rows whose two ids are both in this run.
            if params['ambiguous_alleles'] != 'Exclude':
                pattern_rows = session.query(AllelesPattern.pattern_id)\
                    .filter(AllelesPattern.allele_in_p_id.in_(run_ids))\
                    .filter(AllelesPattern.pattern_id.in_(run_ids))\
                    .all()

                if pattern_rows:
                    run_ids = run_ids - {row[0] for row in pattern_rows}

            if gene_name in gene_allele_counts:
                gene_allele_counts[gene_name] |= run_ids
            else:
                gene_allele_counts[gene_name] = run_ids

    listed_allele_count = [
        (gene, len(alleles)) for gene, alleles in gene_allele_counts.items()
    ]

    input_path = make_output_file('tab')
    usage = pd.DataFrame(listed_allele_count, columns=['GENE', 'COUNT'])
    usage.to_csv(input_path, sep='\t', index=False)
    output_path = make_output_file('html')

    cmd_line = ["-i", input_path, "-o", output_path, "-c", chain]

    produced = run_rscript(ALLELE_USAGE_SCRIPT, cmd_line) \
        and os.path.isfile(output_path) \
        and os.path.getsize(output_path) != 0
    if not produced:
        raise BadRequest('No output from report')

    return send_report(output_path, format)
コード例 #9
0
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Write an HTML comparison of the alleles held in two AIRR-seq datasets.

    The report lists alleles present in only one of the two datasets
    (annotated with the name recorded for them in the other dataset's
    ``similar`` field, when there is one), followed by tables of alleles
    common to both whose ``appears``, ``max_kdiff``, ``low_confidence`` or
    number of ``AlleleConfidenceReport`` rows differ.

    Args:
        format: Requested output format; only ``'html'`` is accepted.
        species: Key into ``vdjbase_dbs``.
        genomic_datasets: Not used by this report.
        genomic_samples: Not used by this report.
        rep_datasets: Not used by this report.
        rep_samples: Selected samples; their ``dataset`` values must name
            exactly two distinct datasets.
        params: Not used by this report.

    Returns:
        The value returned by ``send_report`` for the generated HTML file.

    Raises:
        BadRequest: If the samples do not span exactly two datasets or the
            format is not ``'html'``.
    """
    # Distinct dataset names, in order of first appearance.
    datasets = list(dict.fromkeys(sample['dataset'] for sample in rep_samples))

    if len(datasets) != 2:
        raise BadRequest('Please select exactly two AIRR-seq datasets to compare.')

    if format != 'html':
        raise BadRequest('Invalid format requested')

    output_path = make_output_file('html')

    with open(output_path, 'w') as fo:
        sessions = []
        names = []      # per dataset: set of allele names
        similars = []   # per dataset: name from a 'similar' field -> owning allele name
        lookups = []    # per dataset: allele name -> Allele record

        for dataset in datasets:
            db_session = vdjbase_dbs[species][dataset].session
            records = db_session.query(Allele).all()

            similar_to = {}
            by_name = {}
            for record in records:
                if record.similar is not None:
                    for alias in record.similar.split(', '):
                        similar_to[alias.replace('|', '')] = record.name
                by_name[record.name] = record

            sessions.append(db_session)
            names.append({record.name for record in records})
            similars.append(similar_to)
            lookups.append(by_name)

        shared = list(names[0] & names[1])

        def write_exclusive(mine, other):
            # Alleles found only in datasets[mine], one per line.
            fo.write('<h2>Alleles only in %s</h2>' % datasets[mine])
            entries = []
            for allele_name in list(names[mine] - names[other]):
                if allele_name in similars[other]:
                    entries.append('%s (%s in %s)' % (allele_name, similars[other][allele_name], datasets[other]))
                else:
                    entries.append(allele_name)
            fo.write('<br>'.join(entries))

        def write_table(heading, rows):
            # Rows are consumed lazily, so output is written in the same
            # sequence as the values are computed.
            fo.write(heading)
            fo.write('<table><tr><th>Allele</th><th>%s</th><th>%s</th></tr>' % (datasets[0], datasets[1]))
            for row in rows:
                fo.write(row)
            fo.write('</table>')

        def appearance_rows():
            for allele_name in shared:
                first, second = lookups[0][allele_name], lookups[1][allele_name]
                if first.appears != second.appears:
                    yield '<tr><th>%s</th><th>%d</th><th>%d</th></tr>' % (allele_name, first.appears, second.appears)

        def kdiff_rows():
            for allele_name in shared:
                first, second = lookups[0][allele_name], lookups[1][allele_name]
                if abs(first.max_kdiff - second.max_kdiff) > 0.1:
                    yield '<tr><th>%s</th><th>%.2f</th><th>%.2f</th></tr>' % (allele_name, first.max_kdiff, second.max_kdiff)

        def confidence_rows():
            for allele_name in shared:
                first, second = lookups[0][allele_name], lookups[1][allele_name]
                if first.low_confidence != second.low_confidence:
                    yield '<tr><th>%s</th><th>%s</th><th>%s</th></tr>' % (allele_name, 'low' if first.low_confidence else 'high', 'low' if second.low_confidence else 'high')

        def note_rows():
            for allele_name in shared:
                counts = []
                for side in (0, 1):
                    counts.append(sessions[side].query(AlleleConfidenceReport).filter(AlleleConfidenceReport.allele_id == lookups[side][allele_name].id).count())
                if counts[0] != counts[1]:
                    yield '<tr><th>%s</th><th>%d</th><th>%d</th></tr>' % (allele_name, counts[0], counts[1])

        fo.write('<h2>Comparison of %s and %s</h2>' % (datasets[0], datasets[1]))

        write_exclusive(0, 1)
        write_exclusive(1, 0)

        write_table('<h2>Changed appearance counts</h2>', appearance_rows())
        write_table('<h2>Changed max_kdiffs</h2>', kdiff_rows())
        write_table('<h2>Changed confidence levels</h2>', confidence_rows())
        write_table('<h2>Changed number of notes</h2>', note_rows())

    return send_report(output_path, 'html')
コード例 #10
0
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Build a download for the selected repertoire samples.

    The kind of download is chosen by ``params['type']``:

    * ``'Sample info'``  - CSV of the sample attributes listed in
      ``sample_info_filters``.
    * ``'Sample files'`` - ZIP of each sample's directory plus its IgSNPer
      plot file, where present.
    * ``'Ungapped'`` / ``'Gapped'`` - FASTA of the sequences returned by
      ``find_sequences``; dots are stripped unless ``'Gapped'`` is selected.
    * ``'Gene info'``    - CSV of the attributes listed in
      ``sequence_filters`` plus the dataset name.

    Args:
        format: Not examined; the output format follows from the type.
        species: Key into ``vdjbase_dbs`` and name of the sample directory.
        genomic_datasets: Not used by this report.
        genomic_samples: Not used by this report.
        rep_datasets: Not used by this report.
        rep_samples: Selected samples (dicts with ``dataset`` and
            ``sample_name``).
        params: Report parameters; ``type`` is required.

    Returns:
        The value returned by ``send_report`` for the generated file.

    Raises:
        BadRequest: If no samples were selected or ``params['type']`` matches
            none of the supported downloads.
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    report_type = params['type']

    def group_samples_by_dataset():
        # dataset name -> list of its selected sample names
        grouped = {}
        for rep_sample in rep_samples:
            grouped.setdefault(rep_sample['dataset'], []).append(rep_sample['sample_name'])
        return grouped

    if 'Sample info' in report_type:
        samples_by_dataset = group_samples_by_dataset()

        attribute_query = []
        headers = []

        # Only filters backed by a model contribute a column.
        for name, info_filter in sample_info_filters.items():
            if info_filter['model'] is not None:
                attribute_query.append(info_filter['field'])
                headers.append(name)

        rows = []
        for dataset, sample_names in samples_by_dataset.items():
            session = vdjbase_dbs[species][dataset].session

            for sample_chunk in chunk_list(sample_names, SAMPLE_CHUNKS):
                sample_list = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id).filter(Sample.sample_name.in_(sample_chunk)).all()
                sample_list = [s[0] for s in sample_list]

                results = session.query(*attribute_query)\
                    .join(GenoDetection, GenoDetection.id == Sample.geno_detection_id)\
                    .join(Patient, Patient.id == Sample.patient_id)\
                    .join(SeqProtocol)\
                    .join(TissuePro)\
                    .join(Study, Sample.study_id == Study.id)\
                    .filter(Sample.sample_name.in_(sample_list)).all()

                rows.extend(results)

        outfile = make_output_file('csv')
        with open(outfile, 'w', newline='') as fo:
            writer = csv.writer(fo, dialect='excel')
            writer.writerow(headers)
            writer.writerows(rows)

        return send_report(outfile, 'csv', attachment_filename='sample_info.csv')

    elif 'Sample files' in report_type:
        samples_by_dataset = group_samples_by_dataset()
        species_root = os.path.join(VDJBASE_SAMPLE_PATH, species)

        # Several samples can share a directory or plot file; sets give
        # constant-time "already added" checks.
        added_files = set()
        added_dirs = set()

        outfile = make_output_file('zip')
        with zipfile.ZipFile(outfile, 'w', zipfile.ZIP_DEFLATED) as fo:
            for dataset, sample_names in samples_by_dataset.items():
                print('adding dataset')
                session = vdjbase_dbs[species][dataset].session
                for sample_chunk in chunk_list(sample_names, SAMPLE_CHUNKS):
                    sample_list = session.query(Sample.genotype, Sample.igsnper_plot_path).filter(Sample.sample_name.in_(sample_chunk)).all()
                    for genotype_path, plot_path in sample_list:
                        if genotype_path:
                            # The whole directory holding the genotype file is archived.
                            sample_dir = os.path.join(species_root, dataset, os.path.dirname(genotype_path.replace('samples/', '')))
                            if sample_dir not in added_dirs:
                                zipdir(sample_dir, fo, species_root)        # sample files
                                added_dirs.add(sample_dir)
                        if plot_path:
                            igsnper_path = os.path.join(species_root, dataset, plot_path)
                            if igsnper_path not in added_files:
                                fo.write(igsnper_path, arcname=igsnper_path.replace(species_root, ''))
                                added_files.add(igsnper_path)

        return send_report(outfile, 'zip', attachment_filename='sample_data.zip')

    elif 'Ungapped' in report_type or 'Gapped' in report_type:
        required_cols = ['name', 'seq', 'dataset']
        seqs = find_sequences(params, rep_samples, species, required_cols)

        # Decide once whether the gap characters ('.') are kept.
        keep_gaps = 'Gapped' in report_type

        recs = []
        for seq in seqs:
            seq_id = '%s|%s|%s' % (seq['name'], species, seq['dataset'])
            sequence = seq['seq'] if keep_gaps else seq['seq'].replace('.', '')
            recs.append(SeqRecord(Seq(sequence), id=seq_id, description=''))

        outfile = make_output_file('fasta')
        SeqIO.write(recs, outfile, "fasta")
        return send_report(outfile, 'fasta', attachment_filename='%s_sequences.fasta' % species)

    elif 'Gene info' in report_type:
        headers = [name for name, att_filter in sequence_filters.items() if att_filter['model'] is not None]
        headers.append('dataset')
        rows = find_sequences(params, rep_samples, species, headers)

        outfile = make_output_file('csv')
        with open(outfile, 'w', newline='') as fo:
            writer = csv.DictWriter(fo, dialect='excel', fieldnames=headers)
            writer.writeheader()
            writer.writerows(rows)

        return send_report(outfile, 'csv', attachment_filename='sequence_info.csv')

    # No supported download type matched.
    raise BadRequest('No output from report')
コード例 #11
0
def run(format, species, genomic_datasets, genomic_samples, rep_datasets,
        rep_samples, params):
    """Produce the heterozygosity report (HTML) for the selected samples.

    For every gene, patients are classified by how many distinct alleles of
    that gene were found in their samples: exactly one counts towards the
    ``HM`` column, more than one towards the ``HT`` column. The resulting
    GENE/HM/HT table is passed to the heterozygosity R script.

    Args:
        format: Requested output format; only ``'html'`` is accepted.
        species: Key into ``vdjbase_dbs``.
        genomic_datasets: Not used by this report.
        genomic_samples: Not used by this report.
        rep_datasets: Not used by this report.
        rep_samples: Selected samples, grouped via ``collate_samples``.
        params: Report parameters; ``ambiguous_alleles`` is required,
            ``f_kdiff`` and ``sort_order`` are optional.

    Returns:
        The value returned by ``send_report`` for the generated HTML file.

    Raises:
        BadRequest: If no samples were selected, the format is not
            ``'html'``, or the R script produced no output.
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format != 'html':
        raise BadRequest('Invalid format requested')

    if 'f_kdiff' in params and params['f_kdiff'] != '':
        kdiff = float(params['f_kdiff'])
    else:
        kdiff = 0

    chain, samples_by_dataset = collate_samples(rep_samples)

    # Maps gene name -> (gene_name, HM count, HT count), i.e. one output row
    # per gene in the column order GENE, HM, HT.
    gene_hetrozygous_dis = {}

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        allele_sample_recs = []

        for sample_chunk in chunk_list(samples_by_dataset[dataset],
                                       SAMPLE_CHUNKS):
            found = session.query(
                Sample.sample_name, Sample.genotype, Sample.patient_id).filter(
                    Sample.sample_name.in_(sample_chunk)).all()
            found, wanted_genes = apply_rep_filter_params(
                params, found, session)
            sample_names = [row[0] for row in found]

            query = (
                session.query(Gene.name, Patient.id, Allele.id, Sample.sample_name, Gene.locus_order, AllelesSample.kdiff, Allele.name)
                .join(Allele, Gene.id == Allele.gene_id)
                .join(AllelesSample, Allele.id == AllelesSample.allele_id)
                .join(Sample, Sample.id == AllelesSample.sample_id)
                .join(Patient, Patient.id == Sample.patient_id)
                .filter(Gene.name.in_(wanted_genes))
                .filter(Allele.name.notlike('%Del%'))
                .filter(Allele.name.notlike('%OR%'))
                .filter(Sample.sample_name.in_(sample_names))
                .filter(AllelesSample.kdiff >= kdiff)
            )

            by_locus = 'sort_order' in params \
                and params['sort_order'] == 'Locus'
            gene_order = Gene.locus_order if by_locus else Gene.alpha_order
            query = query.order_by(gene_order, Patient.id, Allele.id)

            if params['ambiguous_alleles'] == 'Exclude':
                query = query.filter(Allele.is_single_allele == True)

            allele_sample_recs.extend(query.all())

        # The records are ordered by gene and then patient, so consecutive
        # rows sharing both values form one run; collect the allele ids of
        # each such run.
        patient_runs = []
        for rec in allele_sample_recs:
            gene, patient, allele_id = rec[0], rec[1], rec[2]
            last = patient_runs[-1] if patient_runs else None
            if last is not None and last[0] == gene and last[1] == patient:
                last[2].add(allele_id)
            else:
                patient_runs.append((gene, patient, {allele_id}))

        for gene, _patient, allele_ids in patient_runs:
            # If we have both an unambiguous allele and an ambiguous allele
            # containing that unambiguous one, drop the duplicate so it is
            # only counted once: the ids removed are the pattern_id values of
            # AllelesPattern rows whose two ids are both in this run.
            if params['ambiguous_alleles'] != 'Exclude':
                pattern_rows = session.query(AllelesPattern.pattern_id)\
                    .filter(AllelesPattern.allele_in_p_id.in_(allele_ids))\
                    .filter(AllelesPattern.pattern_id.in_(allele_ids))\
                    .all()

                if pattern_rows:
                    allele_ids = allele_ids - {row[0] for row in pattern_rows}

            one_allele = 1 if len(allele_ids) == 1 else 0
            many_alleles = 1 if len(allele_ids) > 1 else 0

            # A gene gets an entry even when this run adds nothing to it.
            _, hm_so_far, ht_so_far = gene_hetrozygous_dis.get(
                gene, (gene, 0, 0))
            gene_hetrozygous_dis[gene] = (gene, hm_so_far + one_allele,
                                          ht_so_far + many_alleles)

    haplo_path = make_output_file('tab')
    table = pd.DataFrame(gene_hetrozygous_dis.values(),
                         columns=['GENE', 'HM', 'HT'])
    table.to_csv(haplo_path, sep='\t', index=False)
    output_path = make_output_file('html')

    cmd_line = ["-i", haplo_path, "-o", output_path, "-c", chain]

    produced = run_rscript(HETEROZYGOSITY_SCRIPT, cmd_line) \
        and os.path.isfile(output_path) \
        and os.path.getsize(output_path) != 0
    if not produced:
        raise BadRequest('No output from report')

    return send_report(output_path, format)