def get_multiple_order_file(species, datasets, locus_order=True):
    """Return the path of a gene order file that covers all given datasets.

    The order file of the first dataset is the baseline. Genes that only
    occur in the order files of later datasets are merged in: each is placed
    directly after the gene that precedes it in its own order file when that
    gene is already known, and appended at the end otherwise.

    Args:
        species: species name, passed through to ``get_order_file``.
        datasets: iterable of dataset names (a list or a dict key view).
        locus_order: passed through to ``get_order_file``.

    Returns:
        The first dataset's own order file when nothing had to be merged in,
        otherwise the path of a newly written 'tsv' output file holding the
        merged order, one gene per line.

    Raises:
        IndexError: if ``datasets`` is empty.
    """
    datasets = list(datasets)
    gene_order = []
    added = False

    for position, dataset in enumerate(datasets):
        with open(get_order_file(species, dataset, locus_order), 'r') as fi:
            # Blank entries (e.g. from a trailing newline) are not genes
            this_order = [gene for gene in fi.read().split('\n') if gene]

        if position == 0:
            gene_order = this_order
            continue

        known = set(gene_order)
        prev = ''
        for gene in this_order:
            if gene not in known:
                if prev in known:
                    # keep the new gene next to its predecessor
                    gene_order.insert(gene_order.index(prev) + 1, gene)
                else:
                    gene_order.append(gene)
                known.add(gene)
                added = True
            # The original never advanced prev, so the predecessor lookup
            # above could never find a real gene.
            prev = gene

    if not added:
        return get_order_file(species, datasets[0], locus_order)

    file_name = make_output_file('tsv')
    with open(file_name, 'w') as fo:
        fo.write('\n'.join(gene_order) + '\n')

    return file_name
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Produce the personal genotype report for one repertoire sample.

    The sample's genotype file is read, pipeline allele and gene names are
    translated using the translations returned by
    ``find_primer_translations``, the translated table is written to a new
    tsv file, and that file is passed to ``personal_genotype``.

    Args:
        format: 'pdf' or 'html'.
        species: species name, used to select the database and sample path.
        rep_samples: must contain exactly one sample dict; the keys
            'dataset', 'sample_name' and 'pcr_target_locus' are read.
        params: report parameters; 'sort_order' is read if present.
        genomic_datasets, genomic_samples, rep_datasets: not used here.

    Returns:
        The result of ``send_report`` for the generated report.

    Raises:
        BadRequest: wrong number of samples, unsupported format, or no
            genotype file is recorded or present for the sample.
    """
    if len(rep_samples) != 1:
        raise BadRequest('This report processes a single repertoire-derived genotype')

    if format not in ['pdf', 'html']:
        raise BadRequest('Invalid format requested')

    rep_sample = rep_samples[0]
    html = (format == 'html')

    session = vdjbase_dbs[species][rep_sample['dataset']].session
    primer_trans, gene_subs = find_primer_translations(session)

    missing_message = 'Genotype file for sample %s/%s is missing' % (
        rep_sample['dataset'], rep_sample['sample_name'])

    p = session.query(Sample.genotype).filter(
        Sample.sample_name == rep_sample['sample_name']).one_or_none()

    # one_or_none() yields None for an unknown sample; report that as a bad
    # request instead of failing on the subscript below.
    if p is None or not p[0]:
        raise BadRequest(missing_message)

    p = p[0].replace('samples/', '')
    sample_path = os.path.join(VDJBASE_SAMPLE_PATH, species, rep_sample['dataset'], p)

    if not os.path.isfile(sample_path):
        raise BadRequest(missing_message)

    sample_path = check_tab_file(sample_path)

    # translate pipeline allele names to VDJbase allele names
    genotype = pd.read_csv(sample_path, sep='\t', dtype=str)

    for col in ['alleles', 'GENOTYPED_ALLELES']:
        genotype[col] = [
            translate_primer_alleles(x, y, primer_trans)
            for x, y in zip(genotype['gene'], genotype[col])
        ]

    genotype['gene'] = [translate_primer_genes(x, gene_subs) for x in genotype['gene']]

    sample_path = make_output_file('tsv')
    genotype.to_csv(sample_path, sep='\t', index=False)

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_order_file(species, rep_sample['dataset'], locus_order=locus_order)

    report_path = personal_genotype(rep_sample['sample_name'], sample_path,
                                    rep_sample['pcr_target_locus'], gene_order_file, html)

    if format == 'pdf':
        attachment_filename = '%s_%s_%s_genotype.pdf' % (
            species, rep_sample['dataset'], rep_sample['sample_name'])
    else:
        attachment_filename = None

    return send_report(report_path, format, attachment_filename)
def personal_genotype(sample_name, genotype_file, chain, gene_order_file, html=True):
    """Run the genotype R script for one sample and return the report path.

    Args:
        sample_name: passed to the script as ``--samp``.
        genotype_file: path of the genotype table, passed as ``-i``.
        chain: passed to the script as ``-c``.
        gene_order_file: path of the gene order file, passed as ``-g``.
        html: when true an 'html' output file is requested (``-t T``),
            otherwise a 'pdf' one (``-t F``).

    Returns:
        Path of the generated report.

    Raises:
        BadRequest: the script failed, or produced no (or an empty) file.
    """
    output_path = make_output_file('html' if html else 'pdf')
    file_type = 'T' if html else 'F'
    cmd_line = [
        "-i", genotype_file,
        "-o", output_path,
        "-t", file_type,
        "--samp", sample_name,
        "-g", gene_order_file,
        "-c", chain,
    ]

    # Check the file exists before asking for its size: getsize() raises
    # OSError on a missing file, which would bypass the BadRequest below.
    if run_rscript(MULTIPLE_GENOTYPE_SCRIPT, cmd_line) \
            and os.path.isfile(output_path) \
            and os.path.getsize(output_path) > 0:
        return output_path

    raise BadRequest('No output from report')
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Produce the gene frequency plot for the selected repertoire samples.

    For every selected sample that passes the filters, the stored frequency
    of each wanted gene is collected. The script input is a tab-separated
    table with one row per gene: GENE, and FREQ holding the comma-joined
    frequencies (rounded to two decimals) seen across samples.

    Args:
        format: 'pdf' or 'html'.
        species: species name, used to select the databases.
        rep_samples: selected repertoire samples; must not be empty.
        params: report parameters. 'single_sample' and 'calculate_by' are
            required; 'sort_order' is read if present. Further keys may be
            read by ``apply_rep_filter_params``.
        genomic_datasets, genomic_samples, rep_datasets: not used here.

    Returns:
        The result of ``send_report`` for the generated plot.

    Raises:
        BadRequest: no samples selected, unsupported format, or the script
            produced no output.
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format not in (['pdf', 'html']):
        raise BadRequest('Invalid format requested')

    single_sample_filter = 1 if params['single_sample'] == 'One Selected Sample' else 0
    calc_by_clone = 1 if params['calculate_by'] == 'Number of Clones' else 0
    chain, samples_by_dataset = collate_samples(rep_samples)

    # gene name -> list of frequencies, one per contributing sample
    genes_frequencies = defaultdict(list)

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session

        for sample_chunk in chunk_list(samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id)\
                .filter(Sample.sample_name.in_(sample_chunk))\
                .filter(Sample.sample_group >= single_sample_filter)\
                .all()
            sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            frequencies = session.query(GenesDistribution.sample_id, Gene.name, GenesDistribution.frequency)\
                .join(Gene)\
                .join(Sample)\
                .filter(GenesDistribution.count_by_clones == calc_by_clone)\
                .filter(Gene.name.in_(wanted_genes)) \
                .filter(Sample.sample_name.in_(sample_list)) \
                .all()

            for frequency in frequencies:
                genes_frequencies[frequency[1]].append(round(float(frequency[2]), 2))

    # Build the frame in one step: DataFrame.append was removed in pandas 2.0
    # and copied the whole frame on every call.
    labels = ['GENE', 'FREQ']
    genes_frequencies_df = pd.DataFrame(
        [
            {'GENE': gene, 'FREQ': ",".join([str(x) for x in usages])}
            for gene, usages in genes_frequencies.items()
        ],
        columns=labels)

    input_path = make_output_file('tab')
    genes_frequencies_df.to_csv(input_path, sep='\t', index=False)
    output_path = make_output_file(format)
    # NOTE(review): a '.pdf' attachment name is passed even for html output;
    # other reports in this code pass None for html - confirm what
    # send_report expects.
    attachment_filename = '%s_gene_frequency.pdf' % species

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_multiple_order_file(species, samples_by_dataset.keys(),
                                              locus_order=locus_order)

    cmd_line = [
        "-i", input_path,
        "-o", output_path,
        "-t", 'T' if format == 'html' else 'F',
        "-c", chain,
        "-g", gene_order_file,
    ]

    if run_rscript(GENE_FREQUENCY_PLOT, cmd_line) and os.path.isfile(output_path) \
            and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, attachment_filename)
    else:
        raise BadRequest('No output from report')
def run(format, species, genomic_datasets, genomic_subjects, rep_datasets, rep_samples, params):
    """Build a download for the selected genomic subjects.

    ``params['type']`` selects the product; the checks are made in this
    order and the first match wins:

    * 'Sample info'       - csv of subject attributes
    * 'Sample files'      - zip of the directories holding the subjects'
                            annotation files
    * 'Ungapped'/'Gapped' - fasta of the (gapped) sequences
    * 'Gene info'         - csv of sequence attributes

    Returns the result of ``send_report``. Raises BadRequest when no
    subjects are selected or no product type matches.
    """
    if not len(genomic_subjects):
        raise BadRequest('No repertoire-derived genotypes were selected.')

    requested = params['type']

    if 'Sample info' in requested:
        headers = genomic_subject_filters.keys()
        # only filters backed by a model are queried
        attribute_query = [
            spec['field']
            for spec in genomic_subject_filters.values()
            if spec['model'] is not None
        ]
        rows = find_genomic_subjects(attribute_query, species, genomic_datasets, params['filters'])

        outfile = make_output_file('csv')
        with open(outfile, 'w', newline='') as fo:
            writer = csv.DictWriter(fo, dialect='excel', fieldnames=headers)
            writer.writeheader()
            writer.writerows(rows)

        return send_report(outfile, 'csv', attachment_filename='sample_info.csv')

    if 'Sample files' in requested:
        outfile = make_output_file('zip')

        with zipfile.ZipFile(outfile, 'w', zipfile.ZIP_DEFLATED) as archive:
            subjects = find_genomic_subjects([Subject.annotation_path], species,
                                             genomic_datasets, params['filters'])
            relative_paths = [
                '/'.join(['study_data/Genomic', subject['annotation_path'].split('Genomic')[1]])
                for subject in subjects
            ]
            static_root = app.config['STATIC_PATH']

            # each directory goes into the archive once, however many
            # subjects share it
            seen_dirs = set()
            for relative_path in relative_paths:
                sample_dir = os.path.dirname(os.path.join(static_root, relative_path))
                if sample_dir in seen_dirs:
                    continue
                zipdir(sample_dir, archive, static_root)
                seen_dirs.add(sample_dir)

        return send_report(outfile, 'zip', attachment_filename='sample_data.zip')

    if 'Ungapped' in requested or 'Gapped' in requested:
        if 'Ungapped' in requested:
            seq_name = 'sequence'
        else:
            seq_name = 'gapped_sequence'

        seqs = find_genomic_sequences(['name', seq_name, 'dataset'], genomic_datasets,
                                      species, params['filters'])

        # sequences of zero length are left out of the fasta file
        recs = [
            SeqRecord(Seq(seq[seq_name]),
                      id='%s|%s|%s' % (seq['name'], species, seq['dataset']),
                      description='')
            for seq in seqs
            if len(seq[seq_name]) > 0
        ]

        outfile = make_output_file('fasta')
        SeqIO.write(recs, outfile, "fasta")
        return send_report(outfile, 'fasta', attachment_filename='%s_sequences.fasta' % species)

    if 'Gene info' in requested:
        headers = genomic_sequence_filters.keys()
        rows = find_genomic_sequences(headers, genomic_datasets, species, params['filters'])

        outfile = make_output_file('csv')
        with open(outfile, 'w', newline='') as fo:
            writer = csv.DictWriter(fo, dialect='excel', fieldnames=headers)
            writer.writeheader()
            writer.writerows(rows)

        return send_report(outfile, 'csv', attachment_filename='sequence_info.csv')

    raise BadRequest('No output from report')
def _tally_allele_appearance(counts, gene, allele_name, subject, order):
    """Record in `counts` that `subject` carries `allele_name` of `gene`.

    `counts` maps gene -> [ {allele: set of subjects}, set of subjects,
    sort key ]. The allele key is the upper-cased part of `allele_name`
    after the first '*'.
    """
    allele = allele_name.split('*', 1)[1].upper()

    if gene not in counts:
        counts[gene] = [{}, set(), order]

    counts[gene][0].setdefault(allele, set()).add(subject)
    counts[gene][1].add(subject)


def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Produce the allele appearance report (xls workbook or pdf plot).

    For every wanted gene, the number of distinct subjects in which each
    allele appears is counted across the selected repertoire samples and
    genomic samples. Genes with several alleles get their own entry; genes
    with one allele are gathered in a final 'Single allele genes' entry.

    Args:
        format: 'pdf' or 'xls'.
        species: species name, used to select the databases.
        genomic_samples: dicts with 'dataset' and 'identifier'.
        rep_samples: dicts with 'dataset' and 'sample_name'.
        params: report parameters. 'novel_alleles' is required, and
            'ambiguous_alleles' when repertoire samples are selected;
            'sort_order' of 'Alphabetic' selects alphabetic gene order, any
            other value (or none) selects locus order. Further keys may be
            read by ``apply_rep_filter_params``.
        genomic_datasets, rep_datasets: not used here.

    Returns:
        The result of ``send_report`` for the workbook or the plot.

    Raises:
        BadRequest: unsupported format, or the plot script gave no output.
    """
    if format != 'pdf' and format != 'xls':
        raise BadRequest('Invalid format requested')

    alphabetic = params.get('sort_order') == 'Alphabetic'

    # gene -> [ {allele: subjects}, subjects, sort key ]. Sets are used
    # because only the number of distinct subjects is reported.
    counts = {}

    rep_samples_by_dataset = {}
    for rep_sample in rep_samples:
        rep_samples_by_dataset.setdefault(rep_sample['dataset'], []).append(
            rep_sample['sample_name'])

    for dataset in rep_samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        appearances = []

        for sample_chunk in chunk_list(rep_samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id)\
                .filter(Sample.sample_name.in_(sample_chunk)).all()
            sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            app_query = session.query(AllelesSample.patient_id, Patient.patient_name, Gene.name,
                                      Allele.name, Sample.sample_name, Gene.locus_order,
                                      Gene.alpha_order)\
                .filter(Sample.id == AllelesSample.sample_id)\
                .filter(Allele.id == AllelesSample.allele_id)\
                .filter(Gene.id == Allele.gene_id)\
                .filter(Patient.id == AllelesSample.patient_id)\
                .filter(Sample.sample_name.in_(sample_list))\
                .filter(Gene.name.in_(wanted_genes))

            if params['novel_alleles'] == 'Exclude':
                app_query = app_query.filter(Allele.novel == 0)

            if params['ambiguous_alleles'] == 'Exclude':
                app_query = app_query.filter(Allele.is_single_allele == 1)

            appearances.extend(app_query.all())

        for app in appearances:
            _, patient_name, gene, allele, sample, locus_order, alpha_order = app
            _tally_allele_appearance(counts, gene, allele, patient_name,
                                     alpha_order if alphabetic else locus_order)

    gen_samples_by_dataset = {}
    for gen_sample in genomic_samples:
        gen_samples_by_dataset.setdefault(gen_sample['dataset'], []).append(
            gen_sample['identifier'])

    for dataset in gen_samples_by_dataset.keys():
        session = genomic_dbs[species][dataset].session
        appearances = []

        for sample_chunk in chunk_list(gen_samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list = session.query(GenomicSubject.identifier)\
                .filter(GenomicSubject.identifier.in_(sample_chunk)).all()
            sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            app_query = session.query(GenomicSubject.identifier, GenomicGene.name,
                                      GenomicSequence.name, GenomicGene.locus_order,
                                      GenomicGene.alpha_order)\
                .filter(GenomicSubject.id == GenomicSubjectSequence.subject_id) \
                .filter(GenomicSequence.id == GenomicSubjectSequence.sequence_id) \
                .filter(GenomicGene.id == GenomicSequence.gene_id) \
                .filter(GenomicSequence.type.in_(['V-REGION', 'D-REGION', 'J-REGION'])) \
                .filter(GenomicSubject.identifier.in_(sample_list))\
                .filter(GenomicGene.name.in_(wanted_genes))

            if params['novel_alleles'] == 'Exclude':
                app_query = app_query.filter(GenomicSequence.novel == 0)

            appearances.extend(app_query.all())

        for app in appearances:
            patient_name, gene, allele, locus_order, alpha_order = app
            _tally_allele_appearance(counts, gene, allele, patient_name,
                                     alpha_order if alphabetic else locus_order)

    # Rows are [gene, [allele names], [subjects per allele], subjects per gene, sort key]
    single_alleles = []
    multi_alleles = []

    for gene, (alleles, total, order) in counts.items():
        allele_names = sorted(alleles)
        row = [gene, allele_names, [len(alleles[a]) for a in allele_names], len(total), order]

        if len(alleles) > 1:
            multi_alleles.append(row)
        else:
            single_alleles.append(row)

    multi_alleles.sort(key=lambda row: row[4])
    multi_alleles = [m[:4] for m in multi_alleles]   # drop the sort key
    single_alleles.sort(key=lambda row: row[4])

    s = ['Single allele genes', [], []]
    for (gene, alleles, allele_counts, _, _) in single_alleles:
        s[1].append(gene + '\n' + alleles[0])
        s[2].append(allele_counts[0])

    multi_alleles.append(s)

    input_path = make_output_file('xls')
    output_path = make_output_file('pdf')
    book = xlwt.Workbook()

    for row in multi_alleles:
        if len(row[1]) > 0:
            write_gene(book, row)

    book.save(input_path)

    if format == 'xls':
        return send_report(input_path, format, '%s_allele_appearance.xls' % species)

    cmd_line = ["-i", input_path, "-o", output_path]

    if run_rscript(APPEARANCE_SCRIPT, cmd_line) and os.path.isfile(output_path) \
            and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, '%s_allele_appearance.pdf' % species)
    else:
        raise BadRequest('No output from report')
def do_igsnper(species, dataset):
    """Run IgSNPer for a dataset and record the resulting files in its database.

    Steps:
      1. Clear the igsnper fields on Gene, Sample and Patient.
      2. Write a table listing the genotype file, study name and patient
         name of every sample whose name contains 'S1' and whose genotype
         file exists.
      3. Run ``ig_snper.py`` on that table, echoing its output.
      4. Store the path of each generated gene report and each per-subject
         text file on the matching Gene / Sample, and the sample id on the
         matching Patient.

    Args:
        species: species name (directory under the export directory).
        dataset: dataset name (directory under the species directory).

    Returns:
        'No database found' when the dataset has no db.sqlite3, otherwise
        'IgSNP completed!'.
    """
    export_dir = os.path.join(app.config['EXPORT_DIR'], 'vdjbase_metadata')
    ds_dir = os.path.abspath(os.path.join(export_dir, species, dataset))
    db_path = os.path.join(ds_dir, 'db.sqlite3')

    if not os.path.isfile(db_path):
        return 'No database found'

    igsnper_dir = os.path.join(ds_dir, 'samples', 'igsnper')
    db = ContentProvider(db_path)

    try:
        # remove any existing igsnper related database fields
        db.session.query(Gene).update({Gene.igsnper_plot_path: ''}, synchronize_session=False)
        db.session.query(Sample).update({Sample.igsnper_plot_path: ''}, synchronize_session=False)
        db.session.query(Patient).update({Patient.igsnper_sample_id: 0}, synchronize_session=False)
        db.session.commit()

        # Create table of tigger files
        tigger_file_name = make_output_file('txt')
        with open(tigger_file_name, 'w') as fo:
            header = "TiggerFilePath ProjectID SubjectID\n"
            fo.write(header)
            samples = db.session.query(Sample.genotype, Sample.sample_name, Study.study_name,
                                       Patient.patient_name)\
                .join(Study, Sample.study_id == Study.id)\
                .join(Patient, Sample.patient_id == Patient.id)\
                .all()

            for sample in samples:
                if 'S1' in sample[1]:
                    tigger_file_path = os.path.join(ds_dir, sample[0])
                    if os.path.isfile(tigger_file_path):
                        fo.write('%s %s %s\n' % (tigger_file_path, sample[2], sample[3]))

        if os.path.isdir(igsnper_dir):
            shutil.rmtree(igsnper_dir, ignore_errors=True)

        cmd_line = [
            'python', os.path.join(app.config['IGSNPER_PATH'], 'ig_snper.py'),
            '-o', igsnper_dir,
            '-c', tigger_file_name,
        ]
        print("Running IgSNPer: '%s'\n" % ' '.join(cmd_line))

        # The command is an argument list, so it must not go through the
        # shell: with shell=True on POSIX only the first item ('python') is
        # the command and the remaining items become arguments of the shell.
        proc = subprocess.Popen(cmd_line, cwd=app.config['IGSNPER_PATH'], shell=False,
                                stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

        for stdout_line in iter(proc.stdout.readline, b''):
            print(stdout_line.decode("utf-8"))

        proc.stdout.close()
        return_code = proc.wait()

        if return_code != 0:
            print('IgSNPer exited with return code %d' % return_code)

        for fn in glob(os.path.join(igsnper_dir, 'html_reports', '*.html')):
            gene = os.path.splitext(os.path.basename(fn))[0]
            gene_query = db.session.query(Gene).filter(Gene.name == gene)

            if gene_query.count() == 1:
                gene_query.update({Gene.igsnper_plot_path: 'igsnper/html_reports/%s.html' % gene})
            else:
                print('Igsnper identified gene %s, which is not listed in the Genes table.' % gene)

        for fn in glob(os.path.join(igsnper_dir, '*_processed/*.txt')):
            # file names are expected to be <study>_<individual>.txt
            subject = os.path.splitext(os.path.basename(fn))[0]
            study, individual = subject.split('_')
            sample_name = subject + '_S1'
            sample_query = db.session.query(Sample).filter(Sample.sample_name == sample_name)

            if sample_query.count() == 1:
                sample_query.update({
                    Sample.igsnper_plot_path: 'igsnper/%s_processed/%s.txt' % (study, subject)
                })
                sample_id = sample_query.one_or_none().id
                db.session.query(Patient).filter(Patient.patient_name == subject).update(
                    {Patient.igsnper_sample_id: sample_id})
            else:
                print('Cant find the record for sample %s' % sample_name)

        db.session.commit()
    finally:
        # release the database even when a step above fails
        db.close()

    return ("IgSNP completed!")
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Produce the allele usage sheet as a csv file.

    Each output row describes one allele of a wanted gene: the number of
    subjects carrying it in the selected repertoire samples ('AIRR-Seq')
    and genomic samples ('Genomic'), whether it is a non-novel reference
    allele ('IMGT', 1 or 0), the best sequencing platform and capture
    probes among the genomic subjects carrying it, and its ungapped
    sequence.

    Args:
        format: must be 'xls' (the file sent is nevertheless a csv).
        species: species name, used to select the databases.
        genomic_samples: dicts with 'dataset' and 'identifier'.
        rep_samples: dicts with 'dataset' and 'sample_name'.
        params: report parameters. 'novel_alleles' is required, and
            'ambiguous_alleles' when repertoire samples are selected;
            'sort_order' of 'Alphabetic' selects alphabetic gene order, any
            other value (or none) selects locus order. Further keys may be
            read by ``apply_rep_filter_params``.
        genomic_datasets, rep_datasets: not used here.

    Returns:
        The result of ``send_report`` for the csv file.

    Raises:
        BadRequest: unsupported format.
    """
    if format != 'xls':
        raise BadRequest('Invalid format requested')

    alphabetic = params.get('sort_order') == 'Alphabetic'

    def allele_count(gene, allele, counts):
        """Number of entries recorded for gene/allele in counts, 0 if none."""
        if gene not in counts or allele not in counts[gene][0]:
            return 0
        return len(counts[gene][0][allele])

    def best_ranked(gene, allele, counts, slot, ranking):
        """Last item of `ranking` recorded for gene/allele in counts[gene][slot], or ''."""
        if gene not in counts or allele not in counts[gene][slot]:
            return ''
        best = ''
        for candidate in ranking:
            if candidate in counts[gene][slot][allele]:
                best = candidate
        return best

    imgt_refs = {}          # non-novel allele name -> ungapped sequence
    gene_order = {}         # gene name -> sort key
    sequences = {}          # upper-cased allele name -> lower-cased ungapped sequence
    all_wanted_genes = set()

    rep_samples_by_dataset = {}
    for rep_sample in rep_samples:
        rep_samples_by_dataset.setdefault(rep_sample['dataset'], []).append(
            rep_sample['sample_name'])

    for dataset in rep_samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session

        for ref in session.query(Allele).all():
            if ref.novel == 0 and ref.name not in imgt_refs:
                imgt_refs[ref.name] = ref.seq.replace('.', '')
            # Test the key that is actually stored: the original tested
            # ref.name but stored ref.name.upper().
            seq_key = ref.name.upper()
            if seq_key not in sequences:
                sequences[seq_key] = ref.seq.replace('.', '').lower()

        for gene in session.query(Gene).all():
            if gene.name not in gene_order:
                gene_order[gene.name] = gene.alpha_order if alphabetic else gene.locus_order

    # gene -> [ {allele: subjects}, subjects ]
    rep_counts = {}

    for dataset in rep_samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        appearances = []

        for sample_chunk in chunk_list(rep_samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list = session.query(Sample.sample_name)\
                .filter(Sample.sample_name.in_(sample_chunk)).all()
            sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)
            all_wanted_genes.update(wanted_genes)
            sample_list = [s[0] for s in sample_list]

            app_query = session.query(AllelesSample.patient_id, Gene.name, Allele.name,
                                      Sample.sample_name, Patient.patient_name)\
                .filter(Sample.id == AllelesSample.sample_id)\
                .filter(Allele.id == AllelesSample.allele_id)\
                .filter(Gene.id == Allele.gene_id)\
                .filter(Patient.id == AllelesSample.patient_id)\
                .filter(Sample.sample_name.in_(sample_list))\
                .filter(Gene.name.in_(wanted_genes))

            if params['novel_alleles'] == 'Exclude':
                app_query = app_query.filter(Allele.novel == 0)

            if params['ambiguous_alleles'] == 'Exclude':
                app_query = app_query.filter(Allele.is_single_allele == 1)

            appearances.extend(app_query.all())

        for app in appearances:
            pid, gene, allele, sample, patient_name = app
            allele = allele.split('*', 1)[1].upper()

            if gene not in rep_counts:
                rep_counts[gene] = [{}, []]
            if allele not in rep_counts[gene][0]:
                rep_counts[gene][0][allele] = []
            if patient_name not in rep_counts[gene][0][allele]:
                rep_counts[gene][0][allele].append(patient_name)
            if patient_name not in rep_counts[gene][1]:
                rep_counts[gene][1].append(patient_name)

    gen_samples_by_dataset = {}
    for gen_sample in genomic_samples:
        gen_samples_by_dataset.setdefault(gen_sample['dataset'], []).append(
            gen_sample['identifier'])

    for dataset in gen_samples_by_dataset.keys():
        session = genomic_dbs[species][dataset].session

        for ref in session.query(GenomicSequence).all():
            if ref.novel == 0 and ref.name not in imgt_refs:
                imgt_refs[ref.name] = ref.sequence.replace('.', '')
            seq_key = ref.name.upper()
            if seq_key not in sequences:
                sequences[seq_key] = ref.sequence.replace('.', '').lower()

        for gene in session.query(GenomicGene).all():
            if gene.name not in gene_order:
                gene_order[gene.name] = gene.alpha_order if alphabetic else gene.locus_order

    # gene -> [ {allele: subjects}, subjects, {allele: platforms}, {allele: probes} ]
    gen_counts = {}

    for dataset in gen_samples_by_dataset.keys():
        session = genomic_dbs[species][dataset].session
        appearances = []

        for sample_chunk in chunk_list(gen_samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list = session.query(GenomicSubject.identifier)\
                .filter(GenomicSubject.identifier.in_(sample_chunk)).all()
            sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)
            all_wanted_genes.update(wanted_genes)
            sample_list = [s[0] for s in sample_list]

            app_query = session.query(GenomicSubject.id, GenomicSubject.identifier,
                                      GenomicSubject.sequencing_platform,
                                      GenomicSubject.capture_probes,
                                      GenomicSequence.name, GenomicGene.name) \
                .filter(GenomicSubject.id == GenomicSubjectSequence.subject_id) \
                .filter(GenomicSequence.id == GenomicSubjectSequence.sequence_id) \
                .filter(GenomicSequence.type.in_(['V-REGION', 'D-REGION', 'J-REGION'])) \
                .filter(GenomicGene.id == GenomicSequence.gene_id)\
                .filter(GenomicSubject.identifier.in_(sample_list))\
                .filter(GenomicGene.name.in_(wanted_genes))

            if params['novel_alleles'] == 'Exclude':
                app_query = app_query.filter(GenomicSequence.novel == 0)

            appearances.extend(app_query.all())

        for app in appearances:
            _, patient_name, platform, probes, allele, gene = app
            allele = allele.split('*', 1)[1].upper()

            if gene not in gen_counts:
                gen_counts[gene] = [{}, [], {}, {}]
            if allele not in gen_counts[gene][0]:
                gen_counts[gene][0][allele] = []
                gen_counts[gene][2][allele] = []
                gen_counts[gene][3][allele] = []
            if patient_name not in gen_counts[gene][0][allele]:
                gen_counts[gene][0][allele].append(patient_name)
            if patient_name not in gen_counts[gene][1]:
                gen_counts[gene][1].append(patient_name)
            if platform and platform not in gen_counts[gene][2][allele]:
                gen_counts[gene][2][allele].append(platform)
            if probes and probes not in gen_counts[gene][3][allele]:
                gen_counts[gene][3][allele].append(probes)

    # Give each non-novel reference allele of a wanted gene a count of one
    imgt_counts = {}

    for ref in imgt_refs.keys():
        ref = ref.upper()
        gene, allele = ref.split('*')
        if gene in all_wanted_genes:
            if gene not in imgt_counts:
                imgt_counts[gene] = [{}, [1]]
            if allele not in imgt_counts[gene][0] and allele != 'DEL':
                imgt_counts[gene][0][allele] = [1]

    genes_in_order = [g[0] for g in sorted(gene_order.items(), key=lambda x: x[1])]
    results = []

    for gene in genes_in_order:
        # Assemble the set of alleles to list for this gene. Names that
        # contain '_' are listed after the others.
        ref_alleles = set()
        novel_alleles = set()

        for counts in [imgt_counts, rep_counts, gen_counts]:
            if gene in counts:
                for allele in counts[gene][0].keys():
                    if '_' in allele:
                        novel_alleles.add(allele)
                    else:
                        ref_alleles.add(allele)

        for allele in sorted(ref_alleles) + sorted(novel_alleles):
            row = {
                'Allele': f'{gene}*{allele}',
                'IMGT': allele_count(gene, allele, imgt_counts),
                'AIRR-Seq': allele_count(gene, allele, rep_counts),
                'Genomic': allele_count(gene, allele, gen_counts),
                'Best platform': best_ranked(gene, allele, gen_counts, 2,
                                             ['RS', 'SEQUEL', 'SEQUELII']),
                'Best probes': best_ranked(gene, allele, gen_counts, 3, ['V2', 'V3']),
                'Sequence': sequences[f'{gene}*{allele}'.upper()]
            }
            results.append(row)

    output_path = make_output_file('csv')
    write_csv(output_path, results)
    return send_report(output_path, 'csv', f'{species}_allele_usage.csv')
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Produce the genotype heatmap for the selected repertoire samples.

    The genotype file of every selected sample that passes the filters is
    read, its pipeline allele and gene names are translated, rows for
    unwanted genes are dropped, and the tables are stacked with a 'subject'
    column identifying the sample (prefixed with the dataset name when more
    than one dataset is involved). Samples whose genotype file is missing
    are skipped.

    Args:
        format: 'pdf' or 'html'.
        species: species name, used to select the databases and sample path.
        rep_samples: selected repertoire samples; must not be empty.
        params: report parameters; 'f_kdiff' (0 when absent or empty) and
            'sort_order' are read here. Further keys may be read by
            ``apply_rep_filter_params``.
        genomic_datasets, genomic_samples, rep_datasets: not used here.

    Returns:
        The result of ``send_report`` for the generated heatmap.

    Raises:
        BadRequest: no samples selected, unsupported format, nothing matched
            the filters, or the script produced no output.
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format not in ['pdf', 'html']:
        raise BadRequest('Invalid format requested')

    html = (format == 'html')
    chain, samples_by_dataset = collate_samples(rep_samples)
    genotypes = pd.DataFrame()

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        primer_trans, gene_subs = find_primer_translations(session)

        sample_list = []
        for sample_chunk in chunk_list(samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list.extend(
                session.query(Sample.sample_name, Sample.genotype, Sample.patient_id)
                .filter(Sample.sample_name.in_(sample_chunk)).all())

        sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)

        if len(wanted_genes) > 0:
            for (name, genotype_file, patient_id) in sample_list:
                sample_path = os.path.join(VDJBASE_SAMPLE_PATH, species, dataset,
                                           genotype_file.replace('samples/', ''))

                if not os.path.isfile(sample_path):
                    continue

                genotype = pd.read_csv(sample_path, sep='\t', dtype=str)
                genotype = trans_df(genotype)

                # translate pipeline allele names to VDJbase allele names
                for col in ['alleles', 'GENOTYPED_ALLELES']:
                    genotype[col] = [
                        translate_primer_alleles(x, y, primer_trans)
                        for x, y in zip(genotype['gene'], genotype[col])
                    ]

                genotype['gene'] = [
                    translate_primer_genes(x, gene_subs) for x in genotype['gene']
                ]

                genotype = genotype[genotype.gene.isin(wanted_genes)]

                subject_name = name if len(samples_by_dataset) == 1 else dataset + '_' + name

                if 'subject' not in genotype.columns.values:
                    genotype.insert(0, 'subject', subject_name)
                else:
                    genotype.subject = subject_name

                # DataFrame.append was removed in pandas 2.0; concat with the
                # default arguments keeps each frame's own row index, as
                # append did.
                genotypes = pd.concat([genotypes, genotype])[genotype.columns.tolist()]

    if len(genotypes) == 0:
        raise BadRequest('No records matching the filter criteria were found.')

    geno_path = make_output_file('csv')
    genotypes.to_csv(geno_path, sep='\t')

    if format == 'pdf':
        attachment_filename = '%s_genotype.pdf' % species
    else:
        attachment_filename = None

    # Same default as the haplotype heatmap and allele usage reports
    kdiff = params.get('f_kdiff') or 0

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_multiple_order_file(species, samples_by_dataset.keys(),
                                              locus_order=locus_order)

    output_path = make_output_file('html' if html else 'pdf')
    file_type = 'T' if html else 'F'
    cmd_line = [
        "-i", geno_path,
        "-o", output_path,
        "-t", file_type,
        "-k", str(kdiff),
        "-c", chain,
        "-g", gene_order_file,
    ]

    if run_rscript(HEATMAP_GENOTYPE_SCRIPT, cmd_line) and os.path.isfile(output_path) \
            and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, attachment_filename)
    else:
        raise BadRequest('No output from report')
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Produce the haplotype heatmap (pdf) for the selected repertoire samples.

    For every selected sample that passes the filters, the haplotype file
    inferred with the anchor gene ``params['haplo_gene']`` is read, its
    pipeline allele and gene names are translated, rows for unwanted genes
    are dropped, and the tables are stacked with a 'subject' column
    identifying the sample (prefixed with the dataset name when more than
    one dataset is involved).

    Args:
        format: must be 'pdf'.
        species: species name, used to select the databases and sample path.
        rep_samples: selected repertoire samples; must not be empty.
        params: report parameters; 'haplo_gene' is required, 'f_kdiff'
            (0 when absent or empty) and 'sort_order' are optional. Further
            keys may be read by ``apply_rep_filter_params``. The dict is
            not modified.
        genomic_datasets, genomic_samples, rep_datasets: not used here.

    Returns:
        The result of ``send_report`` for the generated heatmap.

    Raises:
        BadRequest: no samples selected, unsupported format, a haplotype
            file is missing, nothing matched the filters, or the script
            produced no output.
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format != 'pdf':
        raise BadRequest('Invalid format requested')

    chain, samples_by_dataset = collate_samples(rep_samples)
    haplotypes = pd.DataFrame()

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        primer_trans, gene_subs = find_primer_translations(session)
        haplos = []

        for sample_chunk in chunk_list(samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_list = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id)\
                .filter(Sample.sample_name.in_(sample_chunk)).all()
            sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            haplo_query = session.query(Sample.sample_name, HaplotypesFile.file)\
                .filter(Sample.sample_name.in_(sample_list))\
                .join(SamplesHaplotype, Sample.id == SamplesHaplotype.samples_id)\
                .filter(SamplesHaplotype.haplotypes_file_id == HaplotypesFile.id)\
                .filter(HaplotypesFile.by_gene == params['haplo_gene'])
            haplos.extend(haplo_query.all())

        for name, filename in haplos:
            sample_path = os.path.join(VDJBASE_SAMPLE_PATH, species, dataset,
                                       filename.replace('samples/', ''))

            if not os.path.isfile(sample_path):
                raise BadRequest('Haplotype file %s is missing.' % (sample_path))

            haplotype = pd.read_csv(sample_path, sep='\t', dtype=str)
            haplotype = trans_df(haplotype)
            haplotype['subject'] = name if len(samples_by_dataset) == 1 else dataset + '_' + name

            # translate pipeline allele names to VDJbase allele names;
            # the allele columns are addressed by position
            col_names = list(haplotype.columns.values)
            for i in (2, 3, 4):
                haplotype[col_names[i]] = [
                    translate_primer_alleles(x, y, primer_trans)
                    for x, y in zip(haplotype['gene'], haplotype[col_names[i]])
                ]

            haplotype['gene'] = [
                translate_primer_genes(x, gene_subs) for x in haplotype['gene']
            ]

            haplotype = haplotype[haplotype.gene.isin(wanted_genes)]
            haplotypes = pd.concat([haplotypes, haplotype], keys=None,
                                   ignore_index=True)[haplotype.columns.tolist()]

    if len(haplotypes) == 0:
        raise BadRequest('No records matching the filter criteria were found.')

    haplo_path = make_output_file('tsv')
    haplotypes.to_csv(haplo_path, sep='\t', index=False)
    attachment_filename = '%s_haplotype_heatmap.pdf' % species

    # Read the threshold without writing back into the caller's params and
    # without failing when the key is absent.
    kdiff = params.get('f_kdiff') or 0

    locus_order = ('sort_order' in params and params['sort_order'] == 'Locus')
    gene_order_file = get_multiple_order_file(species, samples_by_dataset.keys(),
                                              locus_order=locus_order)

    # format is always 'pdf' at this point
    output_path = make_output_file('pdf')
    cmd_line = [
        "-i", haplo_path,
        "-o", output_path,
        "-k", str(kdiff),
        "-c", chain,
        "-g", gene_order_file,
    ]

    if run_rscript(HEATMAP_HAPLOTYPE_SCRIPT, cmd_line) and os.path.isfile(output_path) \
            and os.path.getsize(output_path) != 0:
        return send_report(output_path, format, attachment_filename)
    else:
        raise BadRequest('No output from report')
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Produce the allele usage plot (html) for the selected repertoire samples.

    For every wanted gene, the distinct alleles found in the selected
    samples are collected, and the number of alleles per gene is written
    to a tab-separated GENE/COUNT table that is passed to the plot script.

    Raises BadRequest when no samples are selected, the format is not
    'html', ambiguous alleles are requested across several datasets, or
    the script produces no output.
    """
    if len(rep_samples) == 0:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format != 'html':
        raise BadRequest('Invalid format requested')

    if 'f_kdiff' in params and params['f_kdiff'] != '':
        kdiff = float(params['f_kdiff'])
    else:
        kdiff = 0

    chain, samples_by_dataset = collate_samples(rep_samples)

    if len(samples_by_dataset) > 1 and params['ambiguous_alleles'] != 'Exclude':
        raise BadRequest('Ambiguous alleles cannot be processed across multiple datasets')

    # gene name -> set of allele ids seen for that gene
    gene_allele_counts = {}

    for dataset in samples_by_dataset.keys():
        session = vdjbase_dbs[species][dataset].session
        allele_recs = []

        for sample_chunk in chunk_list(samples_by_dataset[dataset], SAMPLE_CHUNKS):
            sample_rows = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id)\
                .filter(Sample.sample_name.in_(sample_chunk))\
                .all()
            sample_rows, wanted_genes = apply_rep_filter_params(params, sample_rows, session)
            sample_names = [row[0] for row in sample_rows]

            query = session.query(Gene.name, Allele.id, Gene.type) \
                .join(Allele) \
                .join(AllelesSample) \
                .join(Sample) \
                .join(Patient, Patient.id == Sample.patient_id) \
                .filter(Gene.name.in_(wanted_genes)) \
                .filter(Allele.name.notlike('%Del%')) \
                .filter(Allele.name.notlike('%OR%')) \
                .filter(Sample.sample_name.in_(sample_names)) \
                .filter(AllelesSample.kdiff >= kdiff)

            by_locus = 'sort_order' in params and params['sort_order'] == 'Locus'
            gene_sort_column = Gene.locus_order if by_locus else Gene.alpha_order
            query = query.order_by(gene_sort_column, Patient.id, Allele.id)

            if params['novel_alleles'] == 'Exclude':
                query = query.filter(Allele.novel == 0)

            if params['ambiguous_alleles'] == 'Exclude':
                query = query.filter(Allele.is_single_allele == 1)

            allele_recs.extend(query.all())

        # Walk the records one run of consecutive same-gene rows at a time
        pos = 0
        total = len(allele_recs)

        while pos < total:
            gene_name = allele_recs[pos][0]
            gene_allele_ids = set()

            while pos < total and allele_recs[pos][0] == gene_name:
                gene_allele_ids.add(allele_recs[pos][1])
                pos += 1

            # If we have both an unambiguous allele and an ambiguous allele
            # containing that unambiguous one, drop the unambiguous one
            # because it is already counted
            if params['ambiguous_alleles'] != 'Exclude':
                patterns = session.query(AllelesPattern.pattern_id)\
                    .filter(AllelesPattern.allele_in_p_id.in_(gene_allele_ids))\
                    .filter(AllelesPattern.pattern_id.in_(gene_allele_ids))\
                    .all()
                gene_allele_ids = gene_allele_ids - {pattern[0] for pattern in patterns}

            gene_allele_counts.setdefault(gene_name, set()).update(gene_allele_ids)

    listed_allele_count = [(gene, len(alleles)) for gene, alleles in gene_allele_counts.items()]

    input_path = make_output_file('tab')
    pd.DataFrame(listed_allele_count, columns=['GENE', 'COUNT'])\
        .to_csv(input_path, sep='\t', index=False)

    output_path = make_output_file('html')
    cmd_line = ["-i", input_path, "-o", output_path, "-c", chain]

    script_ok = run_rscript(ALLELE_USAGE_SCRIPT, cmd_line)
    if not (script_ok and os.path.isfile(output_path) and os.path.getsize(output_path) != 0):
        raise BadRequest('No output from report')

    return send_report(output_path, format)
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Produce an HTML report comparing the alleles held in two AIRR-seq datasets.

    The report lists the alleles whose names occur in only one of the two
    datasets (annotated with the allele that names them in its ``similar``
    field in the other dataset, where there is one), followed by tables of
    the alleles common to both whose ``appears`` count, ``max_kdiff``
    (difference greater than 0.1), ``low_confidence`` flag or number of
    ``AlleleConfidenceReport`` rows differ.

    Allele names are listed in sorted order so that the report is stable
    from one run to the next.

    Args:
        format: requested output format; only ``'html'`` is accepted.
        species: key into ``vdjbase_dbs`` selecting the species.
        genomic_datasets, genomic_samples, rep_datasets, params: not used by this report.
        rep_samples: the selected samples; only each sample's ``'dataset'`` is read,
            and exactly two distinct datasets must be represented.

    Returns:
        Whatever ``send_report`` returns for the generated HTML file.

    Raises:
        BadRequest: the samples do not span exactly two datasets, or the
            format is not ``'html'``.
    """
    # Distinct dataset names, in order of first appearance.
    datasets = list(dict.fromkeys(sample['dataset'] for sample in rep_samples))

    if len(datasets) != 2:
        raise BadRequest('Please select exactly two AIRR-seq datasets to compare.')

    if format != 'html':
        raise BadRequest('Invalid format requested')

    sessions = []
    allele_lookups = []     # per dataset: allele name -> Allele record
    allele_similars = []    # per dataset: name listed in Allele.similar -> name of the listing allele

    for dataset in datasets:
        db_session = vdjbase_dbs[species][dataset].session
        lookup = {}
        similars = {}

        for allele in db_session.query(Allele).all():
            if allele.similar is not None:
                # Allele.similar is split on ', ' and any '|' characters are dropped.
                for sim in allele.similar.split(', '):
                    similars[sim.replace('|', '')] = allele.name
            lookup[allele.name] = allele

        sessions.append(db_session)
        allele_lookups.append(lookup)
        allele_similars.append(similars)

    allele_names = [set(lookup) for lookup in allele_lookups]
    common_allele_names = sorted(allele_names[0] & allele_names[1])
    table_header = '<table><tr><th>Allele</th><th>%s</th><th>%s</th></tr>' % (datasets[0], datasets[1])

    parts = ['<h2>Comparison of %s and %s</h2>' % (datasets[0], datasets[1])]

    # Alleles present in one dataset only, first for dataset 0 and then for dataset 1.
    for own, other in ((0, 1), (1, 0)):
        parts.append('<h2>Alleles only in %s</h2>' % datasets[own])
        exclusive = sorted(allele_names[own] - allele_names[other])
        parts.append('<br>'.join(
            '%s (%s in %s)' % (name, allele_similars[other][name], datasets[other])
            if name in allele_similars[other] else name
            for name in exclusive
        ))

    parts.append('<h2>Changed appearance counts</h2>')
    parts.append(table_header)
    for name in common_allele_names:
        first, second = allele_lookups[0][name], allele_lookups[1][name]
        if first.appears != second.appears:
            parts.append('<tr><th>%s</th><th>%d</th><th>%d</th></tr>' % (name, first.appears, second.appears))
    parts.append('</table>')

    # NOTE(review): the subtraction and the %.2f formatting below assume max_kdiff
    # is never NULL for an allele present in both datasets - confirm against the schema.
    parts.append('<h2>Changed max_kdiffs</h2>')
    parts.append(table_header)
    for name in common_allele_names:
        first, second = allele_lookups[0][name], allele_lookups[1][name]
        if abs(first.max_kdiff - second.max_kdiff) > 0.1:
            parts.append('<tr><th>%s</th><th>%.2f</th><th>%.2f</th></tr>' % (name, first.max_kdiff, second.max_kdiff))
    parts.append('</table>')

    parts.append('<h2>Changed confidence levels</h2>')
    parts.append(table_header)
    for name in common_allele_names:
        first, second = allele_lookups[0][name], allele_lookups[1][name]
        if first.low_confidence != second.low_confidence:
            parts.append('<tr><th>%s</th><th>%s</th><th>%s</th></tr>' % (
                name,
                'low' if first.low_confidence else 'high',
                'low' if second.low_confidence else 'high',
            ))
    parts.append('</table>')

    parts.append('<h2>Changed number of notes</h2>')
    parts.append(table_header)
    for name in common_allele_names:
        # One COUNT query per dataset per common allele.
        c0 = sessions[0].query(AlleleConfidenceReport) \
            .filter(AlleleConfidenceReport.allele_id == allele_lookups[0][name].id) \
            .count()
        c1 = sessions[1].query(AlleleConfidenceReport) \
            .filter(AlleleConfidenceReport.allele_id == allele_lookups[1][name].id) \
            .count()
        if c0 != c1:
            parts.append('<tr><th>%s</th><th>%d</th><th>%d</th></tr>' % (name, c0, c1))
    parts.append('</table>')

    # Write in one go once all database work is done, and close the file
    # before it is handed to send_report.
    output_path = make_output_file('html')
    with open(output_path, 'w') as fo:
        fo.write(''.join(parts))

    return send_report(output_path, 'html')
def _download_group_samples(rep_samples):
    """Return {dataset: [sample_name, ...]} for the selected samples, keeping selection order."""
    grouped = {}
    for rep_sample in rep_samples:
        grouped.setdefault(rep_sample['dataset'], []).append(rep_sample['sample_name'])
    return grouped


def _download_sample_info(species, rep_samples):
    """Write the sample-info attributes of the selected samples to a CSV file and send it."""
    attribute_query = []
    headers = []

    # Only filters that are backed by a model contribute a column.
    for name, info_filter in sample_info_filters.items():
        if info_filter['model'] is not None:
            attribute_query.append(info_filter['field'])
            headers.append(name)

    rows = []
    for dataset, sample_names in _download_group_samples(rep_samples).items():
        session = vdjbase_dbs[species][dataset].session

        for sample_chunk in chunk_list(sample_names, SAMPLE_CHUNKS):
            sample_list = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id) \
                .filter(Sample.sample_name.in_(sample_chunk)) \
                .all()
            sample_list = [s[0] for s in sample_list]

            results = session.query(*attribute_query) \
                .join(GenoDetection, GenoDetection.id == Sample.geno_detection_id) \
                .join(Patient, Patient.id == Sample.patient_id) \
                .join(SeqProtocol) \
                .join(TissuePro) \
                .join(Study, Sample.study_id == Study.id) \
                .filter(Sample.sample_name.in_(sample_list)) \
                .all()
            rows.extend(results)

    outfile = make_output_file('csv')
    with open(outfile, 'w', newline='') as fo:
        writer = csv.writer(fo, dialect='excel')
        writer.writerow(headers)
        writer.writerows(rows)

    # The file is closed (and therefore flushed) before it is sent.
    return send_report(outfile, 'csv', attachment_filename='sample_info.csv')


def _download_sample_files(species, rep_samples):
    """Zip the sample directories and IgSNPer plot files of the selected samples and send the archive."""
    outfile = make_output_file('zip')
    species_root = os.path.join(VDJBASE_SAMPLE_PATH, species)

    # Several samples can share a directory or a file; add each one only once.
    added_dirs = set()
    added_files = set()

    with zipfile.ZipFile(outfile, 'w', zipfile.ZIP_DEFLATED) as fo:
        for dataset, sample_names in _download_group_samples(rep_samples).items():
            session = vdjbase_dbs[species][dataset].session

            for sample_chunk in chunk_list(sample_names, SAMPLE_CHUNKS):
                sample_list = session.query(Sample.genotype, Sample.igsnper_plot_path) \
                    .filter(Sample.sample_name.in_(sample_chunk)) \
                    .all()

                for genotype_path, igsnper_plot_path in sample_list:
                    if genotype_path:
                        # The stored genotype path carries a 'samples/' component that
                        # is removed before locating the directory on disk.
                        sample_dir = os.path.join(
                            species_root,
                            dataset,
                            os.path.dirname(genotype_path.replace('samples/', '')),
                        )
                        if sample_dir not in added_dirs:
                            zipdir(sample_dir, fo, species_root)
                            added_dirs.add(sample_dir)

                    if igsnper_plot_path:
                        igsnper_path = os.path.join(species_root, dataset, igsnper_plot_path)
                        if igsnper_path not in added_files:
                            fo.write(igsnper_path, arcname=igsnper_path.replace(species_root, ''))
                            added_files.add(igsnper_path)

    # The archive must be closed before sending: closing writes the zip directory.
    return send_report(outfile, 'zip', attachment_filename='sample_data.zip')


def _download_sequences(species, rep_samples, params):
    """Write the selected sequences to a FASTA file (gapped or ungapped) and send it."""
    gapped = 'Gapped' in params['type']
    seqs = find_sequences(params, rep_samples, species, ['name', 'seq', 'dataset'])

    recs = []
    for seq in seqs:
        seq_id = '%s|%s|%s' % (seq['name'], species, seq['dataset'])
        # Ungapped output is the stored sequence with the '.' characters removed.
        sequence = seq['seq'] if gapped else seq['seq'].replace('.', '')
        recs.append(SeqRecord(Seq(sequence), id=seq_id, description=''))

    outfile = make_output_file('fasta')
    SeqIO.write(recs, outfile, "fasta")
    return send_report(outfile, 'fasta', attachment_filename='%s_sequences.fasta' % species)


def _download_gene_info(species, rep_samples, params):
    """Write the sequence attributes of the selected sequences to a CSV file and send it."""
    headers = [name for name, att_filter in sequence_filters.items() if att_filter['model'] is not None]
    headers.append('dataset')

    rows = find_sequences(params, rep_samples, species, headers)

    outfile = make_output_file('csv')
    with open(outfile, 'w', newline='') as fo:
        writer = csv.DictWriter(fo, dialect='excel', fieldnames=headers)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)

    return send_report(outfile, 'csv', attachment_filename='sequence_info.csv')


def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Build the download selected by ``params['type']`` for the chosen repertoire samples.

    The type is matched by substring, in this order:
      * ``'Sample info'``  - CSV of sample attributes (``sample_info.csv``)
      * ``'Sample files'`` - zip of sample directories and IgSNPer plots (``sample_data.zip``)
      * ``'Ungapped'`` / ``'Gapped'`` - FASTA of the sequences (``<species>_sequences.fasta``)
      * ``'Gene info'``    - CSV of sequence attributes (``sequence_info.csv``)

    Args:
        format: not consulted; the output format follows from the download type.
        species: key into ``vdjbase_dbs`` selecting the species.
        genomic_datasets, genomic_samples, rep_datasets: not used by this report.
        rep_samples: the selected samples, each providing ``'dataset'`` and ``'sample_name'``.
        params: report parameters; ``'type'`` is required.

    Returns:
        Whatever ``send_report`` returns for the generated file.

    Raises:
        BadRequest: no samples were selected, or ``params['type']`` matches none
            of the supported download types.
    """
    if not rep_samples:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    download_type = params['type']

    if 'Sample info' in download_type:
        return _download_sample_info(species, rep_samples)

    if 'Sample files' in download_type:
        return _download_sample_files(species, rep_samples)

    if 'Ungapped' in download_type or 'Gapped' in download_type:
        return _download_sequences(species, rep_samples, params)

    if 'Gene info' in download_type:
        return _download_gene_info(species, rep_samples, params)

    raise BadRequest('No output from report')
def run(format, species, genomic_datasets, genomic_samples, rep_datasets, rep_samples, params):
    """Produce the heterozygosity report: per gene, how many patients carry one allele vs several.

    For each gene, each patient is classed by the number of distinct allele
    ids recorded against that patient's selected samples: exactly one counts
    towards ``HM``, more than one counts towards ``HT``. Counts are summed
    across datasets and handed to ``HETEROZYGOSITY_SCRIPT`` for rendering.

    Args:
        format: requested output format; only ``'html'`` is accepted.
        species: key into ``vdjbase_dbs`` selecting the species.
        genomic_datasets, genomic_samples, rep_datasets: not used by this report.
        rep_samples: the selected repertoire samples, passed to ``collate_samples``.
        params: report parameters. ``'ambiguous_alleles'`` is read unconditionally;
            ``'f_kdiff'`` and ``'sort_order'`` are optional.

    Returns:
        Whatever ``send_report`` returns for the rendered HTML file.

    Raises:
        BadRequest: no samples selected, unsupported format, or the R script
            produced no output.
    """
    from itertools import groupby

    if not rep_samples:
        raise BadRequest('No repertoire-derived genotypes were selected.')

    if format != 'html':
        raise BadRequest('Invalid format requested')

    kdiff = float(params['f_kdiff']) if 'f_kdiff' in params and params['f_kdiff'] != '' else 0
    chain, samples_by_dataset = collate_samples(rep_samples)

    exclude_ambiguous = params['ambiguous_alleles'] == 'Exclude'
    locus_sort = 'sort_order' in params and params['sort_order'] == 'Locus'

    # Format we need to produce is [(gene_name, homozygous count, heterozygous count), ...]
    # matching the GENE / HM / HT columns written below.
    gene_hetrozygous_dis = {}

    for dataset, sample_names in samples_by_dataset.items():
        session = vdjbase_dbs[species][dataset].session
        allele_sample_recs = []

        # Samples are queried a chunk at a time (chunk size SAMPLE_CHUNKS).
        for sample_chunk in chunk_list(sample_names, SAMPLE_CHUNKS):
            sample_list = session.query(Sample.sample_name, Sample.genotype, Sample.patient_id) \
                .filter(Sample.sample_name.in_(sample_chunk)) \
                .all()
            sample_list, wanted_genes = apply_rep_filter_params(params, sample_list, session)
            sample_list = [s[0] for s in sample_list]

            if not sample_list:
                # Nothing in this chunk survived the filters: an empty IN list
                # cannot match any row, so skip the query altogether.
                continue

            query = session.query(Gene.name, Patient.id, Allele.id, Sample.sample_name,
                                  Gene.locus_order, AllelesSample.kdiff, Allele.name) \
                .join(Allele, Gene.id == Allele.gene_id) \
                .join(AllelesSample, Allele.id == AllelesSample.allele_id) \
                .join(Sample, Sample.id == AllelesSample.sample_id) \
                .join(Patient, Patient.id == Sample.patient_id) \
                .filter(Gene.name.in_(wanted_genes)) \
                .filter(Allele.name.notlike('%Del%')) \
                .filter(Allele.name.notlike('%OR%')) \
                .filter(Sample.sample_name.in_(sample_list)) \
                .filter(AllelesSample.kdiff >= kdiff)

            if exclude_ambiguous:
                query = query.filter(Allele.is_single_allele == True)

            gene_order = Gene.locus_order if locus_sort else Gene.alpha_order
            query = query.order_by(gene_order, Patient.id, Allele.id)

            allele_sample_recs.extend(query.all())

        # Rows within a chunk are ordered by gene and then patient, so consecutive
        # rows sharing a gene (and, inside that, a patient) form one run. Runs are
        # per chunk: a gene seen in several chunks is accumulated below, and a
        # patient whose samples fall in different chunks is classed once per chunk.
        for target_gene, gene_rows in groupby(allele_sample_recs, key=lambda rec: rec[0]):
            homozygous = 0
            heterozygous = 0

            for _patient, patient_rows in groupby(gene_rows, key=lambda rec: rec[1]):
                patient_allele_ids = {rec[2] for rec in patient_rows}

                # If we have both an unambiguous allele and an ambiguous allele containing
                # that unambiguous one, count it once only: drop every id that
                # AllelesPattern reports as a pattern_id whose allele_in_p_id is also
                # present in this patient's set.
                if not exclude_ambiguous:
                    patterns = session.query(AllelesPattern.pattern_id) \
                        .filter(AllelesPattern.allele_in_p_id.in_(patient_allele_ids)) \
                        .filter(AllelesPattern.pattern_id.in_(patient_allele_ids)) \
                        .all()
                    patient_allele_ids -= {pattern[0] for pattern in patterns}

                if len(patient_allele_ids) > 1:
                    heterozygous += 1
                elif patient_allele_ids:
                    homozygous += 1

            _, prev_hm, prev_ht = gene_hetrozygous_dis.get(target_gene, (target_gene, 0, 0))
            gene_hetrozygous_dis[target_gene] = (target_gene, prev_hm + homozygous, prev_ht + heterozygous)

    haplo_path = make_output_file('tab')
    df = pd.DataFrame(list(gene_hetrozygous_dis.values()), columns=['GENE', 'HM', 'HT'])
    df.to_csv(haplo_path, sep='\t', index=False)

    output_path = make_output_file('html')
    cmd_line = ["-i", haplo_path, "-o", output_path, "-c", chain]

    # Treat a missing or empty output file as failure even if the script call succeeded.
    if run_rscript(HETEROZYGOSITY_SCRIPT, cmd_line) \
            and os.path.isfile(output_path) \
            and os.path.getsize(output_path) != 0:
        return send_report(output_path, format)

    raise BadRequest('No output from report')