def handle(self, *args, **options):
    """Attach CNV calls from a tab-delimited file to stored variants.

    Reads ``options['project_id']``, ``options['cnv_filename']`` and
    ``options['bed_files_directory']``. The first line of the CNV file is
    the header; every following line is turned into a dict keyed by the
    header columns (``chr``, ``start``, ``end`` and ``sample`` are read).

    For each row the matching Individual is looked up, its ``cnv_bed_file``
    is pointed at ``<bed_files_directory>/<sample>.bed`` when it differs, and
    every document whose ``xpos`` lies within [start, end] in the project
    and family collections gets the row stored under
    ``genotypes.<indiv_id>.extras.cnvs``.

    Raises:
        ValueError: if the CNV file does not exist.
    """
    project_id = options['project_id']
    print("Loading data into project: " + project_id)
    project = Project.objects.get(project_id=project_id)

    cnv_path = options['cnv_filename']
    bed_dir = options['bed_files_directory']
    if not os.path.isfile(cnv_path):
        raise ValueError("CNV file %s doesn't exist" % cnv_path)

    with open(cnv_path) as cnv_file:
        column_names = cnv_file.readline().rstrip('\n').split('\t')
        for raw_line in cnv_file:
            row_dict = dict(zip(column_names, raw_line.rstrip('\n').split('\t')))
            chrom = "chr" + row_dict['chr']
            start = int(row_dict['start'])
            end = int(row_dict['end'])
            sample_id = row_dict['sample']

            # Rows whose sample cannot be resolved to a single Individual are
            # reported and skipped rather than aborting the whole load.
            try:
                individual = Individual.objects.get(project=project, indiv_id__istartswith=sample_id)
            except Exception as e:
                print("WARNING: %s: %s not found in %s" % (e, sample_id, project))
                continue

            bed_file_path = os.path.join(bed_dir, "%s.bed" % sample_id)
            if not os.path.isfile(bed_file_path):
                # Only a warning: the path is still recorded below.
                print("WARNING: .bed file not found: " + bed_file_path)
            if individual.cnv_bed_file != bed_file_path:
                print("Setting cnv_bed_file path to %s" % bed_file_path)
                individual.cnv_bed_file = bed_file_path
                individual.save()

            candidate_collections = [
                get_project_datastore(project)._get_project_collection(project_id),
                get_mall(project).variant_store._get_family_collection(project_id, individual.family.family_id),
            ]
            cnv_field = 'genotypes.%s.extras.cnvs' % individual.indiv_id
            # filter(None, ...) drops collections that are missing/falsy.
            for collection in filter(None, candidate_collections):
                xpos_in_cnv = {'$and': [
                    {'xpos': {'$gte': genomeloc.get_single_location(chrom, start)}},
                    {'xpos': {'$lte': genomeloc.get_single_location(chrom, end)}},
                ]}
                collection.update_many(xpos_in_cnv, {'$set': {cnv_field: row_dict}})

    print("Done")
def iterate_coverage_bed_file(bed_file):
    """Yield a CoverageInterval for every line of a tab-delimited coverage file.

    Each line supplies, in order: chromosome name (without the "chr"
    prefix), start, end, and a coverage tag that is translated through
    ``constants.COVERAGE_TAG_MAP``.
    """
    for raw_line in bed_file:
        columns = raw_line.strip().split('\t')
        contig = 'chr' + columns[0]
        first_base = int(columns[1])
        # End column is reduced by one before conversion
        # (presumably half-open BED end -> inclusive end; confirm with caller).
        last_base = int(columns[2]) - 1
        yield CoverageInterval(
            xstart=genomeloc.get_single_location(contig, first_base),
            xstop=genomeloc.get_single_location(contig, last_base),
            coverage=constants.COVERAGE_TAG_MAP[columns[3]],
        )
def get_variant_from_vcf_fields(vcf_fields, alt_allele_pos):
    """
    Get a basic variant from vcf_fields, for allele given by alt_allele_pos

    vcf_fields is indexed positionally: [0] chromosome, [1] position,
    [2] variant id, [3] ref allele, [4] comma-separated alt alleles.
    alt_allele_pos is the index into the comma-separated alt alleles.

    Returns the Variant (with 'alt_allele_pos' and 'orig_alt_alleles' stored
    as extras, and vcf_id set when the id column is non-empty and not '.'),
    or None if genomeloc does not consider the position valid.
    """
    # NOTE(review): this is a substring test, not a prefix test - any name
    # containing "chr" anywhere is left untouched.
    chrom = vcf_fields[0] if 'chr' in vcf_fields[0] else 'chr' + vcf_fields[0]
    pos = int(vcf_fields[1])

    # if we can't get a genomic location, just ignore it and print a message humans will ignore too
    # obviously need a better way to approach this
    if not genomeloc.valid_pos(chrom, pos):
        # Call form with a single pre-formatted argument: same output as the
        # old print statement on Python 2, and valid syntax on Python 3.
        print("ERROR: could not figure out coordinates for %s:%d...maybe a nonstandard chromosome?" % (chrom, pos))
        return None

    ref = vcf_fields[3]
    orig_alt_alleles = vcf_fields[4].split(',')
    alt = orig_alt_alleles[alt_allele_pos]

    xpos = genomeloc.get_single_location(chrom, pos)
    xpos, ref, alt = get_minimal_representation(xpos, ref, alt)

    variant = Variant(xpos, ref, alt)
    # Keep the original multi-allelic context alongside the minimal representation.
    variant.set_extra('alt_allele_pos', alt_allele_pos)
    variant.set_extra('orig_alt_alleles', orig_alt_alleles)

    if vcf_fields[2] and vcf_fields[2] != '.':
        variant.vcf_id = vcf_fields[2]

    return variant
def iterate_coverage_bed_file(bed_file):
    """Yield one coverage dict per line of a tab-delimited coverage file.

    Each line supplies, in order: chromosome name (without the "chr"
    prefix), start, end, and a coverage tag looked up in
    ``constants.COVERAGE_TAG_MAP``. The yielded dict has the keys
    ``xstart``, ``xend``, ``xpos`` (a nested copy of the two bounds) and
    ``coverage``.
    """
    for raw_line in bed_file:
        columns = raw_line.strip().split('\t')
        contig = 'chr' + columns[0]
        start_pos, end_pos = int(columns[1]), int(columns[2])

        interval_start = genomeloc.get_single_location(contig, start_pos)
        interval_end = genomeloc.get_single_location(contig, end_pos)
        coverage_tag = constants.COVERAGE_TAG_MAP[columns[3]]

        record = {'xstart': interval_start, 'xend': interval_end}
        # for geospatial indexing
        record['xpos'] = {'xstart': interval_start, 'xend': interval_end}
        record['coverage'] = coverage_tag
        yield record
def get_all_exons(self):
    """
    Get a list of all exons (order not guaranteed) from ensembl
    Fetched from database, not REST

    Each returned dict has the keys exon_id, xstart and xstop. Rows whose
    seq_region name does not map to a chromosome are left out.
    """
    cursor = self.db_conn.cursor()
    cursor.execute(
        "select exon.stable_id, seq_region.name, exon.seq_region_start, exon.seq_region_end from exon "
        "join seq_region on exon.seq_region_id=seq_region.seq_region_id"
    )

    exons = []
    for stable_id, region_name, region_start, region_end in cursor:
        chrom = ensembl_parsing_utils.get_chr_from_seq_region_name(region_name)
        if chrom is None:
            continue
        exons.append({
            'exon_id': stable_id,
            'xstart': genomeloc.get_single_location(chrom, region_start),
            'xstop': genomeloc.get_single_location(chrom, region_end),
        })
    return exons
def load_coverage_file(path):
    """Load the given ExAC coverage file

    The gzipped file is tab-delimited with a header line; the leading
    "#chrom" column is renamed to "chrom". Every column from the third
    onwards is converted to float, "pos" is converted to int, an "xpos"
    value is added, and the resulting dict is inserted into
    COVERAGE_DB.exac_v3_coverage.
    """
    print("Loading file: " + path)
    with gzip.open(path) as coverage_file:
        header_line = next(coverage_file)
        column_names = header_line.replace("#chrom", "chrom").rstrip('\n').split('\t')

        for raw_line in coverage_file:
            cells = raw_line.rstrip('\n').split('\t')
            # First two columns stay as text; the rest are coverage stats.
            stats = [float(cell) for cell in cells[2:]]
            record = dict(zip(column_names, cells[:2] + stats))

            chrom = 'chr' + record['chrom']
            record['pos'] = int(record['pos'])
            record['xpos'] = genomeloc.get_single_location(chrom, record['pos'])

            COVERAGE_DB.exac_v3_coverage.insert(record)
def get_exac_af(chrom, pos, ref, alt):
    """Look up ExAC allele frequencies for a single variant.

    Records within len(ref) + len(alt) bases of ``pos`` are fetched from
    ``exac_vcf``; each alt allele is reduced with
    ``get_minimal_representation`` and compared against (xpos, ref, alt).
    When several alleles match, an error is printed and the last match wins.

    Returns:
        (global_af, pop_max_af, pop_max_population), or (None, None, None)
        when no ExAC allele matches. ``pop_max_af`` stays at -1 and
        ``pop_max_population`` at None if no population has AN > 0.
    """
    populations = ['AMR', 'EAS', 'FIN', 'NFE', 'SAS', 'AFR']
    chrom_without_chr = chrom.replace("chr", "")
    xpos = genomeloc.get_single_location(chrom, pos)
    search_radius = len(ref) + len(alt)

    # check whether the alleles match
    match_record = None
    match_allele_index = None
    nearby_records = exac_vcf.fetch(chrom_without_chr, pos - search_radius, pos + search_radius)
    for record in nearby_records:
        record_xpos = genomeloc.get_xpos(record.CHROM, record.POS)
        for allele_index, raw_alt in enumerate(record.ALT):
            candidate = get_minimal_representation(record_xpos, str(record.REF), str(raw_alt))
            candidate_xpos, candidate_ref, candidate_alt = candidate
            if not (candidate_xpos == xpos and candidate_ref == ref and candidate_alt == alt):
                continue
            if match_record is not None:
                print("ERROR: multiple exac variants match the variant: %s %s %s %s" % (chrom, pos, ref, alt))
            match_record = record
            match_allele_index = allele_index

    if match_record is None:
        return None, None, None

    # Highest per-population frequency among populations that were genotyped.
    pop_max_af = -1
    pop_max_population = None
    for population in populations:
        if match_record.INFO['AN_' + population] > 0:
            allele_count = match_record.INFO['AC_' + population][match_allele_index]
            pop_af = allele_count / float(match_record.INFO['AN_' + population])
            if pop_af > pop_max_af:
                pop_max_af = pop_af
                pop_max_population = population

    if match_record.INFO['AN_Adj'] == 0:
        # No adjusted chromosomes, so the adjusted allele count must be 0 too.
        assert float(match_record.INFO['AC_Adj'][match_allele_index]) == 0
        global_af = 0
    else:
        adjusted_count = float(match_record.INFO['AC_Adj'][match_allele_index])
        global_af = adjusted_count / float(match_record.INFO['AN_Adj'])

    return global_af, pop_max_af, pop_max_population
def handle_individual(self, project, individual):
    """Write the tab-delimited variant reports for one individual.

    Two files are written to the current working directory:

    * ``report_for_<project_id>_<indiv_id>.flagged.txt`` - only when the
      individual's family has at least one variant tagged "REPORT" or
      carrying a note whose text starts with "REPORT". One row per such
      variant, with all notes for that variant joined by "||" in the
      comments column.
    * ``report_for_<project_id>_<indiv_id>.genes.txt`` - one row per variant
      found inside each region of the module-level ``gene_loc`` mapping.

    Rows for which ``self.get_output_row`` returns None are skipped.

    Raises:
        ValueError: if a flagged variant is missing from the variant store.
    """
    project_id = project.project_id
    individual_id = individual.indiv_id

    print("Processing individual %s" % individual_id)

    # get variants that have been tagged or that have a note that starts with "REPORT"
    variants_in_report_and_notes = defaultdict(str)
    for vt in VariantTag.objects.filter(project_tag__project=project, project_tag__tag="REPORT", family=individual.family):
        variants_in_report_and_notes[(vt.xpos, vt.ref, vt.alt)] = ""

    for vn in VariantNote.objects.filter(project=project, family=individual.family):
        if vn.note and vn.note.strip().startswith("REPORT"):
            variants_in_report_and_notes[(vn.xpos, vn.ref, vn.alt)] = ""

    header = ["gene_name", "genotype", "variant", "functional_class", "hgvs_c", "hgvs_p", "rsid", "exac_global_af", "exac_pop_max_af", "exac_pop_max_population", "clinvar_clinsig", "clinvar_clnrevstat", "number_of_stars", "clinvar_url", "comments"]

    if variants_in_report_and_notes:
        with open("report_for_%s_%s.flagged.txt" % (project_id, individual_id), "w") as out:
            out.write("\t".join(header) + "\n")

            # retrieve text of all notes that were left for any of these variants
            for vn in VariantNote.objects.filter(project=project, family=individual.family):
                key = (vn.xpos, vn.ref, vn.alt)
                if vn.note and key in variants_in_report_and_notes:
                    other_notes = variants_in_report_and_notes[key]
                    if other_notes:
                        other_notes += "||"
                    variants_in_report_and_notes[key] = other_notes + "%s|%s|%s" % (vn.date_saved, vn.user.email, vn.note.strip())

            for (xpos, ref, alt), notes in variants_in_report_and_notes.items():
                v = get_mall(project_id).variant_store.get_single_variant(project_id, individual.family.family_id, xpos, ref, alt)
                if v is None:
                    raise ValueError("Couldn't find variant in variant store for: %s, %s, %s %s %s" % (project_id, individual.family.family_id, xpos, ref, alt))

                row = self.get_output_row(v, xpos, ref, alt, individual_id, individual.family, all_fields=True, comments=notes)
                if row is None:
                    continue
                out.write("\t".join(row) + "\n")

    with open("report_for_%s_%s.genes.txt" % (project_id, individual_id), "w") as out:
        genes_header = ["gene_chrom", "gene_start", "gene_end"] + header + ["json_dump"]
        out.write("\t".join(genes_header) + "\n")

        for gene_id, (chrom, start, end) in gene_loc.items():
            xpos_start = genomeloc.get_single_location("chr" + chrom, start)
            xpos_end = genomeloc.get_single_location("chr" + chrom, end)
            for v in get_mall(project_id).variant_store.get_variants_in_range(project_id, individual.family.family_id, xpos_start, xpos_end):
                json_dump = str(v.genotypes)
                # .get() instead of []: indexing a defaultdict never raises
                # KeyError, it silently inserts an entry for every variant
                # scanned, so the old try/except fallback was dead code.
                notes = variants_in_report_and_notes.get((v.xpos, v.ref, v.alt), "")

                row = self.get_output_row(v, v.xpos, v.ref, v.alt, individual_id, individual.family, comments=notes, gene_id=gene_id)
                if row is None:
                    continue
                row = map(str, [chrom, start, end] + row + [json_dump])
                out.write("\t".join(row) + "\n")
def get_gene_structure(self, gene_id):
    """
    Query ensembl API for the transcript/exon structure of a gene
    This is the foundation of the elements in db.genes
    Exception if can't process gene

    Returns a dict with the keys chr, start, stop, xstart, xstop, gene_id,
    symbol, description, biotype, transcripts, exons and cds.

    Raises Exception if the API returns zero or more than one gene with
    this ID, or if the gene's seq_region does not map to a chromosome.
    """
    def fetch_features(feature_id, feature_type):
        # One GET to <rest url>/feature/id/<feature_id>, decoded as JSON.
        url = self._get_rest_url() + '/feature/id/%s' % feature_id
        params = {'content-type': 'application/json', 'feature': feature_type}
        return requests.get(url, params=params).json()

    gene = {}

    # gene basics
    gene_list_json = [item for item in fetch_features(gene_id, 'gene') if item['ID'] == gene_id]
    if len(gene_list_json) == 0:
        raise Exception("No genes with ID %s" % gene_id)
    if len(gene_list_json) > 1:
        raise Exception(">1 ensembl genes with ID %s" % gene_id)
    gene_json = gene_list_json[0]

    chrom = ensembl_parsing_utils.get_chr_from_seq_region_name(gene_json['seq_region_name'])
    if chrom is None:
        # Report the raw seq_region name: chrom itself is always None here,
        # which made the previous message useless for diagnosis.
        raise Exception("Gene %s is on a nonstandard chromosome: %s" % (gene_id, gene_json['seq_region_name']))

    gene['chr'] = chrom
    gene['start'] = gene_json['start']
    gene['stop'] = gene_json['end']
    gene['xstart'] = genomeloc.get_single_location(chrom, gene['start'])
    gene['xstop'] = genomeloc.get_single_location(chrom, gene['stop'])

    gene['gene_id'] = gene_json['ID']
    gene['symbol'] = gene_json['external_name']
    gene['description'] = gene_json['description']
    gene['biotype'] = gene_json['biotype']

    # transcripts
    transcript_json = [t for t in fetch_features(gene_id, 'transcript') if t['Parent'] == gene_id]
    gene['transcripts'] = []
    for t in transcript_json:
        transcript_id = t['ID']
        transcript = dict(
            transcript_id=transcript_id,
            biotype=t['biotype'],
            start=t['start'],
            stop=t['end']
        )
        transcript['xstart'] = genomeloc.get_single_location(chrom, transcript['start'])
        transcript['xstop'] = genomeloc.get_single_location(chrom, transcript['stop'])

        # exons_for_transcript: IDs only, ordered by start coordinate
        transcript_exon_json = fetch_features(transcript_id, 'exon')
        transcript['exons'] = [
            e['ID'] for e in sorted(transcript_exon_json, key=lambda x: x['start'])
            if e['Parent'] == transcript_id
        ]
        gene['transcripts'].append(transcript)

    # exons
    exon_json = fetch_features(gene_id, 'exon')
    transcript_ids = {t['transcript_id'] for t in gene['transcripts']}
    exon_ids_seen = set()
    gene['exons'] = []
    for e in exon_json:
        exon_id = e['ID']
        # skip exons that aren't actually in one of this gene's transcripts
        if e['Parent'] not in transcript_ids:
            continue
        # the same exon is listed once per parent transcript; keep the first
        if exon_id in exon_ids_seen:
            continue
        exon = {
            'exon_id': exon_id,
            'start': e['start'],
            'stop': e['end'],
        }
        exon['xstart'] = genomeloc.get_single_location(chrom, exon['start'])
        exon['xstop'] = genomeloc.get_single_location(chrom, exon['stop'])
        gene['exons'].append(exon)
        exon_ids_seen.add(exon_id)

    # cds
    cds_json = fetch_features(gene_id, 'cds')
    cds_map = {}  # map from (start, stop) -> {start, stop, transcripts}
    for c in cds_json:
        # skip cds segments that aren't actually in one of this gene's transcripts
        if c['Parent'] not in transcript_ids:
            continue
        cds_t = (c['start'], c['end'])
        if cds_t not in cds_map:
            cds_map[cds_t] = {
                'start': c['start'],
                'stop': c['end'],
                'xstart': genomeloc.get_single_location(chrom, c['start']),
                'xstop': genomeloc.get_single_location(chrom, c['end']),
                'transcripts': [],
            }
        cds_map[cds_t]['transcripts'].append(c['Parent'])
    gene['cds'] = sorted(cds_map.values(), key=lambda x: (x['start'], x['stop']))
    # cds_id is the 1-based rank of the segment in coordinate order
    for i, cds in enumerate(gene['cds']):
        cds['cds_id'] = '%s-%i' % (gene['gene_id'], i+1)

    return gene
def handle_individual(self, project, individual):
    """Write the tab-delimited variant reports for one individual.

    Returns early (after printing a message) when the individual's family
    has no variant tagged "REPORT" and no note whose text starts with
    "REPORT". Otherwise two files are written to the current working
    directory:

    * ``report_for_<project_id>_<indiv_id>.flagged.txt`` - one row per
      flagged variant, with all notes for that variant joined by "||" in
      the comments column.
    * ``report_for_<project_id>_<indiv_id>.genes.txt`` - one row per alt
      allele of every variant found inside each region of the module-level
      ``gene_loc`` mapping.

    Rows for which ``self.get_output_row`` returns None are skipped.

    Raises:
        ValueError: if a flagged variant is missing from the variant store.
    """
    project_id = project.project_id
    individual_id = individual.indiv_id

    # get variants that have been tagged or that have a note that starts with "REPORT"
    variants_in_report_and_notes = defaultdict(str)
    for vt in VariantTag.objects.filter(project_tag__project=project,
                                        project_tag__tag="REPORT",
                                        family=individual.family):
        variants_in_report_and_notes[(vt.xpos, vt.ref, vt.alt)] = ""

    for vn in VariantNote.objects.filter(project=project, family=individual.family):
        if vn.note and vn.note.strip().startswith("REPORT"):
            variants_in_report_and_notes[(vn.xpos, vn.ref, vn.alt)] = ""

    if not variants_in_report_and_notes:
        print("skipping individual %s since no variants are tagged in family %s..." % (individual_id, individual.family.family_id))
        return

    header = [
        "gene_name", "genotype", "variant", "hgvs_c", "hgvs_p", "rsid",
        "exac_global_af", "exac_pop_max_af", "exac_pop_max_population",
        "clinvar_clinsig", "clinvar_clnrevstat", "number_of_stars",
        "clinvar_url", "comments"
    ]

    with open("report_for_%s_%s.flagged.txt" % (project_id, individual_id), "w") as out:
        out.write("\t".join(header) + "\n")

        # retrieve text of all notes that were left for any of these variants
        for vn in VariantNote.objects.filter(project=project, family=individual.family):
            key = (vn.xpos, vn.ref, vn.alt)
            if vn.note and key in variants_in_report_and_notes:
                other_notes = variants_in_report_and_notes[key]
                if other_notes:
                    other_notes += "||"
                variants_in_report_and_notes[key] = other_notes + "%s|%s|%s" % (vn.date_saved, vn.user.email, vn.note.strip())

        for (xpos, ref, alt), notes in variants_in_report_and_notes.items():
            v = get_mall(project_id).variant_store.get_single_variant(project_id, individual.family.family_id, xpos, ref, alt)
            if v is None:
                raise ValueError("Couldn't find variant in variant store for: %s, %s, %s %s %s" % (project_id, individual.family.family_id, xpos, ref, alt))

            row = self.get_output_row(v, v.ref, v.alt, individual.indiv_id, individual.family, all_fields=True, comments=notes)
            if row is None:
                continue
            out.write("\t".join(row) + "\n")

    with open("report_for_%s_%s.genes.txt" % (project_id, individual_id), "w") as out:
        genes_header = ["gene_chrom", "gene_start", "gene_end"] + header + ["json_dump"]
        out.write("\t".join(genes_header) + "\n")

        for gene_name, (chrom, start, end) in gene_loc.items():
            xpos_start = genomeloc.get_single_location("chr" + chrom, start)
            xpos_end = genomeloc.get_single_location("chr" + chrom, end)
            for v in get_mall(project_id).variant_store.get_variants_in_range(project_id, individual.family.family_id, xpos_start, xpos_end):
                json_dump = str(v.genotypes)
                # v.alt may hold several comma-separated alleles; one row each
                for alt in v.alt.split(","):
                    # .get() instead of []: indexing a defaultdict never raises
                    # KeyError, it silently inserts an entry for every allele
                    # scanned, so the old try/except fallback was dead code.
                    notes = variants_in_report_and_notes.get((v.xpos, v.ref, alt), "")

                    row = self.get_output_row(v, v.ref, alt, individual.indiv_id, individual.family, comments=notes)
                    if row is None:
                        continue
                    row = map(str, [chrom, start, end] + row + [json_dump])
                    out.write("\t".join(row) + "\n")
def write_snp_fileset(family, output_dir_path):
    """
    Write a set of files for a family that can be passed to linkage engine
    Creates the following files:
        variants.txt
        [family_id].fam
        markers.txt
        disease_model.txt

    family: object providing get_individuals(), family_id and project.project_id
    output_dir_path: directory the four files are written into
    """
    individuals = family.get_individuals()

    # Column codes for the .fam file; anything unrecognised is written as '0'.
    sex_codes = {'M': '1', 'F': '2'}
    affected_codes = {'N': '1', 'A': '2'}

    # fam file
    fam_file_path = os.path.join(output_dir_path, family.family_id + '.fam')
    with open(fam_file_path, 'w') as f:
        for indiv in individuals:
            fields = [
                family.family_id,
                indiv.indiv_id,
                indiv.paternal_id if indiv.paternal_id else '.',
                indiv.maternal_id if indiv.maternal_id else '.',
                # Previously both branches tested 'F', so males were written
                # as '0' (unknown) instead of '1'.
                sex_codes.get(indiv.gender, '0'),
                affected_codes.get(indiv.affected, '0'),
            ]
            f.write('\t'.join(fields)+'\n')

    # markers.txt
    markers_path = os.path.join(output_dir_path, 'markers.txt')
    shutil.copy(settings.COMMON_SNP_FILE, markers_path)

    # disease model
    disease_model_path = os.path.join(output_dir_path, 'disease_model.txt')
    with open(disease_model_path, 'w') as f:
        f.writelines([
            "DD\t.001\n",
            "Dd\t.001\n",
            "dd\t.999\n",
        ])

    # variants.txt: one line per common SNP, one genotype column per individual
    variants_file_path = os.path.join(output_dir_path, 'variants.txt')
    with open(variants_file_path, 'w') as f, open(settings.COMMON_SNP_FILE) as snp_file:
        f.write('#CHR\tPOS\tREF\tALT')
        for indiv in individuals:
            f.write('\t'+indiv.indiv_id)
        f.write('\n')

        for _line in snp_file:
            snp = _line.strip('\n').split('\t')
            xpos = genomeloc.get_single_location('chr'+snp[0], int(snp[1]))
            ref = snp[2]
            alt = snp[3]
            variant = get_mall().variant_store.get_single_variant(family.project.project_id, family.family_id, xpos, ref, alt)
            fields = [snp[0], snp[1], snp[2], snp[3]]
            for indiv in individuals:
                if variant:
                    genotype = variant.get_genotype(indiv.indiv_id)
                    fields.append(str(genotype.num_alt) if genotype.num_alt is not None else '.')
                else:
                    # SNP absent from the variant store: written as 0 alt alleles
                    fields.append('0')
            f.write('\t'.join(fields)+'\n')
def handle(self, *args, **options):
    """Load CNV calls from a tab-delimited file into the variant collections.

    Options read: ``project_id``, ``cnv_filename``, ``bed_files_directory``.
    The CNV file's first line is the header; each later line becomes a dict
    keyed by header column, from which ``chr``, ``start``, ``end`` and
    ``sample`` are read.

    Per row: the Individual is resolved from the sample id, its
    ``cnv_bed_file`` is updated to ``<bed_files_directory>/<sample>.bed`` if
    different, and all documents with ``xpos`` inside [start, end] in the
    project and family collections have the row written to
    ``genotypes.<indiv_id>.extras.cnvs``.

    Raises:
        ValueError: if the CNV file does not exist.
    """
    project_id = options['project_id']
    print("Loading data into project: " + project_id)
    project = Project.objects.get(project_id=project_id)

    cnv_path = options['cnv_filename']
    bed_dir = options['bed_files_directory']
    if not os.path.isfile(cnv_path):
        raise ValueError("CNV file %s doesn't exist" % cnv_path)

    with open(cnv_path) as cnv_file:
        column_names = cnv_file.readline().rstrip('\n').split('\t')

        for raw_line in cnv_file:
            values = raw_line.rstrip('\n').split('\t')
            row_dict = dict(zip(column_names, values))

            chrom = "chr" + row_dict['chr']
            start = int(row_dict['start'])
            end = int(row_dict['end'])
            sample_id = row_dict['sample']

            # An unresolved sample only produces a warning; the row is skipped.
            try:
                individual = Individual.objects.get(
                    project=project, indiv_id__istartswith=sample_id)
            except Exception as e:
                print("WARNING: %s: %s not found in %s" % (e, sample_id, project))
                continue

            bed_file_path = os.path.join(bed_dir, "%s.bed" % sample_id)
            if not os.path.isfile(bed_file_path):
                # Warn but still record the expected path below.
                print("WARNING: .bed file not found: " + bed_file_path)

            if individual.cnv_bed_file != bed_file_path:
                print("Setting cnv_bed_file path to %s" % bed_file_path)
                individual.cnv_bed_file = bed_file_path
                individual.save()

            project_collection = get_project_datastore(project)._get_project_collection(project_id)
            family_collection = get_mall(project).variant_store._get_family_collection(
                project_id, individual.family.family_id)

            update_spec = {'$set': {'genotypes.%s.extras.cnvs' % individual.indiv_id: row_dict}}
            # filter(None, ...) skips whichever collection is missing/falsy.
            for collection in filter(None, [project_collection, family_collection]):
                lower_bound = {'xpos': {'$gte': genomeloc.get_single_location(chrom, start)}}
                upper_bound = {'xpos': {'$lte': genomeloc.get_single_location(chrom, end)}}
                collection.update_many({'$and': [lower_bound, upper_bound]}, update_spec)

    print("Done")
def handle_individual(self, project, individual):
    """Write the tab-delimited variant reports for one individual.

    Two files are written to the current working directory:

    * ``report_for_<project_id>_<indiv_id>.flagged.txt`` - only when the
      individual's family has at least one variant tagged "REPORT" or
      carrying a note whose text starts with "REPORT". One row per such
      variant, with all notes for that variant joined by "||" in the
      comments column. A variant missing from the current project is looked
      up again under the hard-coded 'MYOSEQ_v20_previous1' project id.
    * ``report_for_<project_id>_<indiv_id>.genes.txt`` - one row per variant
      found inside each region of the module-level ``gene_loc`` mapping.

    Rows for which ``self.get_output_row`` returns None are skipped.

    Raises:
        ValueError: if a flagged variant is found in neither callset.
    """
    project_id = project.project_id
    individual_id = individual.indiv_id

    print("Processing individual %s" % individual_id)

    # get variants that have been tagged or that have a note that starts with "REPORT"
    variants_in_report_and_notes = defaultdict(str)
    for vt in VariantTag.objects.filter(project_tag__project=project,
                                        project_tag__tag="REPORT",
                                        family=individual.family):
        variants_in_report_and_notes[(vt.xpos, vt.ref, vt.alt)] = ""

    for vn in VariantNote.objects.filter(project=project, family=individual.family):
        if vn.note and vn.note.strip().startswith("REPORT"):
            variants_in_report_and_notes[(vn.xpos, vn.ref, vn.alt)] = ""

    header = [
        "gene_name", "genotype", "variant", "functional_class", "hgvs_c",
        "hgvs_p", "rsid", "exac_global_af", "exac_pop_max_af",
        "exac_pop_max_population", "clinvar_clinsig", "clinvar_clnrevstat",
        "number_of_stars", "clinvar_url", "comments"
    ]

    if variants_in_report_and_notes:
        with open("report_for_%s_%s.flagged.txt" % (project_id, individual_id), "w") as out:
            out.write("\t".join(header) + "\n")

            # retrieve text of all notes that were left for any of these variants
            for vn in VariantNote.objects.filter(project=project, family=individual.family):
                key = (vn.xpos, vn.ref, vn.alt)
                if vn.note and key in variants_in_report_and_notes:
                    other_notes = variants_in_report_and_notes[key]
                    if other_notes:
                        other_notes += "||"
                    variants_in_report_and_notes[key] = other_notes + "%s|%s|%s" % (vn.date_saved, vn.user.email, vn.note.strip())

            for (xpos, ref, alt), notes in variants_in_report_and_notes.items():
                v = get_mall(project).variant_store.get_single_variant(project_id, individual.family.family_id, xpos, ref, alt)
                if v is None:
                    # Fall back to the previous callset before giving up.
                    print("Rerieving variant from previous callset version (MYOSEQ_v20_previous1)")
                    v = get_mall(project).variant_store.get_single_variant('MYOSEQ_v20_previous1', individual.family.family_id, xpos, ref, alt)
                    if v is None:
                        raise ValueError("Couldn't find variant in variant store for: %s, %s, %s %s %s" % (project_id, individual.family.family_id, xpos, ref, alt))

                row = self.get_output_row(v, xpos, ref, alt, individual_id, individual.family, all_fields=True, comments=notes)
                if row is None:
                    continue
                out.write("\t".join(row) + "\n")

    with open("report_for_%s_%s.genes.txt" % (project_id, individual_id), "w") as out:
        genes_header = ["gene_chrom", "gene_start", "gene_end"] + header + ["json_dump"]
        out.write("\t".join(genes_header) + "\n")

        for gene_id, (chrom, start, end) in gene_loc.items():
            xpos_start = genomeloc.get_single_location("chr" + chrom, start)
            xpos_end = genomeloc.get_single_location("chr" + chrom, end)
            variant_filter = VariantFilter(locations=[(xpos_start, xpos_end)])
            for v in get_mall(project).variant_store.get_variants(project_id, individual.family.family_id, variant_filter=variant_filter):
                json_dump = str(v.genotypes)
                # .get() instead of []: indexing a defaultdict never raises
                # KeyError, it silently inserts an entry for every variant
                # scanned, so the old try/except fallback was dead code.
                notes = variants_in_report_and_notes.get((v.xpos, v.ref, v.alt), "")

                row = self.get_output_row(v, v.xpos, v.ref, v.alt, individual_id, individual.family, comments=notes, gene_id=gene_id)
                if row is None:
                    continue
                # Normalise the chromosome column to carry exactly one "chr" prefix.
                row = map(str, ["chr" + chrom.replace("chr", ""), start, end] + row + [json_dump])
                out.write("\t".join(row) + "\n")
def get_gene_structure(self, gene_id):
    """
    Query ensembl API for the transcript/exon structure of a gene
    This is the foundation of the elements in db.genes
    Exception if can't process gene

    Returns a dict with the keys chr, start, stop, xstart, xstop, gene_id,
    symbol, description, biotype, transcripts, exons and cds.

    Raises Exception if the API returns zero or more than one gene with
    this ID, or if the gene's seq_region does not map to a chromosome.
    """
    def fetch_features(feature_id, feature_type):
        # One GET to <rest url>/feature/id/<feature_id>, decoded as JSON.
        url = self._get_rest_url() + '/feature/id/%s' % feature_id
        params = {'content-type': 'application/json', 'feature': feature_type}
        return requests.get(url, params=params).json()

    gene = {}

    # gene basics
    gene_list_json = [
        item for item in fetch_features(gene_id, 'gene')
        if item['ID'] == gene_id
    ]
    if len(gene_list_json) == 0:
        raise Exception("No genes with ID %s" % gene_id)
    if len(gene_list_json) > 1:
        raise Exception(">1 ensembl genes with ID %s" % gene_id)
    gene_json = gene_list_json[0]

    chrom = ensembl_parsing_utils.get_chr_from_seq_region_name(
        gene_json['seq_region_name'])
    if chrom is None:
        # Report the raw seq_region name: chrom itself is always None here,
        # which made the previous message useless for diagnosis.
        raise Exception("Gene %s is on a nonstandard chromosome: %s" %
                        (gene_id, gene_json['seq_region_name']))

    gene['chr'] = chrom
    gene['start'] = gene_json['start']
    gene['stop'] = gene_json['end']
    gene['xstart'] = genomeloc.get_single_location(chrom, gene['start'])
    gene['xstop'] = genomeloc.get_single_location(chrom, gene['stop'])

    gene['gene_id'] = gene_json['ID']
    gene['symbol'] = gene_json['external_name']
    gene['description'] = gene_json['description']
    gene['biotype'] = gene_json['biotype']

    # transcripts
    transcript_json = [
        t for t in fetch_features(gene_id, 'transcript')
        if t['Parent'] == gene_id
    ]
    gene['transcripts'] = []
    for t in transcript_json:
        transcript_id = t['ID']
        transcript = dict(transcript_id=transcript_id,
                          biotype=t['biotype'],
                          start=t['start'],
                          stop=t['end'])
        transcript['xstart'] = genomeloc.get_single_location(
            chrom, transcript['start'])
        transcript['xstop'] = genomeloc.get_single_location(
            chrom, transcript['stop'])

        # exons_for_transcript: IDs only, ordered by start coordinate
        transcript_exon_json = fetch_features(transcript_id, 'exon')
        transcript['exons'] = [
            e['ID']
            for e in sorted(transcript_exon_json, key=lambda x: x['start'])
            if e['Parent'] == transcript_id
        ]
        gene['transcripts'].append(transcript)

    # exons
    exon_json = fetch_features(gene_id, 'exon')
    transcript_ids = {t['transcript_id'] for t in gene['transcripts']}
    exon_ids_seen = set()
    gene['exons'] = []
    for e in exon_json:
        exon_id = e['ID']
        # skip exons that aren't actually in one of this gene's transcripts
        if e['Parent'] not in transcript_ids:
            continue
        # the same exon is listed once per parent transcript; keep the first
        if exon_id in exon_ids_seen:
            continue
        exon = {
            'exon_id': exon_id,
            'start': e['start'],
            'stop': e['end'],
        }
        exon['xstart'] = genomeloc.get_single_location(chrom, exon['start'])
        exon['xstop'] = genomeloc.get_single_location(chrom, exon['stop'])
        gene['exons'].append(exon)
        exon_ids_seen.add(exon_id)

    # cds
    cds_json = fetch_features(gene_id, 'cds')
    cds_map = {}  # map from (start, stop) -> {start, stop, transcripts}
    for c in cds_json:
        # skip cds segments that aren't actually in one of this gene's transcripts
        if c['Parent'] not in transcript_ids:
            continue
        cds_t = (c['start'], c['end'])
        if cds_t not in cds_map:
            cds_map[cds_t] = {
                'start': c['start'],
                'stop': c['end'],
                'xstart': genomeloc.get_single_location(chrom, c['start']),
                'xstop': genomeloc.get_single_location(chrom, c['end']),
                'transcripts': [],
            }
        cds_map[cds_t]['transcripts'].append(c['Parent'])
    gene['cds'] = sorted(cds_map.values(),
                         key=lambda x: (x['start'], x['stop']))
    # cds_id is the 1-based rank of the segment in coordinate order
    for i, cds in enumerate(gene['cds']):
        cds['cds_id'] = '%s-%i' % (gene['gene_id'], i + 1)

    return gene