def __init__(self, seq_repo_path=None, regions_preload=None, preload_pos_margin=500, assembly_name=None):
    '''
    Set up the sequence fetcher, the assembly name map and optional preloaded regions.

    :param seq_repo_path: Path to local seqrepo directory. If None (or empty),
        the HGVS_SEQREPO_DIR environment variable is read instead. When neither
        yields a path, the remote fetcher ``seqfetcher.fetch_seq`` is used and
        a warning is logged.
    :param regions_preload: Iterable[ChrInterval], optionally preload these
        genomic regions
    :param preload_pos_margin: margin added to the end position of every
        preloaded region, in order to have data to verify structural variants
        across the end of a gene
    :param assembly_name: assembly name handed to
        ``assemblies.make_name_ac_map``; falls back to
        ``self.DEFAULT_ASSY_NAME`` when None or empty
    '''
    if not seq_repo_path:
        seq_repo_path = os.environ.get("HGVS_SEQREPO_DIR")

    if seq_repo_path:
        seq_repo = SeqRepo(seq_repo_path)
        self.seq_repo_fetcher = seq_repo.fetch
    else:
        # logging.warn is a deprecated alias (removed in Python 3.13);
        # logging.warning is the supported spelling.
        logging.warning("Using remote sequence provider.")
        self.seq_repo_fetcher = seqfetcher.fetch_seq

    self.assembly_name = assembly_name
    if not self.assembly_name:
        self.assembly_name = self.DEFAULT_ASSY_NAME
    self.assy_map = assemblies.make_name_ac_map(self.assembly_name)

    self.preloaded_regions = {}
    if regions_preload:
        # The callback widens each requested interval by preload_pos_margin
        # at its end before fetching the sequence.
        self.preloaded_regions = build_interval_trees_by_chr(
            regions_preload,
            lambda c, s, e: self._fetch_seq(c, s, e + preload_pos_margin))
def convert_first_variants_of_son_into_HGVS(self):
    '''
    Convert the first 100 variants identified in the son into the corresponding transcript HGVS.
    Each variant should be mapped to all corresponding transcripts.

    Every (genomic variant, transcript variant) pair is printed and written as
    one tab-separated line to the file "hgvs_file" in the working directory.
    At the end the number of successful conversions and the number of
    failures are printed.

    Pointer:
    - https://hgvs.readthedocs.io/en/master/examples/manuscript-example.html#project-genomic-variant-to-a-new-transcript

    :return: None
    '''
    # Single-argument print(...) behaves the same as a print statement on
    # Python 2 and is valid syntax on Python 3.
    print("\n---------------\nConverting first 100 variants of son to HGVS..")
    max_variants = 100
    succ_proc_variants = 0
    exceptions = 0

    # UTA connection
    uta = hgvs.dataproviders.uta.connect()
    assembly_mapper = hgvs.assemblymapper.AssemblyMapper(uta, normalize=False)
    # Parsing
    hgvs_parser = hgvs.parser.Parser()
    # The name -> accession map does not depend on the record, so it is
    # built once instead of once per variant.
    chrom_to_accession = make_name_ac_map("GRCh37.p13")

    # Both files are closed even if an unexpected error aborts the loop.
    with open(self.vcf_son, 'r') as vcf_handle, open("hgvs_file", "w") as output_file:
        for index, read in enumerate(vcf.Reader(vcf_handle)):
            if index >= max_variants:
                break
            # CHROM[3:] drops the first three characters
            # (assumes names carry a "chr" prefix -- TODO confirm).
            refseq_nc_number = chrom_to_accession[read.CHROM[3:]]
            genome_hgvs = "{}:g.{}{}>{}".format(
                refseq_nc_number, str(read.POS), str(read.REF), str(read.ALT[0]))
            try:
                hgvs_variant = hgvs_parser.parse_hgvs_variant(genome_hgvs)
                for transcript in assembly_mapper.relevant_transcripts(hgvs_variant):
                    try:
                        mapped = assembly_mapper.g_to_c(hgvs_variant, transcript)
                    except hgvs.exceptions.HGVSUsageError:
                        # g_to_c refused the transcript; try the g_to_n
                        # projection instead. A failure here is counted by
                        # the outer handler.
                        mapped = assembly_mapper.g_to_n(hgvs_variant, transcript)
                    except Exception:
                        # Was a bare "except:", which also swallowed
                        # KeyboardInterrupt and SystemExit.
                        exceptions += 1
                        continue
                    succ_proc_variants += 1
                    line = "{}\t{}".format(hgvs_variant, mapped)
                    print(line)
                    # Terminate each row; previously all rows were written
                    # back-to-back on a single line.
                    output_file.write(line + "\n")
            except Exception:
                exceptions += 1

    print("Successful conversions: %s" % (succ_proc_variants))
    print("Exceptions occurred: %s" % (exceptions))
def generate_chrome_dic(annotation) -> dict:
    """Build a chromosome-name to accession dictionary for an assembly.

    Chromosome correspondence dictionary, e.g.
    GRCh37: chr1 or 1 -> NC_000001.10, GRCh38: chr1 or 1 -> NC_000001.11.

    The map returned by ``make_name_ac_map(annotation)`` is extended in place
    with "chr"-prefixed aliases for chromosomes 1-22, X and Y, and with
    mitochondrial aliases ("chrMT", "chrM_NC_012920.1"). If the map has no
    "MT" entry, "NC_012920.1" is used for it.

    :param annotation: assembly name passed to ``make_name_ac_map``
    :return: the extended name -> accession dictionary
    """
    name_to_accession = make_name_ac_map(annotation)

    autosomes_and_sex = list(map(str, range(1, 23)))
    autosomes_and_sex.extend(('X', 'Y'))
    for label in autosomes_and_sex:
        name_to_accession['chr' + label] = name_to_accession[label]

    # Only fills in the mitochondrial accession when the assembly lacks one.
    name_to_accession.setdefault('MT', 'NC_012920.1')
    mito_accession = name_to_accession['MT']
    name_to_accession['chrMT'] = mito_accession
    name_to_accession['chrM_NC_012920.1'] = mito_accession
    return name_to_accession
def convert_first_variants_of_son_into_HGVS(self):
    """Map the first 100 variants of the son's VCF to transcript-level HGVS.

    Each record of ``self.sonFile`` is turned into a genomic HGVS string and
    projected onto every relevant transcript. One tab-separated line per
    (genomic variant, transcript variant) pair is written to
    "first_100_variants_son.out"; transcripts that raise
    ``HGVSInvalidIntervalError`` get a "mapping error" line instead. Any
    other error for a variant is printed and the variant is skipped.

    :return: None
    """
    print("converting first 100 varaints of son to HGFS format")

    ## Connect to UTA
    hdp = hgvs.dataproviders.uta.connect()
    logging.basicConfig()
    assembly_mapper = hgvs.assemblymapper.AssemblyMapper(
        hdp, normalize=False)  # EasyVariantMapper before
    ## Used for parsing
    hgvsparser = hgvs.parser.Parser()
    # Record-independent, so built once rather than once per variant.
    chrom_to_accession = make_name_ac_map("GRCh37.p13")
    limit = 100

    # Context managers close both files even when the loop fails; the
    # original never closed the VCF handle.
    with open(self.sonFile, 'rb') as vcf_handle, \
            open("first_100_variants_son.out", "w") as outfile:

        def mapping(genome_hgvs):
            """Write one output line per transcript relevant to genome_hgvs."""
            g = hgvsparser.parse_hgvs_variant(genome_hgvs)
            for tr in assembly_mapper.relevant_transcripts(g):
                try:
                    c = assembly_mapper.g_to_c(g, tr)  # coding
                    # write() is the right call for a single string;
                    # writelines() iterated it character by character.
                    outfile.write("%s\t%s\n" % (g, c))
                except hgvs.exceptions.HGVSUsageError:
                    n = assembly_mapper.g_to_n(g, tr)  # non coding
                    outfile.write("%s\t%s\n" % (g, n))
                except hgvs.exceptions.HGVSInvalidIntervalError:
                    outfile.write("mapping error at %s\t%s\n" % (g, tr))

        for count, record in enumerate(vcf.Reader(vcf_handle)):
            if count >= limit:
                break
            # CHROM[3:] drops the first three characters
            # (assumes names carry a "chr" prefix -- TODO confirm).
            refseq_nc_number = chrom_to_accession[record.CHROM[3:]]
            try:
                genome_hgvs = "%s:g.%s%s>%s" % (
                    refseq_nc_number, str(record.POS), str(record.REF),
                    str(record.ALT[0]))
                mapping(genome_hgvs)
            except Exception as e:
                print("caught exception", e)

    print("Wrote first 100 variants of son file into file ' first_100_variants_son.out' ")
def convert_first_variants_of_son_into_HGVS(self):
    """Map the first 100 variants of the son's VCF to transcript-level HGVS.

    Each record of ``self.son`` is turned into a genomic HGVS string and
    projected onto every relevant transcript with ``g_to_c``, falling back to
    ``g_to_n`` when ``HGVSUsageError`` is raised. Each successful pair is
    printed tab-separated, followed by a summary of successes and failures.

    :return: None
    """
    max_variants = 100  # only the first 100 records are converted
    z_ok = 0            # counter for successful conversions
    z_exceptions = 0    # counter for exceptions

    ## Connect to UTA
    hdp = hgvs.dataproviders.uta.connect()
    # EasyVariantMapper before; normalize=False to get rid of the warning
    assembly_mapper = hgvs.assemblymapper.AssemblyMapper(hdp, normalize=False)
    ## Used for parsing
    hgvsparser = hgvs.parser.Parser()
    # Record-independent, so built once rather than once per variant.
    chrom_to_accession = make_name_ac_map("GRCh37.p13")

    # The VCF handle was previously never closed.
    with open(self.son, 'r') as vcf_handle:
        for z, r in enumerate(vcf.Reader(vcf_handle)):
            if z >= max_variants:
                break
            # CHROM[3:] drops the first three characters
            # (assumes names carry a "chr" prefix -- TODO confirm).
            refseq_nc_number = chrom_to_accession[r.CHROM[3:]]
            genome_hgvs = "%s:g.%s%s>%s" % (
                refseq_nc_number, str(r.POS), str(r.REF), str(r.ALT[0]))
            try:
                g = hgvsparser.parse_hgvs_variant(genome_hgvs)
                for t in assembly_mapper.relevant_transcripts(g):
                    try:
                        # c: coding DNA reference sequence,
                        # g: genomic reference sequence
                        mapped = assembly_mapper.g_to_c(g, t)
                    except hgvs.exceptions.HGVSUsageError:
                        # n: non-coding RNA reference sequence (gene producing
                        # an RNA transcript but not a protein). A failure
                        # here is counted by the outer handler.
                        mapped = assembly_mapper.g_to_n(g, t)
                    except Exception:
                        # Was a bare "except:", which also swallowed
                        # KeyboardInterrupt and SystemExit.
                        z_exceptions += 1
                        continue
                    z_ok += 1
                    print("%s\t%s" % (g, mapped))
            except Exception:
                z_exceptions += 1

    # Print the summary
    print("Successful conversions: %s" % (z_ok))
    print("Exceptions occurred: %s" % (z_exceptions))
def convert_first_variants_of_son_into_HGVS(self):
    """Map the first 100 variants of the son's VCF to transcript-level HGVS.

    A VCF reader over ``self.filename_son`` is stored in ``self.file_son``.
    Each of the first 100 records is turned into a genomic HGVS string and
    projected onto every relevant transcript with ``g_to_c``, falling back to
    ``g_to_n`` when ``HGVSUsageError`` is raised. Results are appended to
    "100VSon.hgvs".

    See https://hgvs.readthedocs.io/en/master/examples/manuscript-example.html#project-genomic-variant-to-a-new-transcript

    :return: tuple of two strings, the number of successful mappings and the
        number of exceptions
    """
    # The reader stays on the instance, as before, so its file handle is
    # deliberately not closed here.
    self.file_son = vcf.Reader(open(self.filename_son, 'r'))
    hp = hgvs.dataproviders.uta.connect()  # connect to uta and get transcripts
    assembly_mapper = hgvs.assemblymapper.AssemblyMapper(
        hp, normalize=False)  # set the EasyVariantMapper
    hgvsparser = hgvs.parser.Parser()  # for parsing hgvs files
    # Record-independent, so built once rather than once per variant.
    chrom_to_accession = make_name_ac_map("GRCh37.p13")

    max_variants = 100
    succ = 0
    exc = 0

    # Open the output once, in append mode, and close it reliably. The
    # original re-opened it on every iteration and never closed any handle.
    with open('100VSon.hgvs', 'a') as out_file:
        for number, record in enumerate(self.file_son, start=1):
            if number > max_variants:
                break
            # nc_number :g. position reference > alternative
            # CHROM[3:] drops the first three characters
            # (assumes names carry a "chr" prefix -- TODO confirm).
            refseq = chrom_to_accession[record.CHROM[3:]]
            genome_hgvs = "%s:g.%s%s>%s" % (
                refseq, str(record.POS), str(record.REF), str(record.ALT[0]))
            try:
                genome = hgvsparser.parse_hgvs_variant(genome_hgvs)
                for transcript in assembly_mapper.relevant_transcripts(genome):
                    try:
                        # coding
                        coding = assembly_mapper.g_to_c(genome, transcript)
                        succ += 1
                        out_file.write(
                            "Number of variant: %s\n%s corresponds to the coding sequence %s\n"
                            % (number, genome, coding))
                    except hgvs.exceptions.HGVSUsageError:
                        # non coding
                        noncoding = assembly_mapper.g_to_n(genome, transcript)
                        succ += 1
                        out_file.write(
                            "Number of variant: %s\n%s corresponds to the noncoding sequence %s\n"
                            % (number, genome, noncoding))
                    except Exception:
                        # Neither mapping worked. Was a bare "except:", which
                        # also swallowed KeyboardInterrupt and SystemExit.
                        exc += 1
            except Exception:
                exc += 1

    return ("Number of successfull mappings: {}\n".format(succ),
            "Number of exceptions: {}".format(exc))
def convert_first_variants_of_son_into_HGVS(self):
    """Unfinished draft of the VCF-to-HGVS conversion.

    As written it connects to UTA, creates a variant mapper and a parser,
    opens 'son_100.hgvs' in append mode, prints CHROM, POS and QUAL for at
    most the first three records of ``self.son_vcf``, looks up an accession
    for a record's chromosome and prints a start message. Nothing is
    converted or written.

    NOTE(review): the source layout of this block was lost; the nesting
    below (lookup placed after the loop) is a reconstruction -- confirm
    against the original file.

    :return: None
    """
    hdp = hgvs.dataproviders.uta.connect()
    vm = hgvs.variantmapper.VariantMapper(hdp)  # not used further in this block
    count=0
    # Used for parsing
    hgvsparser = hgvs.parser.Parser() # Parser; not used further in this block
    # NOTE(review): this handle is never written to or closed, and the name
    # shadows the Python 2 builtin `file`.
    file=open('son_100.hgvs', 'a')
    for entry in self.son_vcf:
        count+=1
        print(str(entry.CHROM) + ' ' + str(entry.POS) + ' ' + str(entry.QUAL))
        if count == 3:
            # stop after three records
            break
    # Uses the last record seen by the loop; `entry` is unbound if
    # self.son_vcf yields nothing. The result is not used.
    NC_no = make_name_ac_map("GRCh37.p13")[entry.CHROM]
    #print(NC_no)
    print("Starting conversion. Please wait.")
    return
def convert_first_variants_of_son_into_HGVS(self):
    '''
    Convert the first 100 variants identified in the son into the corresponding transcript HGVS.
    Each variant should be mapped to all corresponding transcripts.

    A VCF reader over ``self.filename_son`` is stored in ``self.file_son``.
    Results are appended to the file "100variants.hgvs"; progress messages
    and a final summary are printed.

    Pointer:
    - https://hgvs.readthedocs.io/en/master/examples/manuscript-example.html#project-genomic-variant-to-a-new-transcript

    :return: None (the mapping of variants to transcripts is written to the
        output file)
    '''
    ## Connect to UTA
    hdp = hgvs.dataproviders.uta.connect()

    ## Used to get the transcripts
    # normalize=False is used to suppress the warning
    assembly_mapper = hgvs.assemblymapper.AssemblyMapper(
        hdp, normalize=False)  # EasyVariantMapper before

    ## Used for parsing
    hgvsparser = hgvs.parser.Parser()  # Parser

    ## Open the stream for the son. The reader stays on the instance, as
    ## before, so its file handle is deliberately not closed here.
    self.file_son = vcf.Reader(open(self.filename_son, 'r'))

    ## Get chromosome mapping (record-independent, so built only once)
    chrom_to_accession = make_name_ac_map("GRCh37.p13")

    max_variants = 100
    milestones = (10, 25, 50, 75, 90)
    success = 0
    exception = 0

    print("Starting conversion. Please wait.")

    ## Open the result file once. Mode 'a' appends, so existing lines are not
    ## overwritten. The original re-opened the file on every iteration and
    ## never closed any of the handles.
    with open('100variants.hgvs', 'a') as out_file:
        for anzahl, record in enumerate(self.file_son, start=1):
            ## Stop as soon as the first 100 variants are done
            if anzahl > max_variants:
                break

            # CHROM[3:] drops the first three characters
            # (assumes names carry a "chr" prefix -- TODO confirm).
            refseq_nc_number = chrom_to_accession[record.CHROM[3:]]
            ## Format: nc_number :g. position reference > alternative
            genome_hgvs = "%s:g.%s%s>%s" % (
                refseq_nc_number, str(record.POS), str(record.REF),
                str(record.ALT[0]))

            try:
                genom = hgvsparser.parse_hgvs_variant(genome_hgvs)
                for transcript in assembly_mapper.relevant_transcripts(genom):
                    try:
                        ## Is it a coding sequence?
                        coding = assembly_mapper.g_to_c(genom, transcript)
                        success += 1
                        out_file.write(
                            "Number of variant: %s\n%s corresponds to the coding sequence %s\n"
                            % (anzahl, genom, coding))
                    except hgvs.exceptions.HGVSUsageError:
                        ## Is it a non-coding sequence?
                        noncoding = assembly_mapper.g_to_n(genom, transcript)
                        success += 1
                        out_file.write(
                            "Number of variant: %s\n%s corresponds to the noncoding sequence %s\n"
                            % (anzahl, genom, noncoding))
                    except Exception:
                        ## Otherwise count it as an exception. Was a bare
                        ## "except:", which also swallowed KeyboardInterrupt
                        ## and SystemExit.
                        exception += 1
            except Exception:
                exception += 1

            ## A small aid showing how far the conversion has progressed
            if anzahl in milestones:
                print("Conversion is at {}%".format(anzahl))

    print("Number of successfull mappings: {}\n"
          "Number of exceptions: {}".format(success, exception))