Exemple #1
0
    def __init__(self,
                 seq_repo_path=None,
                 regions_preload=None,
                 preload_pos_margin=500,
                 assembly_name=None):
        '''
        Set up the sequence fetcher, the assembly name/accession map and the
        optional preloaded regions.

        :param seq_repo_path: Path to local seqrepo directory. If None, read HGVS_SEQREPO_DIR environment variable
        :param regions_preload: Iterable[ChrInterval], optionally preload these genomic regions
        :param preload_pos_margin: adding margin at the end of a preloaded genome
          in order to have data to verify structural variants across the end of a gene
        :param assembly_name: assembly name handed to assemblies.make_name_ac_map;
          when empty or None, self.DEFAULT_ASSY_NAME is used instead
        '''

        if not seq_repo_path:
            seq_repo_path = os.environ.get("HGVS_SEQREPO_DIR")

        if seq_repo_path:
            seq_repo = SeqRepo(seq_repo_path)
            self.seq_repo_fetcher = seq_repo.fetch
        else:
            # No local seqrepo configured: fall back to seqfetcher.fetch_seq.
            # logging.warning replaces the deprecated logging.warn alias.
            logging.warning("Using remote sequence provider.")
            self.seq_repo_fetcher = seqfetcher.fetch_seq

        self.assembly_name = assembly_name or self.DEFAULT_ASSY_NAME
        self.assy_map = assemblies.make_name_ac_map(self.assembly_name)

        self.preloaded_regions = {}
        if regions_preload:
            # Each region is fetched with preload_pos_margin added to its end
            # coordinate (see the parameter description above).
            self.preloaded_regions = build_interval_trees_by_chr(
                regions_preload,
                lambda c, s, e: self._fetch_seq(c, s, e + preload_pos_margin))
    def convert_first_variants_of_son_into_HGVS(self):
        '''
        Convert the first 100 variants identified in the son into the corresponding transcript HGVS.
        Each variant should be mapped to all corresponding transcripts. Pointer:
        - https://hgvs.readthedocs.io/en/master/examples/manuscript-example.html#project-genomic-variant-to-a-new-transcript

        Every successful mapping is printed and written as one tab-separated
        line ("<genomic variant>\\t<transcript variant>") to the file "hgvs_file"
        in the current working directory. A summary with the number of
        successful conversions and of caught exceptions is printed at the end.

        :return: None
        '''

        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print("\n---------------\nConverting first 100 variants of son to HGVS..")

        max_variants = 100
        succ_proc_variants = 0
        exceptions = 0

        # Connect to UTA
        uta = hgvs.dataproviders.uta.connect()
        assembly_mapper = hgvs.assemblymapper.AssemblyMapper(uta, normalize=False)

        # Parsing
        hgvs_parser = hgvs.parser.Parser()

        # The name -> accession map does not depend on the record, so build it once.
        name_to_accession = make_name_ac_map("GRCh37.p13")

        # Context managers make sure both files are closed even on errors.
        with open(self.vcf_son, 'r') as vcf_handle, open("hgvs_file", "w") as output_file:
            for index, read in enumerate(vcf.Reader(vcf_handle)):
                if index >= max_variants:
                    break

                # CHROM[3:] drops the first three characters (presumably a
                # "chr" prefix); an unknown name raises KeyError, as before.
                refseq_nc_number = name_to_accession[read.CHROM[3:]]
                genome_hgvs = "{}:g.{}{}>{}".format(
                    refseq_nc_number, str(read.POS), str(read.REF), str(read.ALT[0]))

                try:
                    hgvs_variant = hgvs_parser.parse_hgvs_variant(genome_hgvs)
                    for transcript in assembly_mapper.relevant_transcripts(hgvs_variant):
                        try:
                            mapped = assembly_mapper.g_to_c(hgvs_variant, transcript)
                        except hgvs.exceptions.HGVSUsageError:
                            # g_to_c rejected the transcript; retry with g_to_n.
                            mapped = assembly_mapper.g_to_n(hgvs_variant, transcript)
                        except Exception:
                            exceptions += 1
                            continue

                        succ_proc_variants += 1
                        line = "{}\t{}".format(hgvs_variant, mapped)
                        print(line)
                        # Terminate each record so the output has one mapping per line.
                        output_file.write(line + "\n")

                except Exception:
                    exceptions += 1

        print("Successful conversions: %s" % (succ_proc_variants))
        print("Exceptions occurred: %s" % (exceptions))
Exemple #3
0
def generate_chrome_dic(annotation) -> dict:
    """Return the chromosome-name to accession map for *annotation*, with aliases.

    Starts from ``make_name_ac_map(annotation)`` and adds a ``chr``-prefixed
    alias for chromosomes 1-22, X and Y, e.g. for GRCh37 both ``chr1`` and
    ``1`` map to NC_000001.10, for GRCh38 to NC_000001.11. ``MT`` defaults to
    ``NC_012920.1`` when the base map has no such entry, and is also exposed
    as ``chrMT`` and ``chrM_NC_012920.1``.
    """
    accession_by_name = make_name_ac_map(annotation)

    autosomes = [str(number) for number in range(1, 23)]
    for name in autosomes + ['X', 'Y']:
        accession_by_name['chr' + name] = accession_by_name[name]

    # Only fill in the mitochondrial accession when the base map lacks it.
    accession_by_name.setdefault('MT', 'NC_012920.1')
    mito_accession = accession_by_name['MT']
    accession_by_name['chrMT'] = mito_accession
    accession_by_name['chrM_NC_012920.1'] = mito_accession

    return accession_by_name
    def convert_first_variants_of_son_into_HGVS(self):
        '''
        Map the first 100 variants of the son's VCF (self.sonFile) to transcript HGVS.

        For every transcript returned by relevant_transcripts() one line
        "<genomic variant>\\t<transcript variant>" is written to
        "first_100_variants_son.out"; HGVSInvalidIntervalError is recorded as
        a "mapping error at ..." line instead. Any other exception raised
        while handling a record is printed and the record is skipped.

        :return: None
        '''

        print("converting first 100 varaints of son to HGFS format")

        ## Connect to UTA
        hdp = hgvs.dataproviders.uta.connect()
        logging.basicConfig()
        assembly_mapper = hgvs.assemblymapper.AssemblyMapper(
            hdp, normalize=False)  # EasyVariantMapper before
        ## Used for parsing
        hgvsparser = hgvs.parser.Parser()  # Parser

        # The name -> accession map is the same for every record; build it once.
        name_to_accession = make_name_ac_map("GRCh37.p13")
        limit = 100

        def mapping(genome_hgvs, out):
            # Write one line per relevant transcript of the parsed variant to `out`.
            g = hgvsparser.parse_hgvs_variant(genome_hgvs)
            for tr in assembly_mapper.relevant_transcripts(g):
                try:
                    c = assembly_mapper.g_to_c(g, tr)  # coding
                    out.write("%s\t%s\n" % (g, c))
                except hgvs.exceptions.HGVSUsageError:
                    n = assembly_mapper.g_to_n(g, tr)  # non coding
                    out.write("%s\t%s\n" % (g, n))
                except hgvs.exceptions.HGVSInvalidIntervalError:
                    out.write("mapping error at %s\t%s\n" % (g, tr))

        # NOTE(review): the input is opened in binary mode as in the original;
        # confirm this matches how vcf.Reader is expected to consume it.
        # The context managers close both files even if an error escapes.
        with open(self.sonFile, 'rb') as vcf_handle, \
                open("first_100_variants_son.out", "w") as outfile:
            for count, record in enumerate(vcf.Reader(vcf_handle)):
                if count >= limit:
                    break
                # CHROM[3:] drops the first three characters (presumably a
                # "chr" prefix); an unknown name raises KeyError, as before.
                refseq_nc_number = name_to_accession[record.CHROM[3:]]
                try:
                    genome_hgvs = "%s:g.%s%s>%s" % (
                        refseq_nc_number, str(record.POS), str(
                            record.REF), str(record.ALT[0]))
                    mapping(genome_hgvs, outfile)
                except Exception as e:
                    print("caught exception", e)

        print(
            "Wrote first 100 variants of son file into file '"
            ' first_100_variants_son.out'
            "' ")
    def convert_first_variants_of_son_into_HGVS(self):
        '''
        Map the first 100 variants of the son's VCF (self.son) to transcript HGVS.

        Each successful mapping is printed as "<genomic variant>\\t<transcript
        variant>"; a summary with the number of successful conversions and of
        caught exceptions is printed at the end.

        :return: None
        '''

        max_variants = 100
        z_ok = 0  # counter for successful conversions
        z_exceptions = 0  # counter for exceptions

        ## Connect to UTA
        hdp = hgvs.dataproviders.uta.connect()

        assembly_mapper = hgvs.assemblymapper.AssemblyMapper(
            hdp, normalize=False
        )  # EasyVariantMapper before; normalize=False to get rid of a warning
        ## Used for parsing
        hgvsparser = hgvs.parser.Parser()  # Parser

        # The name -> accession map is identical for every record; build it once.
        name_to_accession = make_name_ac_map("GRCh37.p13")

        # The context manager closes the VCF file even if an error escapes.
        with open(self.son, 'r') as vcf_handle:
            for z, r in enumerate(vcf.Reader(vcf_handle)):
                if z >= max_variants:
                    break

                # CHROM[3:] drops the first three characters (presumably a
                # "chr" prefix); an unknown name raises KeyError, as before.
                refseq_nc_number = name_to_accession[r.CHROM[3:]]
                genome_hgvs = "%s:g.%s%s>%s" % (refseq_nc_number, str(
                    r.POS), str(r.REF), str(r.ALT[0]))

                try:
                    g = hgvsparser.parse_hgvs_variant(genome_hgvs)
                    for t in assembly_mapper.relevant_transcripts(g):
                        try:
                            # c: coding DNA reference sequence, g: genomic reference sequence
                            mapped = assembly_mapper.g_to_c(g, t)
                        except hgvs.exceptions.HGVSUsageError:
                            # n: non-coding RNA reference sequence (gene producing
                            # an RNA transcript but not a protein)
                            mapped = assembly_mapper.g_to_n(g, t)
                        except Exception:
                            z_exceptions += 1
                            continue
                        z_ok += 1
                        print("%s\t%s" % (g, mapped))
                except Exception:
                    z_exceptions += 1

        # Print the summary
        print("Successful conversions: %s" % (z_ok))
        print("Exceptions occurred: %s" % (z_exceptions))
Exemple #6
0
    def convert_first_variants_of_son_into_HGVS(self):
        '''
        Map the first 100 variants of the son's VCF (self.filename_son) to transcript HGVS.

        Pointer:
        https://hgvs.readthedocs.io/en/master/examples/manuscript-example.html#project-genomic-variant-to-a-new-transcript

        Results are appended to "100VSon.hgvs", one two-line entry per mapped
        transcript. The VCF reader is stored on self.file_son.

        :return: tuple of two strings, the first reporting the number of
          successful mappings and the second the number of exceptions
        '''
        # The reader is kept on self as before, so its file handle is not closed here.
        self.file_son = vcf.Reader(open(self.filename_son, 'r'))
        hp = hgvs.dataproviders.uta.connect(
        )  #connect to uta and get transcripts
        assembly_mapper = hgvs.assemblymapper.AssemblyMapper(
            hp, normalize=False)  #set the EasyVariantMapper
        hgvsparser = hgvs.parser.Parser()  #for parsing hgvs files

        # The name -> accession map is identical for every record; build it once.
        name_to_accession = make_name_ac_map("GRCh37.p13")
        max_variants = 100  #set the max to be converted to 100
        succ = 0
        exc = 0

        # Open the output once (append mode, as before) instead of once per
        # record, and let the context manager close it.
        with open('100VSon.hgvs', 'a') as out_file:
            for nr, record in enumerate(self.file_son):
                if nr >= max_variants:
                    break
                # CHROM[3:] drops the first three characters (presumably a
                # "chr" prefix); an unknown name raises KeyError, as before.
                refseq = name_to_accession[record.CHROM[3:]]
                #nc_number :g. position reference > alternative
                genome_hgvs = "%s:g.%s%s>%s" % (refseq, str(
                    record.POS), str(record.REF), str(record.ALT[0]))
                try:
                    genome = hgvsparser.parse_hgvs_variant(genome_hgvs)
                    for transcript in assembly_mapper.relevant_transcripts(
                            genome):
                        try:
                            mapped = assembly_mapper.g_to_c(genome, transcript)
                            kind = "coding"
                        except hgvs.exceptions.HGVSUsageError:
                            # g_to_c rejected the transcript; retry with g_to_n
                            mapped = assembly_mapper.g_to_n(genome, transcript)
                            kind = "noncoding"
                        except Exception:
                            # neither mapping attempt was possible: count it
                            exc += 1
                            continue
                        succ += 1
                        out_file.write(
                            "Number of variant: %s\n%s corresponds to the %s sequence %s\n"
                            % (nr + 1, genome, kind, mapped))
                except Exception:
                    exc += 1

        return "Number of successfull mappings: {}\n".format(
            succ), "Number of exceptions: {}".format(exc)
    def convert_first_variants_of_son_into_HGVS(self):
        '''
        Unfinished conversion stub.

        Connects to UTA, opens "son_100.hgvs" in append mode, prints CHROM,
        POS and QUAL of the first three entries of self.son_vcf and looks up
        the accession for the first two of them. Nothing is written to the
        output file yet.

        :return: None
        '''
        hdp = hgvs.dataproviders.uta.connect()
        # NOTE(review): vm and hgvsparser are created but not used yet.
        vm = hgvs.variantmapper.VariantMapper(hdp)
        # Used for parsing
        hgvsparser = hgvs.parser.Parser()  # Parser

        count = 0
        # The context manager closes the (still unused) output file; the
        # original left the handle open.
        with open('son_100.hgvs', 'a'):
            for entry in self.son_vcf:
                count += 1
                print(str(entry.CHROM) + ' ' + str(entry.POS) + ' ' + str(entry.QUAL))
                if count == 3:
                    break
                # NOTE(review): CHROM is used as the key unchanged here; an
                # unknown name raises KeyError. The result is not used yet.
                NC_no = make_name_ac_map("GRCh37.p13")[entry.CHROM]

        print("Starting conversion. Please wait.")
        return
Exemple #8
0
    def convert_first_variants_of_son_into_HGVS(self):
        '''
        Convert the first 100 variants identified in the son into the corresponding transcript HGVS.
        Each variant should be mapped to all corresponding transcripts. Pointer:
        - https://hgvs.readthedocs.io/en/master/examples/manuscript-example.html#project-genomic-variant-to-a-new-transcript

        Results are appended to "100variants.hgvs", one two-line entry per
        mapped transcript; progress and a final summary are printed. The VCF
        reader is stored on self.file_son.

        :return: None
        '''
        ## Connect to UTA
        hdp = hgvs.dataproviders.uta.connect()

        ## Used to get the transcripts
        # normalize=False is used to suppress the warning
        assembly_mapper = hgvs.assemblymapper.AssemblyMapper(
            hdp, normalize=False)  # EasyVariantMapper before

        ## Used for parsing
        hgvsparser = hgvs.parser.Parser()  # Parser

        ## Open the stream for the son. The reader is kept on self as before,
        ## so its file handle is not closed here.
        self.file_son = vcf.Reader(open(self.filename_son, 'r'))

        # The name -> accession map is identical for every record; build it once.
        name_to_accession = make_name_ac_map("GRCh37.p13")
        max_variants = 100
        milestones = (10, 25, 50, 75, 90)
        success = 0
        exception = 0

        print("Starting conversion. Please wait.")

        ## Open the result file once (mode 'a' so lines are appended rather
        ## than overwritten) instead of once per record, and let the context
        ## manager close it.
        with open('100variants.hgvs', 'a') as out_file:
            for index, record in enumerate(self.file_son):
                ## Stop as soon as the first 100 variants are done
                if index >= max_variants:
                    break
                number = index + 1

                ## Get chromosome mapping. CHROM[3:] drops the first three
                ## characters (presumably a "chr" prefix); an unknown name
                ## raises KeyError, as before.
                refseq_nc_number = name_to_accession[record.CHROM[3:]]
                ## Format: nc_number :g. position reference > alternative
                genome_hgvs = "%s:g.%s%s>%s" % (
                    refseq_nc_number, str(record.POS), str(
                        record.REF), str(record.ALT[0]))
                try:
                    genom = hgvsparser.parse_hgvs_variant(genome_hgvs)
                    for transcript in assembly_mapper.relevant_transcripts(
                            genom):
                        try:
                            ## Is it a coding sequence?
                            mapped = assembly_mapper.g_to_c(genom, transcript)
                            kind = "coding"
                        except hgvs.exceptions.HGVSUsageError:
                            ## Otherwise retry as a non-coding sequence
                            mapped = assembly_mapper.g_to_n(genom, transcript)
                            kind = "noncoding"
                        except Exception:
                            ## Anything else is counted as an exception
                            exception += 1
                            continue
                        success += 1
                        out_file.write(
                            "Number of variant: %s\n%s corresponds to the %s sequence %s\n"
                            % (number, genom, kind, mapped))
                except Exception:
                    exception += 1

                ## A small progress indicator
                if number in milestones:
                    print("Conversion is at %d%%" % number)

        print("Number of successfull mappings: {}\n"
              "Number of exceptions: {}".format(success, exception))