def genotyping(read_dir,
               reference_type,
               region_list,
               num_editdist,
               nthreads,
               max_sample,
               assembly,
               out_dir,
               verbose):
    """Genotype the extracted read files found in ``read_dir``.

    For every database in ``region_list`` the database files and the HISAT2
    graph index are prepared through ``typing_common``.  Afterwards
    ``nthreads`` ``myThread`` workers are started over the shared list of
    ``*.extracted.1.fq.gz`` files and joined before returning.

    Args:
        read_dir: Directory searched for ``*.extracted.1.fq.gz`` files.
        reference_type: Forwarded unchanged to every ``myThread`` worker.
        region_list: Database names to extract and index; also forwarded
            to the workers.
        num_editdist: Forwarded unchanged to every worker.
        nthreads: Number of worker threads to start.
        max_sample: Forwarded unchanged to every worker.
        assembly: Forwarded unchanged to every worker.
        out_dir: Output directory; created when non-empty and missing.
        verbose: Forwarded to index building and to the workers.

    Exits the process with status 1 when ``read_dir`` does not exist.
    """
    for database_name in region_list:
        # Extract variants, backbone sequence, and other sequences
        typing_common.extract_database_if_not_exists(database_name,
                                                     [])            # locus_list
        # Build HISAT2's graph index
        typing_common.build_index_if_not_exists(database_name,
                                                "hisat2",
                                                "graph",
                                                1,            # threads
                                                verbose)

    if not os.path.exists(read_dir):
        # Use the print function: the Python 2 "print >> stream" statement
        # raises TypeError wherever print is a function.
        print("Error: %s does not exist." % read_dir, file=sys.stderr)
        sys.exit(1)

    if out_dir != "" and not os.path.exists(out_dir):
        # makedirs also creates missing parent directories.
        os.makedirs(out_dir)

    # fastq files
    fq_fnames = glob.glob("%s/*.extracted.1.fq.gz" % read_dir)

    # All workers receive the same lock and the same file list.
    # NOTE(review): presumably each worker takes files off the shared list
    # under the lock -- confirm against myThread.
    lock = threading.Lock()
    threads = []
    for _ in range(nthreads):
        thread = myThread(lock,
                          fq_fnames,
                          reference_type,
                          region_list,
                          num_editdist,
                          max_sample,
                          assembly,
                          out_dir,
                          verbose)
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()
def build_genotype_genome(base_fname, inter_gap, intra_gap, threads,
                          database_list, use_clinvar, use_commonvar, aligner,
                          graph_index, verbose):
    """Build the genotype genome files and the aligner index ``base_fname``.

    All files are read from and written to the current working directory.
    The function

    1. downloads the genome/index through ``typing_common`` and loads
       ``genome.fa``;
    2. optionally prepares and loads ClinVar and/or dbSNP common variants;
    3. extracts every database in ``database_list`` and collects its loci
       from ``<database>.locus``;
    4. writes ``<base_fname>.fa``, ``.locus``, ``.snp``, ``.index.snp``,
       ``.haplotype``, ``.link``, ``.coord``, ``.clnsig``, ``.allele`` and
       ``.partial``;
    5. runs ``samtools faidx`` and the aligner's ``*-build`` command and
       checks that the expected index files exist.

    With ``graph_index`` set, each non-overlapping locus on a chromosome is
    replaced by its backbone sequence and variant/haplotype coordinates are
    shifted accordingly.  Without it, only ``.locus`` is populated (using
    reference coordinates) and ``genome.fa`` is copied to ``<base_fname>.fa``.

    Args:
        base_fname: Prefix of every output file and of the index.
        inter_gap: Passed to the extraction scripts and to
            ``typing_common.extract_database_if_not_exists``.
        intra_gap: Passed alongside ``inter_gap``.
        threads: Thread count handed to the index build command.
        database_list: Names of the databases whose loci are included.
        use_clinvar: Load variants from ``clinvar.*`` (downloading and
            extracting them when the files are missing).
        use_commonvar: Load variants from ``snp144Common.*`` (downloading
            and extracting them when the files are missing).
        aligner: ``"hisat2"`` or ``"bowtie2"``; a graph index requires
            ``"hisat2"``.
        graph_index: Whether to build a graph index with variants.
        verbose: Print the external commands to stderr before running them.

    Exits the process with status 1 when variant extraction or indexing
    does not produce the expected files.
    """
    # Download HISAT2 index
    typing_common.download_genome_and_index()

    # Load genomic sequences
    chr_dic, chr_names, chr_full_names = typing_common.read_genome("genome.fa")

    genotype_vars = {}
    genotype_haplotypes = {}
    genotype_clnsig = {}
    if use_clinvar:
        # Extract variants from the ClinVar database
        CLINVAR_fnames = [
            "clinvar.vcf.gz", "clinvar.snp", "clinvar.haplotype",
            "clinvar.clnsig"
        ]

        if not typing_common.check_files(CLINVAR_fnames):
            if not os.path.exists("clinvar.vcf.gz"):
                os.system("wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/"\
                            "vcf_GRCh38/archive/2017/clinvar_20170404.vcf.gz")
                # wget stores the archive under its remote file name, while
                # everything below expects "clinvar.vcf.gz".
                if os.path.exists("clinvar_20170404.vcf.gz"):
                    os.rename("clinvar_20170404.vcf.gz", "clinvar.vcf.gz")
            assert os.path.exists("clinvar.vcf.gz")

            extract_cmd = ["hisat2_extract_snps_haplotypes_VCF.py"]
            extract_cmd += [
                "--inter-gap",
                str(inter_gap), "--intra-gap",
                str(intra_gap), "--genotype-vcf", "clinvar.vcf.gz",
                "genome.fa", "/dev/null", "clinvar"
            ]
            if verbose:
                print("\tRunning:", ' '.join(extract_cmd), file=sys.stderr)
            # Discard the script's output; close the handle afterwards.
            with open("/dev/null", 'w') as devnull:
                proc = subprocess.Popen(extract_cmd,
                                        stdout=devnull,
                                        stderr=devnull)
                proc.communicate()
            if not typing_common.check_files(CLINVAR_fnames):
                print("Error: extract variants from clinvar failed!",
                      file=sys.stderr)
                sys.exit(1)

        # Read variants to be genotyped
        genotype_vars = typing_common.read_variants("clinvar.snp")

        # Read haplotypes
        genotype_haplotypes = typing_common.read_haplotypes(
            "clinvar.haplotype")

        # Read information about clinical significance
        genotype_clnsig = read_clnsig("clinvar.clnsig")

    if use_commonvar:
        # Extract variants from dbSNP database
        # TODO: CB Write script to make local uptodate SNP database from dbSNP
        # ftp://ftp.ncbi.nlm.nih.gov/snp/database/README.create_local_dbSNP.txt
        commonvar_fbase = "snp144Common"
        commonvar_fnames = [
            "%s.snp" % commonvar_fbase,
            "%s.haplotype" % commonvar_fbase
        ]
        if not typing_common.check_files(commonvar_fnames):
            if not os.path.exists("%s.txt.gz" % commonvar_fbase):
                os.system("wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/"\
                               "database/%s.txt.gz" % commonvar_fbase)
            assert os.path.exists("%s.txt.gz" % commonvar_fbase)
            # Strip the "chr" prefix from column 2 and rename M to MT.
            os.system("gzip -cd %s.txt.gz "\
                         "| awk 'BEGIN{OFS=\"\t\"} "\
                             "{if($2 ~ /^chr/) {$2 = substr($2, 4)}; "\
                              "if($2 == \"M\") {$2 = \"MT\"} print}' > %s.txt" \
                                  % (commonvar_fbase, commonvar_fbase))
            extract_cmd = [
                "hisat2_extract_snps_haplotypes_UCSC.py", "--inter-gap",
                str(inter_gap), "--intra-gap",
                str(intra_gap), "genome.fa",
                "%s.txt" % commonvar_fbase, commonvar_fbase
            ]
            if verbose:
                print("\tRunning:", ' '.join(extract_cmd), file=sys.stderr)
            with open("/dev/null", 'w') as devnull:
                proc = subprocess.Popen(extract_cmd,
                                        stdout=devnull,
                                        stderr=devnull)
                proc.communicate()
            if not typing_common.check_files(commonvar_fnames):
                # Name the source that actually failed (this branch handles
                # the dbSNP common variants, not ClinVar).
                print("Error: extract variants from %s failed!" %
                      commonvar_fbase,
                      file=sys.stderr)
                sys.exit(1)

        # NOTE(review): when use_clinvar is also set, the two assignments
        # below replace the ClinVar variants and haplotypes loaded above
        # (genotype_clnsig is kept) -- confirm this is intended.

        # Read variants to be genotyped
        genotype_vars = typing_common.read_variants(commonvar_fnames[0])

        # Read haplotypes
        genotype_haplotypes = typing_common.read_haplotypes(
            commonvar_fnames[1])

    # Genes to be genotyped: chromosome name -> list of
    # [left, right, length, locus_name, database_name, exon_str, strand]
    genotype_genes = {}

    # Read genes or genomics regions
    for database_name in database_list:
        # Extract HLA variants, backbone sequence, and other sequences
        typing_common.extract_database_if_not_exists(
            database_name,
            [],  # locus_list
            inter_gap,
            intra_gap,
            True,  # partial?
            verbose)
        locus_fname = "%s.locus" % database_name
        assert os.path.exists(locus_fname)
        with open(locus_fname) as locus_file:
            for line in locus_file:
                locus_name, \
                  chr_name, \
                  left, \
                  right, \
                  length, \
                  exon_str, \
                  strand \
                       = line.strip().split()
                left = int(left)
                right = int(right)
                length = int(length)
                # Ignore loci on chromosomes that genome.fa does not contain.
                if chr_name not in chr_names:
                    continue
                if chr_name not in genotype_genes:
                    genotype_genes[chr_name] = []
                genotype_genes[chr_name].append([
                    left, right, length, locus_name, database_name, exon_str,
                    strand
                ])

    # Write genotype genome
    var_num = 0
    haplotype_num = 0
    genome_out_file = open("%s.fa" % base_fname, 'w')
    locus_out_file = open("%s.locus" % base_fname, 'w')
    var_out_file = open("%s.snp" % base_fname, 'w')
    index_var_out_file = open("%s.index.snp" % base_fname, 'w')
    haplotype_out_file = open("%s.haplotype" % base_fname, 'w')
    link_out_file = open("%s.link" % base_fname, 'w')
    coord_out_file = open("%s.coord" % base_fname, 'w')
    clnsig_out_file = open("%s.clnsig" % base_fname, 'w')
    for chr_name, chr_full_name in zip(chr_names, chr_full_names):
        assert chr_name in chr_dic
        chr_seq = chr_dic[chr_name]
        chr_len = len(chr_seq)
        if chr_name in genotype_genes:
            # Process genes from left to right: the overlap check and the
            # sequence splicing below rely on increasing `left`.  Records
            # are [left, right, length, ...], so sort on indices 0, 1, 2.
            chr_genes = sorted(genotype_genes[chr_name],
                               key=lambda x: (x[0], x[1], x[2]))
        else:
            chr_genes = []

        # Genome-wide variants/haplotypes (ClinVar or dbSNP) are only merged
        # into a graph index; the two cursors walk through them in order.
        chr_genotype_vars = []
        chr_genotype_vari = 0
        chr_genotype_haplotypes = []
        chr_genotype_hti = 0
        if graph_index:
            if chr_name in genotype_vars:
                chr_genotype_vars = genotype_vars[chr_name]
            if chr_name in genotype_haplotypes:
                chr_genotype_haplotypes = genotype_haplotypes[chr_name]

        def add_vars(left, right, chr_genotype_vari, chr_genotype_hti,
                     haplotype_num):
            """Write genome-wide variants/haplotypes located before ``left``.

            Entries ending inside [left, right] are skipped without being
            written; the scan stops at the first entry ending after
            ``right``.  Coordinates are shifted by the enclosing ``off``
            as it stands when this function is called.  Returns the
            advanced cursors and haplotype counter.
            """
            # Output variants with clinical significance
            while chr_genotype_vari < len(chr_genotype_vars):
                var_left, \
                  var_type, \
                  var_data, \
                  var_id \
                    = chr_genotype_vars[chr_genotype_vari]
                var_right = var_left
                if var_type == "deletion":
                    var_right += var_data
                if var_right > right:
                    break
                if var_right >= left:
                    chr_genotype_vari += 1
                    continue

                out_str = "%s\t%s\t%s\t%d\t%s" % (var_id, var_type, chr_name,
                                                  var_left + off, var_data)
                print(out_str, file=var_out_file)
                print(out_str, file=index_var_out_file)

                if var_id in genotype_clnsig:
                    var_gene, clnsig = genotype_clnsig[var_id]
                    print("%s\t%s\t%s" \
                             % (var_id, var_gene, clnsig), file=clnsig_out_file)

                chr_genotype_vari += 1

            # Output haplotypes
            while chr_genotype_hti < len(chr_genotype_haplotypes):
                ht_left, ht_right, ht_vars = chr_genotype_haplotypes[
                    chr_genotype_hti]
                if ht_right > right:
                    break
                if ht_right >= left:
                    chr_genotype_hti += 1
                    continue

                print("ht%d\t%s\t%d\t%d\t%s" \
                        % (haplotype_num,
                           chr_name,
                           ht_left + off,
                           ht_right + off,
                           ','.join(ht_vars)),
                      file=haplotype_out_file)
                chr_genotype_hti += 1
                haplotype_num += 1

            return chr_genotype_vari, chr_genotype_hti, haplotype_num

        out_chr_seq = ""
        # Length difference accumulated so far between the output sequence
        # and the reference (backbones may be longer than the span they
        # replace).
        off = 0
        # Reference position right after the last gene that was spliced in.
        prev_right = 0
        for gene in chr_genes:
            left, right, length, name, family, exon_str, strand = gene

            if not graph_index:
                # Output gene (genotype_genome.gene)
                print("%s\t%s\t%s\t%d\t%d\t%s\t%s" \
                        % (family.upper(),
                           name,
                           chr_name,
                           left,
                           right,
                           exon_str,
                           strand),
                      file=locus_out_file)
                continue

            chr_genotype_vari, \
              chr_genotype_hti, \
              haplotype_num \
                = add_vars(left,
                           right,
                           chr_genotype_vari,
                           chr_genotype_hti,
                           haplotype_num)

            # Read gene family sequences and information
            allele_seqs = typing_common.read_allele_seq("%s_backbone.fa" %
                                                        family)
            allele_vars = typing_common.read_variants("%s.snp" % family)
            allele_index_vars = typing_common.read_variants("%s.index.snp" %
                                                            family)
            allele_haplotypes = typing_common.read_haplotypes("%s.haplotype" %
                                                              family)
            links = typing_common.read_links("%s.link" % family, True)

            if name not in allele_seqs:
                continue
            if name not in allele_vars or name not in allele_index_vars:
                gene_vars = []
                index_vars = []
            else:
                gene_vars = allele_vars[name]
                index_vars = allele_index_vars[name]

            allele_seq = allele_seqs[name]
            index_var_ids = set()
            for _, _, _, var_id in index_vars:
                index_var_ids.add(var_id)

            if name not in allele_haplotypes:
                haplotypes = []
            else:
                haplotypes = allele_haplotypes[name]
            assert length == len(allele_seq)
            assert left < chr_len and right < chr_len
            # Skipping overlapping genes
            if left < prev_right:
                print("Warning: skipping %s ..." % (name), file=sys.stderr)
                continue

            # Database variant id -> id used in the genotype genome
            varID2htID = {}
            assert left < right
            prev_length = right - left + 1
            assert prev_length <= length

            # Copy the reference sequence between the previous gene and
            # this one.
            if prev_right < left:
                out_chr_seq += chr_seq[prev_right:left]

            # Output gene (genotype_genome.locus)
            print("%s\t%s\t%s\t%d\t%d\t%s\t%s" \
                    % (family.upper(),
                       name,
                       chr_name,
                       len(out_chr_seq),
                       len(out_chr_seq) + length - 1,
                       exon_str,
                       strand),
                  file=locus_out_file)

            # Output coord (genotype_genome.coord)
            print("%s\t%d\t%d\t%d" \
                    % (chr_name,
                       len(out_chr_seq),
                       left,
                       right - left + 1),
                  file=coord_out_file)
            out_chr_seq += allele_seq

            # Output variants (genotype_genome.snp and genotype_genome.index.snp)
            for var in gene_vars:
                var_left, var_type, var_data, var_id = var
                new_var_id = "hv%d" % var_num
                varID2htID[var_id] = new_var_id
                # `off` still holds the shift of everything before this gene.
                new_var_left = var_left + left + off
                assert var_type in ["single", "deletion", "insertion"]
                assert new_var_left < len(out_chr_seq)
                if var_type == "single":
                    assert out_chr_seq[new_var_left] != var_data
                elif var_type == "deletion":
                    assert new_var_left + var_data <= len(out_chr_seq)
                else:
                    assert var_type == "insertion"

                out_str = "%s\t%s\t%s\t%d\t%s" \
                            % (new_var_id, var_type, chr_name, new_var_left,
                               var_data)
                print(out_str, file=var_out_file)
                if var_id in index_var_ids:
                    print(out_str, file=index_var_out_file)
                var_num += 1

            # Output haplotypes (genotype_genome.haplotype)
            for haplotype in haplotypes:
                ht_left, ht_right, ht_vars = haplotype
                new_ht_left = ht_left + left + off
                assert new_ht_left < len(out_chr_seq)
                new_ht_right = ht_right + left + off
                assert new_ht_left <= new_ht_right
                assert new_ht_right <= len(out_chr_seq)
                new_ht_vars = []
                for var_id in ht_vars:
                    assert var_id in varID2htID
                    new_ht_vars.append(varID2htID[var_id])
                print("ht%d\t%s\t%d\t%d\t%s" \
                        % (haplotype_num,
                           chr_name,
                           new_ht_left,
                           new_ht_right,
                           ','.join(new_ht_vars)),
                      file=haplotype_out_file)
                haplotype_num += 1

            # Output link information between alleles and variants (genotype_genome.link)
            for link in links:
                var_id, allele_names = link
                if var_id not in varID2htID:
                    continue
                new_var_id = varID2htID[var_id]
                print("%s\t%s" % (new_var_id, " ".join(allele_names)),
                      file=link_out_file)

            off += (length - prev_length)
            prev_right = right + 1

        if not graph_index:
            continue

        # Write the rest of the Vars
        chr_genotype_vari, \
          chr_genotype_hti, \
          haplotype_num \
            = add_vars(sys.maxsize,
                       sys.maxsize,
                       chr_genotype_vari,
                       chr_genotype_hti,
                       haplotype_num)

        print("%s\t%d\t%d\t%d" \
                % (chr_name,
                   len(out_chr_seq),
                   prev_right,
                   len(chr_seq) - prev_right),
              file=coord_out_file)
        out_chr_seq += chr_seq[prev_right:]

        assert len(out_chr_seq) == len(chr_seq) + off

        # Output chromosome sequence
        print(">%s" % (chr_full_name), file=genome_out_file)
        line_width = 60
        for s in range(0, len(out_chr_seq), line_width):
            print(out_chr_seq[s:s + line_width], file=genome_out_file)

    genome_out_file.close()
    locus_out_file.close()
    var_out_file.close()
    index_var_out_file.close()
    haplotype_out_file.close()
    link_out_file.close()
    coord_out_file.close()
    clnsig_out_file.close()

    # Merge the per-database allele lists and partial-allele lists, tagging
    # every name with its upper-cased database.  Both files are created
    # (empty) even when no graph index is built.
    for suffix in ("allele", "partial"):
        with open("%s.%s" % (base_fname, suffix), 'w') as merged_out_file:
            if graph_index:
                for database in database_list:
                    with open("%s.%s" % (database, suffix)) as database_file:
                        for line in database_file:
                            allele_name = line.strip()
                            print("%s\t%s" % (database.upper(), allele_name),
                                  file=merged_out_file)

    if not graph_index:
        shutil.copyfile("genome.fa", "%s.fa" % base_fname)

    # Index genotype_genome.fa
    index_cmd = ["samtools", "faidx", "%s.fa" % base_fname]
    subprocess.call(index_cmd)

    # Build indexes based on the above information
    if graph_index:
        assert aligner == "hisat2"
        build_cmd = [
            "hisat2-build", "-p",
            str(threads), "--snp",
            "%s.index.snp" % base_fname, "--haplotype",
            "%s.haplotype" % base_fname,
            "%s.fa" % base_fname,
            "%s" % base_fname
        ]
    else:
        assert aligner in ["hisat2", "bowtie2"]
        build_cmd = [
            "%s-build" % aligner, "-p" if aligner == "hisat2" else "--threads",
            str(threads),
            "%s.fa" % base_fname,
            "%s" % base_fname
        ]
    if verbose:
        print("\tRunning:", ' '.join(build_cmd), file=sys.stderr)

    with open("/dev/null", 'w') as devnull:
        subprocess.call(build_cmd, stdout=devnull, stderr=devnull)

    # The build output is discarded, so success is judged by the presence
    # of the index files.
    if aligner == "hisat2":
        index_fnames = ["%s.%d.ht2" % (base_fname, i + 1) for i in range(8)]
    else:
        index_fnames = ["%s.%d.bt2" % (base_fname, i + 1) for i in range(4)]
        index_fnames += [
            "%s.rev.%d.bt2" % (base_fname, i + 1) for i in range(2)
        ]
    if not typing_common.check_files(index_fnames):
        print("Error: indexing failed! "\
               "Perhaps, you may have forgotten to build %s executables?" \
                    % aligner,
              file=sys.stderr)
        sys.exit(1)
def genotyping(read_dir, reference_type, region_list, num_editdist, nthreads,
               max_sample, assembly, out_dir, verbose, platinum_check):
    """Genotype the extracted read files in ``read_dir``; optionally check
    the calls against the CEPH pedigree.

    For every database in ``region_list`` the database files and the HISAT2
    graph index are prepared through ``typing_common``.  ``nthreads``
    ``myThread`` workers are then started over the shared list of
    ``*.extracted.1.fq.gz`` files and joined.  The workers share the
    ``genotype_results`` list, which is read afterwards as
    ``(genome, allele, abundance)`` entries.

    With ``platinum_check`` set, the first two alleles reported per genome
    and region are compared with every father/mother allele pairing taken
    from ``CEPH_pedigree``, and a per-region ``included / total`` summary is
    written to stderr.

    Args:
        read_dir: Directory searched for ``*.extracted.1.fq.gz`` files.
        reference_type: Forwarded unchanged to every worker.
        region_list: Database names to extract and index; also forwarded
            to the workers.
        num_editdist: Forwarded unchanged to every worker.
        nthreads: Number of worker threads to start.
        max_sample: Forwarded unchanged to every worker.
        assembly: Forwarded unchanged to every worker.
        out_dir: Output directory; created when non-empty and missing.
        verbose: Forwarded to index building and to the workers.
        platinum_check: Run the pedigree consistency report when true.

    Exits the process with status 1 when ``read_dir`` does not exist.
    """
    for database_name in region_list:
        # Extract variants, backbone sequence, and other sequences
        typing_common.extract_database_if_not_exists(database_name,
                                                     [])  # locus_list
        # Build HISAT2's graph index
        typing_common.build_index_if_not_exists(
            database_name,
            "hisat2",
            "graph",
            1,  # threads
            verbose)

    if not os.path.exists(read_dir):
        print("Error: %s does not exist." % read_dir, file=sys.stderr)
        sys.exit(1)

    if out_dir != "" and not os.path.exists(out_dir):
        # makedirs also creates missing parent directories.
        os.makedirs(out_dir)

    # fastq files
    fq_fnames = glob.glob("%s/*.extracted.1.fq.gz" % read_dir)

    # Shared with every worker and read back once all of them are joined.
    genotype_results = []

    lock = threading.Lock()
    threads = []
    for _ in range(nthreads):
        thread = myThread(lock, fq_fnames, reference_type, region_list,
                          num_editdist, max_sample, assembly, out_dir,
                          genotype_results, verbose)
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    if not platinum_check:
        return

    # region -> genome -> up to two [allele, abundance] entries, kept in the
    # order in which they appear in genotype_results.
    genotype_dic = {}
    for genome, allele, abundance in genotype_results:
        region, _ = allele.split('*')
        genome_entries = genotype_dic.setdefault(region, {}).setdefault(
            genome, [])
        if len(genome_entries) >= 2:
            continue
        # DK - debugging purposes
        # if abundance < 0.15 * 100:
        #    continue
        genome_entries.append([allele, abundance])

    for region, region_genotype in genotype_dic.items():
        print(region, file=sys.stderr)
        included = 0
        total = 0
        for genome, genome_alleles in region_genotype.items():
            genome_alleles = {allele for allele, _ in genome_alleles}
            pedigree = CEPH_pedigree[genome]
            if "father" in pedigree:
                assert "mother" in pedigree
                parents = [pedigree["father"], pedigree["mother"]]
            else:
                parents = []

            # Every pairing of one paternal and one maternal allele; filled
            # only when both parents were genotyped for this region.
            parent_allele_sets = []
            if len(parents) == 2 \
                    and parents[0] in region_genotype \
                    and parents[1] in region_genotype:
                for parent_allele, _ in region_genotype[parents[0]]:
                    for parent_allele2, _ in region_genotype[parents[1]]:
                        parent_allele_sets.append(
                            {parent_allele, parent_allele2})
            # Pass the fields as separate arguments: printing them as one
            # tuple shows the tuple's repr, including a literal '\t'.
            print("\t", genome, genome_alleles, parent_allele_sets,
                  file=sys.stderr)
            if len(parent_allele_sets) > 0:
                total += 1
                if genome_alleles in parent_allele_sets:
                    included += 1
        print("\t%d / %d" % (included, total), file=sys.stderr)
Example #4
0
def build_genotype_genome(base_fname,                          
                          inter_gap,
                          intra_gap,
                          threads,
                          database_list,
                          use_clinvar,
                          use_commonvar,
                          verbose):    
    # Download HISAT2 index
    HISAT2_fnames = ["grch38",
                     "genome.fa",
                     "genome.fa.fai"]
    if not typing_common.check_files(HISAT2_fnames):
        typing_common.download_genome_and_index()

    # Load genomic sequences
    chr_dic, chr_names, chr_full_names = typing_common.read_genome(open("genome.fa"))

    genotype_vars, genotype_haplotypes, genotype_clnsig = {}, {}, {}
    if use_clinvar:
        # Extract variants from the ClinVar database
        CLINVAR_fnames = ["clinvar.vcf.gz",
                          "clinvar.snp",
                          "clinvar.haplotype",
                          "clinvar.clnsig"]

        if not typing_common.check_files(CLINVAR_fnames):
            if not os.path.exists("clinvar.vcf.gz"):
                os.system("wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/archive/2017/clinvar_20170404.vcf.gz")
            assert os.path.exists("clinvar.vcf.gz")

            extract_cmd = ["hisat2_extract_snps_haplotypes_VCF.py"]
            extract_cmd += ["--inter-gap", str(inter_gap),
                            "--intra-gap", str(intra_gap),
                            "--genotype-vcf", "clinvar.vcf.gz",
                            "genome.fa", "/dev/null", "clinvar"]
            if verbose:
                print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
            proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
            proc.communicate()
            if not typing_common.check_files(CLINVAR_fnames):
                print >> sys.stderr, "Error: extract variants from clinvar failed!"
                sys.exit(1)

        # Read variants to be genotyped
        genotype_vars = typing_common.read_variants("clinvar.snp")

        # Read haplotypes
        genotype_haplotypes = typing_common.read_haplotypes("clinvar.haplotype")

        # Read information about clinical significance
        genotype_clnsig = typing_common.read_clnsig("clinvar.clnsig")

    if use_commonvar:
        # Extract variants from dbSNP database
        commonvar_fbase = "snp144Common"
        commonvar_fnames = ["%s.snp" % commonvar_fbase,
                            "%s.haplotype" % commonvar_fbase]
        if not typing_common.check_files(commonvar_fnames):
            if not os.path.exists("%s.txt.gz" % commonvar_fbase):
                os.system("wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/%s.txt.gz" % commonvar_fbase)
            assert os.path.exists("%s.txt.gz" % commonvar_fbase)
            os.system("gzip -cd %s.txt.gz | awk 'BEGIN{OFS=\"\t\"} {if($2 ~ /^chr/) {$2 = substr($2, 4)}; if($2 == \"M\") {$2 = \"MT\"} print}' > %s.txt" % (commonvar_fbase, commonvar_fbase))
            extract_cmd = ["hisat2_extract_snps_haplotypes_UCSC.py",
                           "--inter-gap", str(inter_gap),
                           "--intra-gap", str(intra_gap),
                           "genome.fa", "%s.txt" % commonvar_fbase, commonvar_fbase]
            if verbose:
                print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
            proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
            proc.communicate()
            if not typing_common.check_files(commonvar_fnames):
                print >> sys.stderr, "Error: extract variants from clinvar failed!"
                sys.exit(1)

        # Read variants to be genotyped
        genotype_vars = typing_common.read_variants("%s.snp" % commonvar_fbase)

        # Read haplotypes
        genotype_haplotypes = typing_common.read_haplotypes("%s.haplotype" % commonvar_fbase)

    # Genes to be genotyped
    genotype_genes = {}

    # Read genes or genomics regions
    for database_name in database_list:
        # Extract HLA variants, backbone sequence, and other sequeces
        typing_common.extract_database_if_not_exists(database_name,
                                                     [],            # locus_list
                                                     inter_gap,
                                                     intra_gap,
                                                     True,          # partial?
                                                     verbose)
        locus_fname = "%s.locus" % database_name
        assert os.path.exists(locus_fname)
        for line in open(locus_fname):
            HLA_name, chr, left, right, length, exon_str, strand = line.strip().split()
            left, right = int(left), int(right)
            length = int(length)
            if chr not in chr_names:
                continue
            if chr not in genotype_genes:
                genotype_genes[chr] = []
            genotype_genes[chr].append([left, right, length, HLA_name, database_name, exon_str, strand])

    # Write genotype genome
    var_num, haplotype_num = 0, 0
    genome_out_file = open("%s.fa" % base_fname, 'w')
    locus_out_file = open("%s.locus" % base_fname, 'w')
    var_out_file = open("%s.snp" % base_fname, 'w')
    index_var_out_file = open("%s.index.snp" % base_fname, 'w')
    haplotype_out_file = open("%s.haplotype" % base_fname, 'w')
    link_out_file = open("%s.link" % base_fname, 'w')
    coord_out_file = open("%s.coord" % base_fname, 'w')
    clnsig_out_file = open("%s.clnsig" % base_fname, 'w')
    for c in range(len(chr_names)):
        chr = chr_names[c]
        chr_full_name = chr_full_names[c]
        assert chr in chr_dic
        chr_seq = chr_dic[chr]
        chr_len = len(chr_seq)
        if chr in genotype_genes:
            chr_genes = genotype_genes[chr]
            def gene_cmp(a, b):
                a_left, a_right, a_length = a[:3]
                b_left, b_right, b_length = b[:3]
                if a_left != b_left:
                    return a_left - b_left
                if a_right != b_right:
                    return a_right - b_right
                return a_lenght - b_length
            chr_genes = sorted(chr_genes, cmp=gene_cmp)
        else:
            chr_genes = []

        chr_genotype_vars, chr_genotype_vari = [], 0
        if chr in genotype_vars:
            chr_genotype_vars = genotype_vars[chr]
        chr_genotype_haplotypes, chr_genotype_hti = [], 0
        if chr in genotype_haplotypes:
            chr_genotype_haplotypes = genotype_haplotypes[chr]

        def add_vars(left, right, chr_genotype_vari, chr_genotype_hti, haplotype_num):
            # Output variants with clinical significance
            while chr_genotype_vari < len(chr_genotype_vars):
                var_left, var_type, var_data, var_id =  chr_genotype_vars[chr_genotype_vari]
                var_right = var_left
                if var_type == "deletion":
                    var_right += var_data
                if var_right > right:
                    break
                if var_right >= left:
                    chr_genotype_vari += 1
                    continue

                out_str = "%s\t%s\t%s\t%d\t%s" % (var_id, var_type, chr, var_left + off, var_data)
                print >> var_out_file, out_str
                print >> index_var_out_file, out_str

                if var_id in genotype_clnsig:
                    var_gene, clnsig = genotype_clnsig[var_id]
                    print >> clnsig_out_file, "%s\t%s\t%s" % \
                        (var_id, var_gene, clnsig)
                
                chr_genotype_vari += 1

            # Output haplotypes
            while chr_genotype_hti < len(chr_genotype_haplotypes):
                ht_left, ht_right, ht_vars =  chr_genotype_haplotypes[chr_genotype_hti]
                if ht_right > right:
                    break
                if ht_right >= left:
                    chr_genotype_hti += 1
                    continue

                print >> haplotype_out_file, "ht%d\t%s\t%d\t%d\t%s" % \
                    (haplotype_num, chr, ht_left + off, ht_right + off, ','.join(ht_vars))
                chr_genotype_hti += 1
                haplotype_num += 1

            return chr_genotype_vari, chr_genotype_hti, haplotype_num

        out_chr_seq = ""
        
        off = 0
        prev_right = 0
        for gene in chr_genes:
            left, right, length, name, family, exon_str, strand = gene

            chr_genotype_vari, chr_genotype_hti, haplotype_num = add_vars(left, right, chr_genotype_vari, chr_genotype_hti, haplotype_num)

            # Read HLA backbone sequences
            allele_seqs = typing_common.read_allele_sequences("%s_backbone.fa" % family)

            # Read HLA variants
            allele_vars = typing_common.read_variants("%s.snp" % family)
            allele_index_vars = typing_common.read_variants("%s.index.snp" % family)
                
            # Read HLA haplotypes
            allele_haplotypes = typing_common.read_haplotypes("%s.haplotype" % family)

            # Read HLA link information between haplotypes and variants
            links = typing_common.read_links("%s.link" % family)

            if name not in allele_seqs or \
                    name not in allele_vars or \
                    name not in allele_haplotypes:
                continue
            allele_seq = allele_seqs[name]
            vars, index_vars = allele_vars[name], allele_index_vars[name]
            index_var_ids = set()
            for _, _, _, var_id in index_vars:
                index_var_ids.add(var_id)

            haplotypes = allele_haplotypes[name]
            assert length == len(allele_seq)
            assert left < chr_len and right < chr_len
            # Skipping overlapping genes
            if left < prev_right:
                print >> sys.stderr, "Warning: skipping %s ..." % (name)
                continue

            varID2htID = {}

            assert left < right
            prev_length = right - left + 1
            assert prev_length <= length

            if prev_right < left:
                out_chr_seq += chr_seq[prev_right:left]

            # Output gene (genotype_genome.gene)
            print >> locus_out_file, "%s\t%s\t%s\t%d\t%d\t%s\t%s" % \
                (family.upper(), name, chr, len(out_chr_seq), len(out_chr_seq) + length - 1, exon_str, strand)

            # Output coord (genotype_genome.coord)
            print >> coord_out_file, "%s\t%d\t%d\t%d" % \
                (chr, len(out_chr_seq), left, right - left + 1)
            out_chr_seq += allele_seq

            # Output variants (genotype_genome.snp and genotype_genome.index.snp)
            for var in vars:
                var_left, var_type, var_data, var_id = var
                new_var_id = "hv%d" % var_num
                varID2htID[var_id] = new_var_id
                new_var_left = var_left + left + off
                assert var_type in ["single", "deletion", "insertion"]
                assert new_var_left < len(out_chr_seq)
                if var_type == "single":                    
                    assert out_chr_seq[new_var_left] != var_data
                elif var_type == "deletion":
                    assert new_var_left + var_data <= len(out_chr_seq)
                else:
                    assert var_type == "insertion"

                out_str = "%s\t%s\t%s\t%d\t%s" % (new_var_id, var_type, chr, new_var_left, var_data)
                print >> var_out_file, out_str
                if var_id in index_var_ids:
                    print >> index_var_out_file, out_str
                var_num += 1
                
            # Output haplotypes (genotype_genome.haplotype)
            for haplotype in haplotypes:
                ht_left, ht_right, ht_vars = haplotype
                new_ht_left = ht_left + left + off
                assert new_ht_left < len(out_chr_seq)
                new_ht_right = ht_right + left + off
                assert new_ht_left <= new_ht_right
                assert new_ht_right <= len(out_chr_seq)
                new_ht_vars = []
                for var_id in ht_vars:
                    assert var_id in varID2htID
                    new_ht_vars.append(varID2htID[var_id])
                print >> haplotype_out_file, "ht%d\t%s\t%d\t%d\t%s" % \
                    (haplotype_num, chr, new_ht_left, new_ht_right, ','.join(new_ht_vars))
                haplotype_num += 1

            # Output link information between alleles and variants (genotype_genome.link)
            for link in links:
                var_id, allele_names = link
                if var_id not in varID2htID:
                    continue
                new_var_id = varID2htID[var_id]
                print >> link_out_file, "%s\t%s" % (new_var_id, allele_names)
                
            off += (length - prev_length)

            prev_right = right + 1

        # Write the rest of the Vars
        chr_genotype_vari, chr_genotype_hti, haplotype_num = add_vars(sys.maxint, sys.maxint, chr_genotype_vari, chr_genotype_hti, haplotype_num)            
            
        print >> coord_out_file, "%s\t%d\t%d\t%d" % \
            (chr, len(out_chr_seq), prev_right, len(chr_seq) - prev_right)
        out_chr_seq += chr_seq[prev_right:]

        assert len(out_chr_seq) == len(chr_seq) + off

        # Output chromosome sequence
        print >> genome_out_file, ">%s" % (chr_full_name)
        line_width = 60
        for s in range(0, len(out_chr_seq), line_width):
            print >> genome_out_file, out_chr_seq[s:s+line_width]

    genome_out_file.close()
    locus_out_file.close()
    var_out_file.close()
    index_var_out_file.close()
    haplotype_out_file.close()
    link_out_file.close()
    coord_out_file.close()
    clnsig_out_file.close()

    partial_out_file = open("%s.partial" % base_fname, 'w')
    for database in database_list:
        for line in open("%s.partial" % database):
            allele_name = line.strip()
            print >> partial_out_file, "%s\t%s" % (database.upper(), allele_name)
    partial_out_file.close()

    # Index genotype_genome.fa
    index_cmd = ["samtools", "faidx", "%s.fa" % base_fname]
    subprocess.call(index_cmd)

    # Build HISAT-genotype graph indexes based on the above information
    hisat2_index_fnames = ["%s.%d.ht2" % (base_fname, i+1) for i in range(8)]
    build_cmd = ["hisat2-build",
                 "-p", str(threads),
                 "--snp", "%s.index.snp" % base_fname,
                 "--haplotype", "%s.haplotype" % base_fname,
                 "%s.fa" % base_fname,
                 "%s" % base_fname]
    if verbose:
        print >> sys.stderr, "\tRunning:", ' '.join(build_cmd)
        
    subprocess.call(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
    if not typing_common.check_files(hisat2_index_fnames):
        print >> sys.stderr, "Error: indexing failed!  Perhaps, you may have forgotten to build hisat2 executables?"
        sys.exit(1)