Ejemplo n.º 1
0
def main():
    """Run the simulation for a positional slice of the selected VNTR ids.

    When the external ``hg38`` flag is set, the VNTRs come from the hg38
    Illumina database and only those whose pattern length is between 6 and
    100 (inclusive) are selected. Otherwise the default database is loaded
    and the ids come from ``get_tested_vntrs()``.

    ``sys.argv[2]`` and ``sys.argv[3]`` give the 1-based, inclusive first and
    last positions (within the selected id list) to simulate.
    """
    if hg38:
        reference_vntrs = load_unique_vntrs_data(
            'vntr_data/hg38_selected_VNTRs_Illumina.db')
        vntr_map = {}
        vntr_ids = []
        for ref_vntr in reference_vntrs:
            vntr_map[ref_vntr.id] = ref_vntr
            if 6 <= len(ref_vntr.pattern) <= 100:
                vntr_ids.append(ref_vntr.id)
    else:
        reference_vntrs = load_unique_vntrs_data()
        vntr_map = {ref_vntr.id: ref_vntr for ref_vntr in reference_vntrs}

        from advntr.advntr_commands import get_tested_vntrs
        vntr_ids = get_tested_vntrs()

    print('len of reference_vntrs:', len(reference_vntrs))
    print('# of vntrs: %s' % len(vntr_ids))

    first = int(sys.argv[2])
    last = int(sys.argv[3])

    # Positions are counted from 1, matching the command-line bounds.
    for position, vid in enumerate(vntr_ids, start=1):
        if first <= position <= last:
            run_simulation(vntr_map, vid)
Ejemplo n.º 2
0
def create_illumina_genotyping_references(illumina_read_dir='../Genotyping/'):
    """Write one reference region per (gene, repeat count) for three VNTRs.

    For each of the hard-coded VNTR ids (GP1BA, CSTB, MAOA) and each repeat
    count in that gene's range, calls
    ``create_reference_region_with_specific_repeats`` with the gene's repeat
    patterns and the constant 3000, writing to
    ``<illumina_read_dir><gene>/<repeat>.pacfa``.

    :param illumina_read_dir: output directory prefix (expected to end in '/')
    """
    reference_vntrs = load_unique_vntrs_data()
    id_to_gene = {1220: 'GP1BA', 1221: 'CSTB', 1214: 'MAOA'}
    repeats = {'GP1BA': range(1, 5), 'CSTB': range(1, 16), 'MAOA': range(1, 6)}
    repeats_patterns = {
        'GP1BA': [
            'AGCCCGACCACCCCAGAGCCCACCTCAGAGCCCGCCCCC',
            'AGCCCGACCACCCCGGAGCCCACCTCAGAGCCCGCCCCC',
            'AGCCCGACCACCCCGGAGCCCACCCCAATCCCGACCATCGCCA',
        ],
        'CSTB': [
            'CGCGGGGCGGGG',
            'CGCGGGGCGGGG',
            'CGCGGGGCGGGG',
            'CGGCGGGCGGGG',
        ],
        'MAOA': [
            'ACCGGCACCGGCACCAGTACCCGCACCAGT',
            'ACCGGCACCGGCACCGAGCGCAAGGCGGAG',
            'ACCGGCACCGGCACCAGTACCCGCACCAGT',
        ],
    }

    for vntr_id, gene in id_to_gene.items():
        gene_patterns = repeats_patterns[gene]
        for repeat in repeats[gene]:
            outfile = ''.join(
                [illumina_read_dir, gene, '/', str(repeat), '.pacfa'])
            create_reference_region_with_specific_repeats(
                reference_vntrs[vntr_id], repeat, outfile, 3000,
                gene_patterns)
Ejemplo n.º 3
0
def generate_pairwise_aln(log_file,
                          aln_file,
                          ref_vntr_db=None,
                          vntr_ids=None,
                          sort_by_repeat=True):
    """
    Generate pairwise alignment for each spanning reads

    When ``log_file`` is a directory, every ``log_*.log`` file inside it is
    processed and each output is named after its log file (base name up to the
    first '.', plus ".aln"). When ``log_file`` is a single file, the output
    goes to ``aln_file`` if given, otherwise to a name derived the same way.

    :param log_file: a log file or a directory
    :param aln_file: output file name; must be None when log_file is a
        directory (otherwise an error is printed and the process exits with -1)
    :param ref_vntr_db: reference VNTR database
    :param vntr_ids: VNTR id list that you want to generate alignment
    :param sort_by_repeat: if True, the reads will be sorted by the number of repeats
    """
    # Load reference VNTRs
    reference_vntrs = load_unique_vntrs_data(ref_vntr_db)
    ref_vntrs = {ref_vntr.id: ref_vntr for ref_vntr in reference_vntrs}

    if os.path.isdir(log_file):
        # Validate once, up front, instead of once per log file: a single
        # output name cannot hold the alignments of several logs.
        if aln_file is not None:
            print(
                "ERROR: If log file is given as a directory, output name should be None"
            )
            exit(-1)
        for lf in glob.glob(log_file + "/log_*.log"):
            out_file = lf.split("/")[-1].split(".")[0] + ".aln"
            _generate_pairwise_aln(lf, out_file, ref_vntrs, vntr_ids,
                                   sort_by_repeat)
    else:
        if aln_file is None:
            out_file = log_file.split("/")[-1].split(".")[0] + ".aln"
        else:
            # Previously out_file was left unbound here, so passing an
            # explicit output name raised UnboundLocalError.
            out_file = aln_file
        _generate_pairwise_aln(log_file, out_file, ref_vntrs, vntr_ids,
                               sort_by_repeat)
Ejemplo n.º 4
0
def get_flakning_region_error_rate(log_file, out_file, ref_vntr_db, vntr_ids):
    """Write per-VNTR flanking-region accuracy figures to ``out_file``.

    Error counts and base-pair counts are obtained from
    ``_get_flanking_region_error_rate``. When ``log_file`` is a directory the
    counts of every ``log_*.log`` file inside it are summed; otherwise the
    counts of the single log file are used as returned.

    Each output line has the form
    ``VID:<id> REFRC:<estimated_repeats> <repeat>:<left>/<right> ...`` where
    left/right are ``1 - errors / base pairs`` for that flank, printed with
    two decimals and ordered by repeat count.

    :param log_file: a log file or a directory of log files
    :param out_file: path of the report to write
    :param ref_vntr_db: reference VNTR database passed to the loader
    :param vntr_ids: forwarded to ``_get_flanking_region_error_rate``
    """
    # Load reference VNTRs, keyed by id
    ref_vntrs = {
        ref_vntr.id: ref_vntr
        for ref_vntr in load_unique_vntrs_data(ref_vntr_db)
    }

    def _nested_counter():
        # vid -> repeat count -> flank name -> int
        return defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    if os.path.isdir(log_file):
        err_totals = _nested_counter()
        bp_totals = _nested_counter()
        for lf in glob.glob(log_file + "/log_*.log"):
            err_counts, bp_counts = _get_flanking_region_error_rate(
                lf, ref_vntrs, vntr_ids)
            for vid, bp_by_repeat in bp_counts.items():
                for repeat_count, bp_by_flank in bp_by_repeat.items():
                    for flanking, bp_value in bp_by_flank.items():
                        err_value = err_counts[vid][repeat_count][flanking]
                        err_totals[vid][repeat_count][flanking] += err_value
                        bp_totals[vid][repeat_count][flanking] += bp_value
    else:
        err_totals, bp_totals = _get_flanking_region_error_rate(
            log_file, ref_vntrs, vntr_ids)

    with open(out_file, "w") as of:
        for vid in bp_totals.keys():
            of.write("VID:{} ".format(vid))
            of.write("REFRC:{} ".format(ref_vntrs[vid].estimated_repeats))
            for repeat_count in sorted(bp_totals[vid].keys()):
                of.write("{}:".format(repeat_count))
                left_accuracy = 1 - (
                    float(err_totals[vid][repeat_count]['left']) /
                    bp_totals[vid][repeat_count]['left'])
                right_accuracy = 1 - (
                    float(err_totals[vid][repeat_count]['right']) /
                    bp_totals[vid][repeat_count]['right'])
                of.write("{:.2f}/{:.2f} ".format(left_accuracy,
                                                 right_accuracy))
            of.write("\n")
Ejemplo n.º 5
0
def create_pacbio_copy_number_variation_references(
        pacbio_read_dir='../pacbio_recruitment/set1/'):
    """Write reference regions over a range of repeat counts for three VNTRs.

    For the hard-coded VNTR ids (CSTB, HIC1, INS), calls
    ``create_reference_region_with_specific_repeats`` with the constant 1000
    for every kept repeat count, writing to
    ``<pacbio_read_dir><gene>/<repeat>.fa``. INS keeps only multiples of 5,
    CSTB only even counts, HIC1 every count in its range.

    :param pacbio_read_dir: output directory prefix (expected to end in '/')
    """
    reference_vntrs = load_unique_vntrs_data()
    id_to_gene = {1221: 'CSTB', 1216: 'HIC1', 1215: 'INS'}
    repeats = {
        'CSTB': range(1, 69),
        'HIC1': range(2, 36),
        'INS': range(10, 171),
    }
    # Genes listed here only keep repeat counts divisible by the given step.
    sampling_step = {'INS': 5, 'CSTB': 2}

    for vntr_id, gene in id_to_gene.items():
        step = sampling_step.get(gene, 1)
        for repeat in repeats[gene]:
            if repeat % step != 0:
                continue
            outfile = ''.join([pacbio_read_dir, gene, '/', str(repeat), '.fa'])
            create_reference_region_with_specific_repeats(
                reference_vntrs[vntr_id], repeat, outfile, 1000)
Ejemplo n.º 6
0
def view_model(args, viewmodel_parser):
    """Print the reference VNTR models matching the gene/pattern filters.

    :param args: parsed arguments; reads ``args.pattern`` (optional string)
        and ``args.gene`` (comma-separated gene names, may be empty)
    :param viewmodel_parser: parser handed to ``print_error`` when the
        pattern contains a character other than A, C, G or T

    Gene names and the pattern are compared upper-cased. An empty gene list
    or a missing pattern disables the corresponding filter.
    """
    wanted_pattern = args.pattern.upper() if args.pattern else None
    if wanted_pattern is not None:
        # One error report per distinct invalid character.
        for _ in set(wanted_pattern) - {'A', 'C', 'G', 'T'}:
            print_error(viewmodel_parser,
                        'Pattern should only contain A, C, G, T')

    genes = [gene.upper() for gene in args.gene.split(',') if gene]
    results = [
        ref_vntr for ref_vntr in load_unique_vntrs_data()
        if (not genes or ref_vntr.gene_name in genes) and
        (wanted_pattern is None or ref_vntr.pattern == wanted_pattern)
    ]
    print_models(results)
Ejemplo n.º 7
0
def create_pacbio_coverage_data_for_3_genes_and_10_cn(
        pacbio_read_dir='../pacbio_coverage_experiment/'):
    """Write reference regions for the coverage experiment.

    Iterates the hard-coded VNTR ids (CSTB, HIC1, INS) and their repeat
    ranges, but the filters below currently let only INS through, at repeat
    counts that are multiples of 5. Each kept combination is passed to
    ``create_reference_region_with_specific_repeats`` with the constant 3000
    and written to ``<pacbio_read_dir><gene>/<repeat>.fa``.

    :param pacbio_read_dir: output directory prefix (expected to end in '/')
    """
    reference_vntrs = load_unique_vntrs_data()
    id_to_gene = {1221: 'CSTB', 1216: 'HIC1', 1215: 'INS'}
    repeats = {
        'CSTB': range(2, 42),
        'HIC1': range(2, 22),
        'INS': range(10, 110),
    }

    for vntr_id, gene in id_to_gene.items():
        for repeat in repeats[gene]:
            skipped = ((gene == 'INS' and repeat % 5 != 0) or
                       (gene == 'CSTB' and repeat % 2 != 0) or
                       gene != 'INS')
            if skipped:
                continue
            outfile = ''.join([pacbio_read_dir, gene, '/', str(repeat), '.fa'])
            create_reference_region_with_specific_repeats(
                reference_vntrs[vntr_id], repeat, outfile, 3000)
Ejemplo n.º 8
0
def create_pacbio_ru_length_data_for_all_vntrs(
        pacbio_read_dir='../pacbio_ru_data_for_all_vntrs/'):
    """Write reference regions grouped by repeat-unit (pattern) length.

    VNTR indices listed in ``vntr_complex.txt`` (first whitespace-separated
    field of each line), plus index 0, are excluded. The remaining indices
    are grouped by pattern length, keeping at most four per length; lengths
    with fewer than two VNTRs are ignored. For each kept VNTR, repeat counts
    that are multiples of 5 within a 21-wide window starting at
    ``max(3, original_repeats - 10)`` are passed to
    ``create_reference_region_with_specific_repeats`` with the constant 1000
    and written to ``<pacbio_read_dir><ru>/vntr_id_<id>_<repeat>.fa``.
    Output directories are created when missing.

    :param pacbio_read_dir: output directory prefix (expected to end in '/')
    """
    import os

    reference_vntrs = load_unique_vntrs_data()

    with open('vntr_complex.txt') as infile:
        complex_vntrs = [int(row.strip().split()[0]) for row in infile]
    complex_vntrs.append(0)

    # pattern length -> up to four VNTR indices with that pattern length
    repeat_units = {}
    for vntr_id in range(len(reference_vntrs)):
        if vntr_id in complex_vntrs:
            continue
        ru = len(reference_vntrs[vntr_id].pattern)
        members = repeat_units.setdefault(ru, [])
        if len(members) < 4:
            members.append(vntr_id)

    for ru, members in repeat_units.items():
        if len(members) < 2:
            continue
        for vntr_id in members:
            ref_vntr = reference_vntrs[vntr_id]
            original_repeats = len(ref_vntr.get_repeat_segments())
            window_start = max(3, original_repeats - 10)
            for repeat in range(window_start, window_start + 21):
                if repeat % 5 != 0:
                    continue
                outfile = ''.join([
                    pacbio_read_dir, str(ru), '/vntr_id_', str(vntr_id), '_',
                    str(repeat), '.fa'
                ])
                target_dir = os.path.dirname(outfile)
                if not os.path.exists(target_dir):
                    os.makedirs(target_dir)
                create_reference_region_with_specific_repeats(
                    reference_vntrs[vntr_id], repeat, outfile, 1000)
Ejemplo n.º 9
0
def create_illumina_copy_number_variation_references(
        illumina_read_dir='../Illumina_copy_number/'):
    """Write reference regions over a range of repeat counts for five VNTRs.

    For each hard-coded VNTR id (DRD4, GP1BA, CSTB, MAOA, IL1RN) and each
    repeat count in that gene's range, calls
    ``create_reference_region_with_specific_repeats`` with the constant 149,
    writing to ``<illumina_read_dir><gene>/<repeat>.fa``.

    :param illumina_read_dir: output directory prefix (expected to end in '/')
    """
    reference_vntrs = load_unique_vntrs_data()
    id_to_gene = {
        119: 'DRD4',
        1220: 'GP1BA',
        1221: 'CSTB',
        1214: 'MAOA',
        1219: 'IL1RN',
    }
    repeats = {
        'DRD4': range(1, 12),
        'GP1BA': range(1, 6),
        'CSTB': range(1, 16),
        'MAOA': range(1, 6),
        'IL1RN': range(1, 10),
    }

    for vntr_id, gene in id_to_gene.items():
        for repeat in repeats[gene]:
            outfile = ''.join(
                [illumina_read_dir, gene, '/', str(repeat), '.fa'])
            create_reference_region_with_specific_repeats(
                reference_vntrs[vntr_id], repeat, outfile, 149)
Ejemplo n.º 10
0
def genotype(args, genotype_parser):
    """Validate the genotype arguments, configure ``settings`` and run analysis.

    :param args: parsed command-line arguments; the attributes read are
        alignment_file, fasta, nanopore, pacbio, threads, expansion, coverage,
        working_directory, outfile, models, vntr_id, outfmt, haploid,
        reference_filename, frameshift, update and naive
    :param genotype_parser: parser handed to ``print_error`` on invalid input

    Side effects visible in this function: sets ``settings.MAX_ERROR_RATE``,
    ``settings.CORES``, ``settings.TRAINED_MODELS_DB`` and
    ``settings.TRAINED_HMMS_DIR``; configures logging to
    ``<working_directory>/log_<input basename>.log``; and, when
    ``args.outfile`` is set, replaces ``sys.stdout`` with that file.
    """
    no_input = args.alignment_file is None and args.fasta is None
    if no_input:
        print_error(
            genotype_parser,
            'No input specified. Please specify alignment file or fasta file')

    # The nanopore and pacbio flags both select the 0.3 error rate.
    uses_high_error_rate = args.nanopore or args.pacbio
    settings.MAX_ERROR_RATE = 0.3 if uses_high_error_rate else 0.05

    if args.threads < 1:
        print_error(genotype_parser, 'threads cannot be less than 1')
    settings.CORES = args.threads

    if args.expansion and args.coverage is None:
        print_error(
            genotype_parser,
            'Please specify the average coverage to identify the expansion')
    average_coverage = None
    if args.expansion:
        average_coverage = args.coverage

    input_file = args.alignment_file or args.fasta
    input_is_alignment_file = input_file.endswith(('bam', 'sam', 'cram'))
    if not input_is_alignment_file:
        print_error(
            genotype_parser,
            "The input file format is not supported. Please use BAM/CRAM files."
        )
    if args.working_directory is None:
        print_error(
            genotype_parser,
            'Please specify working directory by -wd or --working_directory')
    base_directory = args.working_directory or os.path.dirname(input_file)
    working_directory = base_directory + '/'

    log_file = working_directory + 'log_%s.log' % os.path.basename(input_file)
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                        filename=log_file,
                        level=logging.DEBUG,
                        filemode='w')

    if args.outfile:
        sys.stdout = open(args.outfile, 'w')

    # An explicit models file wins; otherwise pick the default for the
    # sequencing technology.
    if args.models is not None:
        models_file = args.models
    elif args.pacbio:
        models_file = settings.PACBIO_DEFAULT_MODELS_FILE
    else:
        models_file = settings.ILLUMINA_DEFAULT_MODELS_FILE
    settings.TRAINED_MODELS_DB = models_file
    settings.TRAINED_HMMS_DIR = os.path.dirname(
        os.path.realpath(settings.TRAINED_MODELS_DB)) + '/'

    reference_vntrs = load_unique_vntrs_data()
    if args.vntr_id is not None:
        target_vntrs = [int(vid) for vid in args.vntr_id.split(',')]
    else:
        target_vntrs = [ref_vntr.id for ref_vntr in reference_vntrs]
    logging.info('Running adVNTR for %s VNTRs' % len(target_vntrs))

    analyzer = GenomeAnalyzer(reference_vntrs, target_vntrs,
                              working_directory, args.outfmt, args.haploid,
                              args.reference_filename, input_file)

    if args.pacbio:
        if input_is_alignment_file:
            analyzer.find_repeat_counts_from_pacbio_alignment_file(input_file)
        else:
            analyzer.find_repeat_counts_from_pacbio_reads(
                input_file, args.naive)
        return

    if args.frameshift:
        if valid_vntr_for_frameshift(target_vntrs):
            analyzer.find_frameshift_from_alignment_file(input_file)
        else:
            print_error(genotype_parser,
                        '--frameshift is not available for these VNTRs')
    elif input_is_alignment_file:
        analyzer.find_repeat_counts_from_alignment_file(
            input_file, average_coverage, args.update)
    else:
        analyzer.find_repeat_counts_from_short_reads(input_file)
Ejemplo n.º 11
0
    for rpkm in rpkms:
        thresholds[rpkm.split('.')[0]] = 0.0005

# Module-level switches; both are off by default.
run_permutation_test = False
bootstrapping = False

# presumably running best-so-far trackers updated elsewhere in the file —
# TODO confirm against the code that writes them
highest_fs = 0
lowest_p = 1e10

# Best-effort construction: when GeneLocations() fails, gene_locations_obj is
# simply left undefined. Only ordinary errors are swallowed, so Ctrl-C and
# SystemExit still propagate (a bare ``except:`` would have hidden them).
try:
    gene_locations_obj = GeneLocations()
except Exception:
    pass

if __name__ == '__main__':
    # Index the reference VNTRs by id when the file is run as a script.
    ref_vntrs = load_unique_vntrs_data(vntr_models_dir)
    reference_vntrs = {}
    for ref_vntr in ref_vntrs:
        reference_vntrs[ref_vntr.id] = ref_vntr


def get_average(lst):
    """Return the arithmetic mean of ``lst`` (sum divided by length).

    Raises ZeroDivisionError when ``lst`` is empty.
    """
    total = sum(lst)
    count = len(lst)
    return total / count


def get_wgs_id_to_individual_id_map():
    try:
        with open(wgs_id_gtex_id_file) as infile:
            lines = infile.readlines()
    except:
        lines = []
Ejemplo n.º 12
0
def genotype(args, genotype_parser):
    """Validate the genotype arguments, configure ``settings`` and run analysis.

    :param args: parsed command-line arguments; the attributes read are
        alignment_file, fasta, nanopore, pacbio, threads, expansion, coverage,
        working_directory, models, vntr_id, haploid, frameshift, update and
        naive
    :param genotype_parser: parser handed to ``print_error`` on invalid input

    Side effects visible in this function: sets ``settings.MAX_ERROR_RATE``,
    ``settings.CORES``, ``settings.BLAST_TMP_DIR``,
    ``settings.TRAINED_MODELS_DB`` and ``settings.TRAINED_HMMS_DIR``, and
    configures logging to ``<working_directory>log_<input basename>.log``.
    """
    if args.alignment_file is None and args.fasta is None:
        print_error(
            genotype_parser,
            'No input specified. Please specify alignment file or fasta file')

    # The nanopore and pacbio flags both select the 0.3 error rate.
    if args.nanopore:
        settings.MAX_ERROR_RATE = 0.3
    elif args.pacbio:
        settings.MAX_ERROR_RATE = 0.3
    else:
        settings.MAX_ERROR_RATE = 0.05

    if args.threads < 1:
        print_error(genotype_parser, 'threads cannot be less than 1')
    settings.CORES = args.threads

    if args.expansion and args.coverage is None:
        print_error(
            genotype_parser,
            'Please specify the average coverage to identify the expansion')
    # Coverage is only forwarded when expansion detection was requested.
    average_coverage = args.coverage if args.expansion else None

    # The alignment file takes precedence over the fasta file.
    input_file = args.alignment_file if args.alignment_file else args.fasta
    # Only names ending in 'bam' or 'sam' count as alignment files here.
    input_is_alignment_file = input_file.endswith(
        'bam') or input_file.endswith('sam')
    # Falls back to the input file's directory when no working directory
    # is given.
    working_directory = args.working_directory + '/' if args.working_directory else os.path.dirname(
        input_file) + '/'

    settings.BLAST_TMP_DIR = working_directory + settings.BLAST_TMP_RELATIVE_DIR
    log_file = working_directory + 'log_%s.log' % os.path.basename(input_file)
    log_format = '%(asctime)s %(levelname)s:%(message)s'
    logging.basicConfig(format=log_format,
                        filename=log_file,
                        level=logging.DEBUG,
                        filemode='w')

    # NOTE(review): args.models is used as-is; if it can be None the
    # realpath() call below would presumably fail — confirm the parser
    # supplies a default.
    settings.TRAINED_MODELS_DB = args.models
    settings.TRAINED_HMMS_DIR = os.path.dirname(
        os.path.realpath(settings.TRAINED_MODELS_DB)) + '/'
    reference_vntrs = load_unique_vntrs_data()
    # reference_vntrs = identify_homologous_vntrs(reference_vntrs, 'chr15')
    # Hard-coded VNTR ids used when no --vntr_id style selection is given.
    illumina_targets = [532789, 188871, 301645, 600000]

    # NOTE(review): the list built by this loop is overwritten by both
    # branches of the if/else that follows, so this filter currently has no
    # effect on the final targets — confirm whether that is intended.
    target_vntrs = []
    for i in range(len(reference_vntrs)):
        if not reference_vntrs[i].is_non_overlapping(
        ) or reference_vntrs[i].has_homologous_vntr():
            continue
        target_vntrs.append(reference_vntrs[i].id)

    if args.vntr_id is not None:
        # Comma-separated list of integer VNTR ids.
        target_vntrs = [int(vid) for vid in args.vntr_id.split(',')]
    else:
        target_vntrs = illumina_targets
    genome_analyzier = GenomeAnalyzer(reference_vntrs,
                                      target_vntrs,
                                      working_directory,
                                      is_haploid=args.haploid)
    # Dispatch on the pacbio flag, then on frameshift mode / input kind.
    if args.pacbio:
        if input_is_alignment_file:
            genome_analyzier.find_repeat_counts_from_pacbio_alignment_file(
                input_file)
        else:
            genome_analyzier.find_repeat_counts_from_pacbio_reads(
                input_file, args.naive)
    else:
        if args.frameshift:
            if valid_vntr_for_frameshift(target_vntrs):
                genome_analyzier.find_frameshift_from_alignment_file(
                    input_file)
            else:
                print_error(genotype_parser,
                            '--frameshift is not available for these VNTRs')
        elif input_is_alignment_file:
            genome_analyzier.find_repeat_counts_from_alignment_file(
                input_file, average_coverage, args.update)
        else:
            genome_analyzier.find_repeat_counts_from_short_reads(input_file)
Ejemplo n.º 13
0
# Output: Accuracy for each heterozygous scenario.

from advntr import models
from collections import defaultdict

# NOTE(review): hard-coded absolute path to the reference VNTR database; it
# will only resolve on the original author's machine.
advntr_db = '/home/jonghun/advntr_vcf/adVNTR/vntr_data/hg38_selected_VNTRs_Illumina.db'

####################################################################
# Get similar VNTR IDs for filtering
# The file is read from the current working directory, one integer id per line.
similar_vntrs_hg38 = set()
with open("similar_vntrs_hg38_maxlen1000.txt", "r") as f:
    for line in f:
        similar_vntrs_hg38.add(int(line))

# Read genotype results for those and the target VNTRs
reference_vntrs = models.load_unique_vntrs_data(advntr_db)
# Lookup table: VNTR id -> reference VNTR object
ref_vntrs = {ref_vntr.id: ref_vntr for ref_vntr in reference_vntrs}
# Targets: pattern length between 6 and 20 (inclusive) and not listed in the
# similar-VNTR file
target_vntrs = [
    ref_vntr for ref_vntr in reference_vntrs if
    6 <= len(ref_vntr.pattern) <= 20 and ref_vntr.id not in similar_vntrs_hg38
]
target_vids = set([ref_vntr.id for ref_vntr in target_vntrs])

####################################################################
# Calculate accuracy for each scenario
print(
    "This script calculates accuracies for each scenarios for adVNTR-NN and GangSTR"
)

# chromosome + start_point to vid map for GangSTR
# (defaultdict(int): looking up an unknown key yields 0)
chr_start_to_vid = defaultdict(int)