def create_illumina_genotyping_references(illumina_read_dir='../Genotyping/'):
    """Generate one ``.pacfa`` reference file per gene and repeat count.

    Covers three hard-coded VNTR ids (GP1BA, CSTB, MAOA). For every repeat
    count in a gene's range, ``create_reference_region_with_specific_repeats``
    is called with the reference VNTR, the repeat count, the output path
    ``<illumina_read_dir><gene>/<repeat>.pacfa``, the constant 3000 and the
    gene's list of repeat-unit patterns.

    :param illumina_read_dir: output root; it is concatenated directly with
        the gene name, so it is expected to end with a path separator.
    """
    from reference_vntr import load_unique_vntrs_data
    reference_vntrs = load_unique_vntrs_data()

    gene_by_vntr_id = {1220: 'GP1BA', 1221: 'CSTB', 1214: 'MAOA'}
    repeat_counts = {
        'GP1BA': range(1, 5),
        'CSTB': range(1, 16),
        'MAOA': range(1, 6),
    }
    patterns_by_gene = {
        'GP1BA': [
            'AGCCCGACCACCCCAGAGCCCACCTCAGAGCCCGCCCCC',
            'AGCCCGACCACCCCGGAGCCCACCTCAGAGCCCGCCCCC',
            'AGCCCGACCACCCCGGAGCCCACCCCAATCCCGACCATCGCCA',
        ],
        'CSTB': [
            'CGCGGGGCGGGG',
            'CGCGGGGCGGGG',
            'CGCGGGGCGGGG',
            'CGGCGGGCGGGG',
        ],
        'MAOA': [
            'ACCGGCACCGGCACCAGTACCCGCACCAGT',
            'ACCGGCACCGGCACCGAGCGCAAGGCGGAG',
            'ACCGGCACCGGCACCAGTACCCGCACCAGT',
        ],
    }

    for vntr_id, gene in gene_by_vntr_id.items():
        gene_dir = illumina_read_dir + gene + '/'
        gene_patterns = patterns_by_gene[gene]
        for repeat in repeat_counts[gene]:
            # 3000 is passed through unchanged; presumably the flanking
            # length used by the helper -- confirm against its definition.
            create_reference_region_with_specific_repeats(
                reference_vntrs[vntr_id], repeat,
                gene_dir + str(repeat) + '.pacfa', 3000, gene_patterns)
def create_illumina_copy_number_variation_references(
        illumina_read_dir='../Illumina_copy_number/'):
    """Generate one ``.fa`` reference file per gene and repeat count.

    Covers five hard-coded VNTR ids (DRD4, GP1BA, CSTB, MAOA, IL1RN). For
    every repeat count in a gene's range,
    ``create_reference_region_with_specific_repeats`` is called with the
    reference VNTR, the repeat count, the output path
    ``<illumina_read_dir><gene>/<repeat>.fa`` and the constant 149.

    :param illumina_read_dir: output root; it is concatenated directly with
        the gene name, so it is expected to end with a path separator.
    """
    from reference_vntr import load_unique_vntrs_data
    reference_vntrs = load_unique_vntrs_data()

    gene_by_vntr_id = {
        119: 'DRD4',
        1220: 'GP1BA',
        1221: 'CSTB',
        1214: 'MAOA',
        1219: 'IL1RN',
    }
    repeat_counts = {
        'DRD4': range(1, 12),
        'GP1BA': range(1, 6),
        'CSTB': range(1, 16),
        'MAOA': range(1, 6),
        'IL1RN': range(1, 10),
    }

    for vntr_id, gene in gene_by_vntr_id.items():
        gene_dir = illumina_read_dir + gene + '/'
        for repeat in repeat_counts[gene]:
            # 149 is passed through unchanged; presumably a flanking length
            # chosen for short reads -- confirm against the helper.
            create_reference_region_with_specific_repeats(
                reference_vntrs[vntr_id], repeat,
                gene_dir + str(repeat) + '.fa', 149)
def check_trio_consistency(father_file, mother_file, child_file):
    """Print a consistency summary for the genotypes of a trio.

    Genotypes are read from the three files with ``get_genotypes``. Every
    VNTR id that appears in at least one of the three results is checked
    with ``is_consistent``. For each id the number of repeat segments of the
    corresponding reference VNTR is recorded in either the consistent or
    the inconsistent list.

    Printed output, in order:
      * the total number of distinct VNTR ids,
      * for each inconsistent VNTR whose pattern length is 83 or 84, the
        last 50 characters of its left flanking region,
      * the number of inconsistent VNTRs,
      * the list of repeat-segment counts for consistent VNTRs,
      * the list of repeat-segment counts for inconsistent VNTRs.

    :param father_file: genotype file of the father.
    :param mother_file: genotype file of the mother.
    :param child_file: genotype file of the child.
    """
    father_genotypes = get_genotypes(father_file)
    mother_genotypes = get_genotypes(mother_file)
    child_genotypes = get_genotypes(child_file)

    from reference_vntr import load_unique_vntrs_data
    reference_vntrs = load_unique_vntrs_data()

    # list(...) makes the concatenation valid on Python 3 as well, where
    # dict.keys() returns a view that does not support "+". The insertion
    # order into the set is the same as before.
    vntr_ids = set(
        list(father_genotypes.keys()) + list(mother_genotypes.keys()) +
        list(child_genotypes.keys()))
    print('Total vntrs: %s' % len(vntr_ids))

    inconsistents = []
    consistents = []
    for vid in vntr_ids:
        reference_vntr = reference_vntrs[vid]
        if is_consistent(vid, father_genotypes, mother_genotypes,
                         child_genotypes):
            consistents.append(len(reference_vntr.get_repeat_segments()))
            continue

        inconsistents.append(len(reference_vntr.get_repeat_segments()))
        # Only patterns of length 83 or 84 are reported in detail.
        if 83 <= len(reference_vntr.pattern) < 85:
            # Function-call form prints the same text on Python 2 and is
            # consistent with the other print calls in this function.
            print(reference_vntr.left_flanking_region[-50:])

    print('Total inconsistencies: %s' % len(inconsistents))
    print(consistents)
    print(inconsistents)
# Example #4
# 0
def get_pacbio_comparison_result():
    """Compare read recruitment of several methods on PacBio simulations.

    For every entry under ``../Pacbio_copy_number/`` (one directory per
    gene) and every ``*30x.fastq.sam`` file in it:

      * the copy number is parsed from the second-to-last ``_``-separated
        field of the file name,
      * the SAM file is converted with ``make_bam_and_index`` and the reads
        are re-aligned with ``bwasw_alignment`` and ``blasr_alignment``,
      * the reads considered true (``get_pacbio_true_read_ids``) are
        compared with the reads each aligner mapped to the VNTR,
      * the count from ``get_out_pacbio_filtered_counts`` is recorded for
        both the "filtering" and the "selection" column.

    A ``result.txt`` is written into each gene directory with one line per
    copy number, sorted ascending, of the form
    ``copies true our_filtering our_selection bwasw_tp blasr_tp``.
    """
    reference_vntrs = load_unique_vntrs_data()
    id_to_gene = {1221: 'CSTB', 1216: 'HIC1', 1215: 'INS'}
    # Reverse map so the gene -> id lookup is done once per directory
    # instead of scanning id_to_gene for every file.
    gene_to_id = dict((gene, vid) for vid, gene in id_to_gene.items())

    for gene_dir in glob.glob('../Pacbio_copy_number/*'):
        print(gene_dir)
        gene_name = gene_dir.split('/')[-1]
        # None when the directory name is not a known gene; as before, the
        # failure then surfaces when reference_vntrs is indexed below.
        vntr_id = gene_to_id.get(gene_name)

        mapped_reads = {}
        for file_name in glob.glob(gene_dir + '/*30x.fastq.sam'):
            copies = int(file_name.split('_')[-2])
            make_bam_and_index(file_name)
            base_name = file_name[:-4]  # drop the ".sam" suffix
            original_bam = base_name + '.bam'
            bwasw_alignment(base_name)
            blasr_alignment(base_name)
            # The aligner outputs are named after base_name minus its last
            # three characters (".fastq" -> ".fa").
            bwasw_alignment_file = base_name[:-3] + '_bwasw_aln.bam'
            blasr_alignment_file = base_name[:-3] + '_blasr_aln.bam'

            reference_vntr = reference_vntrs[vntr_id]
            ref_length = copies * len(reference_vntr.pattern) + 2000
            true_ids = get_pacbio_true_read_ids(original_bam, reference_vntr,
                                                ref_length)
            blasr_ids = get_id_of_reads_mapped_to_vntr_in_bamfile(
                blasr_alignment_file, reference_vntr)
            bwasw_ids = get_id_of_reads_mapped_to_vntr_in_bamfile(
                bwasw_alignment_file, reference_vntr)
            blasr_tp = [
                read_id for read_id in blasr_ids if read_id in true_ids
            ]
            bwasw_tp = [
                read_id for read_id in bwasw_ids if read_id in true_ids
            ]

            vntr_finder = VNTRFinder(reference_vntr)
            our_filtering = get_out_pacbio_filtered_counts(
                base_name, vntr_finder)
            # The same count is reported in both columns.
            our_selection = our_filtering
            mapped_reads[copies] = [
                len(true_ids), our_filtering, our_selection,
                len(bwasw_tp),
                len(blasr_tp)
            ]

        with open(gene_dir + '/result.txt', 'w') as out:
            # sorted(dict) iterates the keys on both Python 2 and 3;
            # dict.iterkeys() no longer exists on Python 3.
            for copies in sorted(mapped_reads):
                (original, our_filtering, our_selection, bwasw,
                 blasr) = mapped_reads[copies]
                out.write('%s %s %s %s %s %s\n' %
                          (copies, original, our_filtering, our_selection,
                           bwasw, blasr))
# Example #5
# 0
def get_illumina_comparison_result():
    """Compare read recruitment of several methods on Illumina simulations.

    For every entry under ``../Illumina_copy_number/`` (one directory per
    gene) and every ``*30x.sam`` file in it:

      * the copy number is parsed from the second-to-last ``_``-separated
        field of the file name,
      * the SAM file is converted with ``make_bam_and_index`` and the
        matching ``.fq`` file is aligned with ``bowtie_alignment`` and
        ``bwamem_alignment``,
      * the read count of the original BAM, the counts returned by
        ``get_our_filtered_reads_count`` / ``get_our_selected_reads_count``
        and the reads each aligner mapped to the VNTR are recorded.

    A ``result.txt`` is written into each gene directory with one line per
    copy number, sorted ascending, of the form
    ``copies original our_filtering our_selection bwa bowtie``.
    """
    reference_vntrs = load_unique_vntrs_data()
    id_to_gene = {
        119: 'DRD4',
        1220: 'GP1BA',
        1221: 'CSTB',
        1214: 'MAOA',
        1219: 'IL1RN'
    }
    # Reverse map so the gene -> id lookup is done once per directory
    # instead of scanning id_to_gene for every file.
    gene_to_id = dict((gene, vid) for vid, gene in id_to_gene.items())

    for gene_dir in glob.glob('../Illumina_copy_number/*'):
        print(gene_dir)
        files = glob.glob(gene_dir + '/*30x.sam')
        gene_name = gene_dir.split('/')[-1]
        print(len(files))
        # None when the directory name is not a known gene; as before, the
        # failure then surfaces when reference_vntrs is indexed below.
        vntr_id = gene_to_id.get(gene_name)

        mapped_reads = {}
        for file_name in files:
            # Converted up front so a malformed file name fails before the
            # aligners are run rather than after.
            copies = int(file_name.split('_')[-2])
            make_bam_and_index(file_name)
            base_name = file_name[:-4]  # drop the ".sam" suffix
            fastq_file = base_name + '.fq'
            bowtie_bam = bowtie_alignment(fastq_file)
            bwa_bam = bwamem_alignment(fastq_file)
            original_bam = base_name + '.bam'

            reference_vntr = reference_vntrs[vntr_id]
            vntr_finder = VNTRFinder(reference_vntr)
            original = count_reads(original_bam)
            our_selection = get_our_selected_reads_count(
                fastq_file, vntr_finder)
            our_filtering = get_our_filtered_reads_count(
                fastq_file, vntr_finder)
            bwa = count_reads_mapped_to_vntr_in_bamfile(
                bwa_bam, reference_vntr)
            bowtie = count_reads_mapped_to_vntr_in_bamfile(
                bowtie_bam, reference_vntr)
            mapped_reads[copies] = [
                original, our_filtering, our_selection, bwa, bowtie
            ]

        with open(gene_dir + '/result.txt', 'w') as out:
            # sorted(dict) iterates the keys on both Python 2 and 3;
            # dict.iterkeys() no longer exists on Python 3.
            for copies in sorted(mapped_reads):
                (original, our_filtering, our_selection, bwa,
                 bowtie) = mapped_reads[copies]
                out.write('%s %s %s %s %s %s\n' %
                          (copies, original, our_filtering, our_selection, bwa,
                           bowtie))
def create_pacbio_copy_number_variation_references(
        pacbio_read_dir='../pacbio_recruitment/set1/'):
    """Generate ``.fa`` references for CSTB, HIC1 and INS at many copy numbers.

    Repeat counts are taken from a per-gene range; for INS only multiples of
    5 are generated and for CSTB only even counts. Each selected count is
    passed to ``create_reference_region_with_specific_repeats`` together
    with the output path ``<pacbio_read_dir><gene>/<repeat>.fa`` and the
    constant 1000.

    :param pacbio_read_dir: output root; it is concatenated directly with
        the gene name, so it is expected to end with a path separator.
    """
    from reference_vntr import load_unique_vntrs_data
    reference_vntrs = load_unique_vntrs_data()

    gene_by_vntr_id = {1221: 'CSTB', 1216: 'HIC1', 1215: 'INS'}
    repeat_ranges = {
        'CSTB': range(1, 69),
        'HIC1': range(2, 36),
        'INS': range(10, 171),
    }
    # Genes listed here only get repeat counts divisible by the given
    # value; every count is used for the others.
    required_divisor = {'INS': 5, 'CSTB': 2}

    for vntr_id, gene in gene_by_vntr_id.items():
        divisor = required_divisor.get(gene, 1)
        selected = [r for r in repeat_ranges[gene] if r % divisor == 0]
        for repeat in selected:
            target = pacbio_read_dir + gene + '/' + str(repeat) + '.fa'
            create_reference_region_with_specific_repeats(
                reference_vntrs[vntr_id], repeat, target, 1000)
def create_pacbio_coverage_data_for_3_genes_and_10_cn(
        pacbio_read_dir='../pacbio_coverage_experiment/'):
    """Generate ``.fa`` references for the PacBio coverage experiment.

    Ranges and divisibility rules are declared for CSTB, HIC1 and INS, but
    an explicit filter restricts generation to INS, so only INS repeat
    counts in ``range(10, 110)`` that are multiples of 5 produce a file.
    Each is passed to ``create_reference_region_with_specific_repeats``
    with the output path ``<pacbio_read_dir><gene>/<repeat>.fa`` and the
    constant 3000.

    :param pacbio_read_dir: output root; it is concatenated directly with
        the gene name, so it is expected to end with a path separator.
    """
    from reference_vntr import load_unique_vntrs_data
    reference_vntrs = load_unique_vntrs_data()

    gene_by_vntr_id = {1221: 'CSTB', 1216: 'HIC1', 1215: 'INS'}
    repeat_ranges = {
        'CSTB': range(2, 42),
        'HIC1': range(2, 22),
        'INS': range(10, 110),
    }
    # Genes listed here only get repeat counts divisible by the given value.
    required_divisor = {'INS': 5, 'CSTB': 2}

    for vntr_id, gene in gene_by_vntr_id.items():
        # Only INS is generated; the other genes are skipped entirely.
        if gene != 'INS':
            continue
        divisor = required_divisor.get(gene, 1)
        for repeat in repeat_ranges[gene]:
            if repeat % divisor != 0:
                continue
            target = pacbio_read_dir + gene + '/' + str(repeat) + '.fa'
            create_reference_region_with_specific_repeats(
                reference_vntrs[vntr_id], repeat, target, 3000)
def create_pacbio_ru_length_data_for_all_vntrs(
        pacbio_read_dir='../pacbio_ru_data_for_all_vntrs/'):
    """Generate references grouped by repeat-unit length for many VNTRs.

    VNTR ids listed in the first column of ``vntr_complex.txt`` (read from
    the current working directory), plus id 0, are excluded. The remaining
    ids ``0 .. len(reference_vntrs) - 1`` are grouped by the length of
    their pattern, keeping at most the first four ids per length. Groups
    with fewer than two ids are ignored.

    For each kept VNTR, repeat counts in the 21-value window starting at
    ``max(3, original_repeats - 10)`` that are multiples of 5 are passed to
    ``create_reference_region_with_specific_repeats`` with the output path
    ``<pacbio_read_dir><ru>/vntr_id_<id>_<repeat>.fa`` and the constant
    1000. Missing output directories are created.

    :param pacbio_read_dir: output root; it is concatenated directly with
        the repeat-unit length, so it is expected to end with a path
        separator.
    """
    import os
    from reference_vntr import load_unique_vntrs_data
    reference_vntrs = load_unique_vntrs_data()

    # A set gives O(1) membership tests in the per-VNTR loop below.
    complex_vntrs = set([0])
    with open('vntr_complex.txt') as infile:
        for line in infile:
            fields = line.split()
            # Blank lines (e.g. a trailing newline at end of file) used to
            # raise IndexError; they are skipped now.
            if fields:
                complex_vntrs.add(int(fields[0]))

    repeat_units = {}
    # reference_vntrs is indexed by integer id throughout this file, so
    # positions are iterated rather than the container itself.
    for vntr_id in range(len(reference_vntrs)):
        if vntr_id in complex_vntrs:
            continue
        ru = len(reference_vntrs[vntr_id].pattern)
        group = repeat_units.setdefault(ru, [])
        if len(group) < 4:
            group.append(vntr_id)

    for ru, group in repeat_units.items():
        if len(group) < 2:
            continue
        for vntr_id in group:
            original_repeats = len(
                reference_vntrs[vntr_id].get_repeat_segments())
            start = max(3, original_repeats - 10)
            for repeat in range(start, start + 21):
                if repeat % 5 != 0:
                    continue
                outfile = pacbio_read_dir + str(ru) + '/vntr_id_' + str(
                    vntr_id) + '_' + str(repeat) + '.fa'
                out_dir = os.path.dirname(outfile)
                if not os.path.exists(out_dir):
                    os.makedirs(out_dir)
                create_reference_region_with_specific_repeats(
                    reference_vntrs[vntr_id], repeat, outfile, 1000)
def find_info_by_mapping(sim_dir='simulation_data/', dir_index=0):
    """Collect true reads, HMM scores and aligner statistics for a simulation.

    One sub-directory of ``sim_dir`` is selected by ``dir_index`` (in the
    order returned by ``glob``). The gene is taken from the part of that
    directory's name before the first ``_``. For every file in it ending
    with ``WGS_30x.fasta``:

      * the true read ids are loaded from ``<stem>_true_reads.txt`` or, if
        that file is missing, computed from ``<stem>.sam`` and saved,
      * HMM scores are written via ``write_hmm_scores`` unless the
        true-read score file already exists,
      * BWA (parameter 10) and Bowtie (parameter -2) are run, and their
        positive and false-negative reads are saved, unless both output
        files for that aligner already exist.

    ``clean_up_tmp`` is called once at the start and after every aligner
    run that was not skipped.

    :param sim_dir: directory holding one sub-directory per simulation.
    :param dir_index: index into the globbed list of sub-directories; also
        embedded in the temporary alignment file names.
    """
    reference_vntrs = load_unique_vntrs_data()
    id_to_gene = {119: 'DRD4', 1220: 'GP1BA', 1221: 'CSTB', 1214: 'MAOA', 1219: 'IL1RN'}
    # NOTE(review): there is no 'IL1RN' entry, so an IL1RN simulation
    # without a cached true-reads file raises KeyError below -- confirm
    # whether that gene is expected here.
    gene_to_length = {'DRD4': 528, 'GP1BA': 39, 'CSTB': 168, 'MAOA': 30}
    gene_to_id = {gene: vid for vid, gene in id_to_gene.items()}

    clean_up_tmp()
    simulation_dir = glob.glob(sim_dir + '/*')[dir_index]

    for fasta_file in glob.glob(simulation_dir + '/*'):
        if not fasta_file.endswith('WGS_30x.fasta'):
            continue

        stem = fasta_file[:-6]  # path without the ".fasta" suffix
        gene_name = simulation_dir.split('/')[-1].split('_')[0]
        # None for an unknown gene; indexing reference_vntrs then fails.
        ref_vntr = reference_vntrs[gene_to_id.get(gene_name)]

        true_reads_file = stem + '_true_reads.txt'
        simulated_sam_file = stem + '.sam'
        if os.path.exists(true_reads_file):
            with open(true_reads_file) as reads_in:
                stripped = [line.strip() for line in reads_in.readlines()]
            true_reads = [read_id for read_id in stripped if read_id != '']
        else:
            region_start = ref_vntr.start_point
            region = [region_start, region_start + gene_to_length[gene_name]]
            true_reads = get_id_of_reads_mapped_to_vntr_in_samfile(
                simulated_sam_file, ref_vntr, region=region)
            with open(true_reads_file, 'w') as reads_out:
                for true_read in true_reads:
                    reads_out.write('%s\n' % true_read)

        true_reads_hmm_scores = stem + '_t_reads_hmm_score.txt'
        false_reads_hmm_scores = stem + '_f_reads_hmm_score.txt'
        if not os.path.exists(true_reads_hmm_scores):
            write_hmm_scores(simulated_sam_file, true_reads_hmm_scores,
                             false_reads_hmm_scores, ref_vntr, true_reads)

        # BWA: a single parameter value is evaluated.
        for i, parameter in enumerate([10]):
            positive_file = stem + '_bwa_%s_positive_supplementary_reads.txt' % abs(parameter)
            false_negative_file = stem + '_bwa_%s_fn_supplementary_reads.txt' % abs(parameter)
            already_done = (os.path.exists(positive_file) and
                            os.path.exists(false_negative_file))
            if not already_done:
                bwa_alignment_file = '/tmp/_gene%s_' % dir_index + 'bwa_alignment_%s.sam' % i
                bwa_alignment(fasta_file, bwa_alignment_file, parameter)
                positive_reads, fn_reads = get_positive_and_fn_reads_from_samfile(
                    bwa_alignment_file, ref_vntr, true_reads)
                save_reads_stat(positive_file, positive_reads)
                save_reads_stat(false_negative_file, fn_reads)

                clean_up_tmp()

        # Bowtie: the first parameter (-0.6) is skipped on purpose, so only
        # index 1 (-2) is evaluated.
        for i, parameter in enumerate([-0.6, -2]):
            if i == 0:
                continue
            positive_file = stem + '_bowtie_%s_positive_supplementary_reads.txt' % abs(parameter)
            false_negative_file = stem + '_bowtie_%s_fn_supplementary_reads.txt' % abs(parameter)
            already_done = (os.path.exists(positive_file) and
                            os.path.exists(false_negative_file))
            if not already_done:
                bowtie_alignment_file = '/tmp/_gene%s_' % dir_index + 'bowtie_alignment_%s.sam' % i
                bowtie_alignment(fasta_file, bowtie_alignment_file, parameter)
                positive_reads, fn_reads = get_positive_and_fn_reads_from_samfile(
                    bowtie_alignment_file, ref_vntr, true_reads)
                save_reads_stat(positive_file, positive_reads)
                save_reads_stat(false_negative_file, fn_reads)
                if gene_name == 'MAOA':
                    # The MAOA alignment is copied out before the temporary
                    # files are cleaned up.
                    os.system('cp %s /pedigree2/projects/VeNTeR/bowtie_alignment_%s.sam' % (bowtie_alignment_file, i))

                clean_up_tmp()
# Example #10
# 0
def genotype(args, genotype_parser):
    """Run VNTR genotyping for the parsed command-line arguments.

    Steps, in order:
      1. Report an error through ``print_error`` when neither an alignment
         file nor a fasta file was given, or when ``args.threads`` < 1.
      2. Set ``settings.MAX_ERROR_RATE`` (0.3 for nanopore or pacbio input,
         0.05 otherwise) and ``settings.CORES``.
      3. Configure logging to ``log_<input basename>.log`` inside the
         working directory (``args.working_directory`` if given, otherwise
         the directory of the input file).
      4. Build a ``GenomeAnalyzer`` for the ids in ``args.vntr_id``
         (comma-separated) or, when that is ``None``, for a hard-coded
         default list, and dispatch to the matching analysis method
         depending on ``args.pacbio``, ``args.frameshift`` and whether the
         input is a bam/sam file.

    :param args: parsed arguments; the attributes read are
        ``alignment_file``, ``fasta``, ``nanopore``, ``pacbio``,
        ``threads``, ``working_directory``, ``vntr_id``, ``naive`` and
        ``frameshift``.
    :param genotype_parser: parser used for error reporting.
    """
    if args.alignment_file is None and args.fasta is None:
        # The code below assumes an input file exists, so print_error is
        # presumably terminating the program -- confirm in its definition.
        print_error(
            genotype_parser,
            'ERROR: No input specified. Please specify alignment file or fasta file'
        )

    # Both long-read technologies use the same, more tolerant error rate.
    if args.nanopore or args.pacbio:
        settings.MAX_ERROR_RATE = 0.3
    else:
        settings.MAX_ERROR_RATE = 0.05

    if args.threads < 1:
        print_error(genotype_parser, 'ERROR: threads cannot be less than 1')
    settings.CORES = args.threads

    input_file = args.alignment_file if args.alignment_file else args.fasta
    input_is_alignment_file = input_file.endswith(('bam', 'sam'))
    if args.working_directory:
        working_directory = args.working_directory + '/'
    else:
        # dirname() is '' for a bare file name; without the fallback the
        # working directory became '/' and the log was written to the
        # filesystem root instead of the current directory.
        working_directory = (os.path.dirname(input_file) or '.') + '/'

    log_file = working_directory + 'log_%s.log' % os.path.basename(input_file)
    log_format = '%(asctime)s %(levelname)s:%(message)s'
    logging.basicConfig(format=log_format,
                        filename=log_file,
                        level=logging.DEBUG,
                        filemode='w')

    reference_vntrs = load_unique_vntrs_data()
    # reference_vntrs = identify_homologous_vntrs(reference_vntrs, 'chr15')
    # illumina_targets = [1214, 1220, 1221, 1222, 1223, 1224, 377, 378, 809]
    illumina_targets = [532789, 188871, 301645, 600000]

    # NOTE(review): a loop that collected every non-overlapping,
    # non-homologous VNTR used to run here, but its result was always
    # overwritten by the selection below, so it has been removed.
    if args.vntr_id is not None:
        target_vntrs = [int(vid) for vid in args.vntr_id.split(',')]
    else:
        target_vntrs = illumina_targets

    genome_analyzer = GenomeAnalyzer(reference_vntrs, target_vntrs,
                                     working_directory)
    if args.pacbio:
        if input_is_alignment_file:
            genome_analyzer.find_repeat_counts_from_pacbio_alignment_file(
                input_file)
        else:
            genome_analyzer.find_repeat_counts_from_pacbio_reads(
                input_file, args.naive)
    elif args.frameshift:
        if valid_vntr_for_frameshift(target_vntrs):
            genome_analyzer.find_frameshift_from_alignment_file(input_file)
        else:
            genotype_parser.error(
                '--frameshift is only available for these IDs: %s' %
                settings.FRAMESHIFT_VNTRS)
    elif input_is_alignment_file:
        genome_analyzer.find_repeat_counts_from_alignment_file(input_file)
    else:
        genome_analyzer.find_repeat_counts_from_short_reads(input_file)