def main():
    """Run ``run_simulation`` for an inclusive, 1-based slice of VNTR ids.

    When the ``hg38`` flag (defined outside this function) is truthy, the
    VNTRs are loaded from ``vntr_data/hg38_selected_VNTRs_Illumina.db`` and
    only ids whose pattern length lies in [6, 100] are selected. Otherwise
    the default database is loaded and the ids come from
    ``get_tested_vntrs()``.

    The slice bounds are read from ``sys.argv[2]`` (first position) and
    ``sys.argv[3]`` (last position); positions count from 1.
    """
    vntr_map = {}
    if hg38:
        reference_vntrs = load_unique_vntrs_data(
            'vntr_data/hg38_selected_VNTRs_Illumina.db')
        vntr_ids = []
        for ref_vntr in reference_vntrs:
            vntr_map[ref_vntr.id] = ref_vntr
            if 6 <= len(ref_vntr.pattern) <= 100:
                vntr_ids.append(ref_vntr.id)
    else:
        reference_vntrs = load_unique_vntrs_data()
        vntr_map.update(
            (ref_vntr.id, ref_vntr) for ref_vntr in reference_vntrs)
        from advntr.advntr_commands import get_tested_vntrs
        vntr_ids = get_tested_vntrs()

    print('len of reference_vntrs:', len(reference_vntrs))
    print('# of vntrs: %s' % len(vntr_ids))

    first, last = int(sys.argv[2]), int(sys.argv[3])
    # To debug a single locus, call run_simulation(vntr_map, <vntr_id>)
    # here and exit before the loop.
    for position, vid in enumerate(vntr_ids, 1):
        if first <= position <= last:
            run_simulation(vntr_map, vid)
def create_illumina_genotyping_references(illumina_read_dir='../Genotyping/'):
    """Write one ``.pacfa`` reference per (gene, repeat count) pair.

    For each of the three hard-coded VNTR ids, a reference region is created
    for every repeat count in that gene's range by calling
    ``create_reference_region_with_specific_repeats`` with a flanking
    argument of 3000 and the gene's list of repeat patterns.

    :param illumina_read_dir: output prefix; files are written to
        ``<illumina_read_dir><GENE>/<repeat>.pacfa``
    """
    reference_vntrs = load_unique_vntrs_data()
    id_to_gene = {1220: 'GP1BA', 1221: 'CSTB', 1214: 'MAOA'}
    repeats = {
        'GP1BA': range(1, 5),
        'CSTB': range(1, 16),
        'MAOA': range(1, 6),
    }
    repeats_patterns = {
        'GP1BA': [
            'AGCCCGACCACCCCAGAGCCCACCTCAGAGCCCGCCCCC',
            'AGCCCGACCACCCCGGAGCCCACCTCAGAGCCCGCCCCC',
            'AGCCCGACCACCCCGGAGCCCACCCCAATCCCGACCATCGCCA',
        ],
        'CSTB': [
            'CGCGGGGCGGGG', 'CGCGGGGCGGGG', 'CGCGGGGCGGGG', 'CGGCGGGCGGGG'
        ],
        'MAOA': [
            'ACCGGCACCGGCACCAGTACCCGCACCAGT',
            'ACCGGCACCGGCACCGAGCGCAAGGCGGAG',
            'ACCGGCACCGGCACCAGTACCCGCACCAGT',
        ],
    }

    for vntr_id, gene in id_to_gene.items():
        gene_dir = illumina_read_dir + gene + '/'
        gene_patterns = repeats_patterns[gene]
        for repeat in repeats[gene]:
            outfile = gene_dir + str(repeat) + '.pacfa'
            create_reference_region_with_specific_repeats(
                reference_vntrs[vntr_id], repeat, outfile, 3000,
                gene_patterns)
def generate_pairwise_aln(log_file,
                          aln_file,
                          ref_vntr_db=None,
                          vntr_ids=None,
                          sort_by_repeat=True):
    """
    Generate pairwise alignment for each spanning reads

    ``log_file`` may be a single log file or a directory. For a directory,
    every file matching ``log_*.log`` inside it is processed and each output
    is named after its log file, so ``aln_file`` must be None in that mode.

    :param log_file: a log file or a directory
    :param aln_file: output file name. If None, the name is derived from the
        log file: its base name up to the first '.', plus '.aln' (no
        directory component). For a single log file an explicit name is
        used as given.
    :param ref_vntr_db: reference VNTR database
    :param vntr_ids: VNTR id list that you want to generate alignment
    :param sort_by_repeat: if True, the reads will be sorted by the number of repeats
    :raises SystemExit: with status -1 when ``log_file`` is a directory and
        ``aln_file`` is not None
    """
    # Load reference VNTRs
    reference_vntrs = load_unique_vntrs_data(ref_vntr_db)
    ref_vntrs = {ref_vntr.id: ref_vntr for ref_vntr in reference_vntrs}

    def default_aln_name(log_path):
        # e.g. "dir/log_sample.bam.log" -> "log_sample.aln"
        return os.path.basename(log_path).split(".")[0] + ".aln"

    if os.path.isdir(log_file):
        # One output per log file: a single explicit name cannot be honoured.
        # Checked once up front instead of on every loop iteration.
        if aln_file is not None:
            print(
                "ERROR: If log file is given as a directory, output name should be None"
            )
            raise SystemExit(-1)
        for lf in glob.glob(log_file + "/log_*.log"):
            _generate_pairwise_aln(lf, default_aln_name(lf), ref_vntrs,
                                   vntr_ids, sort_by_repeat)
    else:
        # Previously an explicit aln_file left out_file unbound here, so the
        # call below raised UnboundLocalError.
        if aln_file is None:
            out_file = default_aln_name(log_file)
        else:
            out_file = aln_file
        _generate_pairwise_aln(log_file, out_file, ref_vntrs, vntr_ids,
                               sort_by_repeat)
def get_flakning_region_error_rate(log_file, out_file, ref_vntr_db, vntr_ids):
    """Write per-VNTR flanking-region accuracy to ``out_file``.

    Error and base-pair counts come from ``_get_flanking_region_error_rate``.
    If ``log_file`` is a directory, the counts of every ``log_*.log`` file in
    it are summed; otherwise the counts of the single log file are used.

    One line is written per VNTR id::

        VID:<id> REFRC:<estimated_repeats> <repeat>:<left>/<right> ...

    where ``<left>`` and ``<right>`` are ``1 - errors / bases`` for that
    flank, formatted with two decimals. A flank with no counted bases is
    written as ``nan`` instead of raising ZeroDivisionError.

    :param log_file: a log file or a directory containing ``log_*.log`` files
    :param out_file: path of the report to write
    :param ref_vntr_db: reference VNTR database passed to
        ``load_unique_vntrs_data``
    :param vntr_ids: VNTR ids forwarded to ``_get_flanking_region_error_rate``
    """
    # Load reference VNTRs
    reference_vntrs = load_unique_vntrs_data(ref_vntr_db)
    ref_vntrs = {ref_vntr.id: ref_vntr for ref_vntr in reference_vntrs}

    def accuracy(err_count, bp_count):
        # No bases counted for this flank: the ratio is undefined.
        if not bp_count:
            return float('nan')
        return 1 - float(err_count) / bp_count

    total_errcount = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    total_bpcount = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    if os.path.isdir(log_file):
        for lf in glob.glob(log_file + "/log_*.log"):
            errcount_dict, bpcount_dict = _get_flanking_region_error_rate(
                lf, ref_vntrs, vntr_ids)
            for vid, bp_by_repeat in bpcount_dict.items():
                for repeat_count, bp_by_flank in bp_by_repeat.items():
                    for flanking, bp_count in bp_by_flank.items():
                        total_errcount[vid][repeat_count][flanking] += (
                            errcount_dict[vid][repeat_count][flanking])
                        total_bpcount[vid][repeat_count][flanking] += bp_count
    else:
        total_errcount, total_bpcount = _get_flanking_region_error_rate(
            log_file, ref_vntrs, vntr_ids)

    with open(out_file, "w") as of:
        for vid, bp_by_repeat in total_bpcount.items():
            of.write("VID:{} ".format(vid))
            of.write("REFRC:{} ".format(ref_vntrs[vid].estimated_repeats))
            # .get() keeps lookups safe for both defaultdicts and plain
            # dicts, and does not insert entries as a side effect.
            err_by_repeat = total_errcount.get(vid, {})
            for repeat_count in sorted(bp_by_repeat):
                bp_by_flank = bp_by_repeat[repeat_count]
                err_by_flank = err_by_repeat.get(repeat_count, {})
                of.write("{}:".format(repeat_count))
                of.write("{:.2f}/{:.2f} ".format(
                    accuracy(err_by_flank.get('left', 0),
                             bp_by_flank.get('left', 0)),
                    accuracy(err_by_flank.get('right', 0),
                             bp_by_flank.get('right', 0))))
            of.write("\n")
def create_pacbio_copy_number_variation_references(
        pacbio_read_dir='../pacbio_recruitment/set1/'):
    """Write ``.fa`` references over a range of repeat counts for three genes.

    For each hard-coded VNTR id, a reference is created with
    ``create_reference_region_with_specific_repeats`` (flanking argument
    1000) for the sampled repeat counts: every 5th count for INS, every
    2nd for CSTB, and every count for HIC1.

    :param pacbio_read_dir: output prefix; files are written to
        ``<pacbio_read_dir><GENE>/<repeat>.fa``
    """
    reference_vntrs = load_unique_vntrs_data()
    id_to_gene = {1221: 'CSTB', 1216: 'HIC1', 1215: 'INS'}
    repeats = {
        'CSTB': range(1, 69),
        'HIC1': range(2, 36),
        'INS': range(10, 171),
    }
    # Only repeat counts divisible by the gene's step are generated;
    # genes not listed here use every count in their range.
    sampling_step = {'INS': 5, 'CSTB': 2}

    for vntr_id, gene in id_to_gene.items():
        step = sampling_step.get(gene, 1)
        for repeat in repeats[gene]:
            if repeat % step != 0:
                continue
            outfile = ''.join([pacbio_read_dir, gene, '/', str(repeat), '.fa'])
            create_reference_region_with_specific_repeats(
                reference_vntrs[vntr_id], repeat, outfile, 1000)
def view_model(args, viewmodel_parser):
    """Print the reference VNTR models selected by gene and/or pattern.

    ``args.pattern`` (optional) must consist of A, C, G, T only, compared
    case-insensitively; otherwise ``print_error`` is called on
    ``viewmodel_parser``. ``args.gene`` is a comma-separated list of gene
    names that is upper-cased before matching; an empty list disables the
    gene filter. Matching models are passed to ``print_models``.
    """
    valid_characters = {'A', 'C', 'G', 'T'}
    wanted_pattern = args.pattern.upper() if args.pattern else None
    if wanted_pattern is not None:
        # One error report per distinct invalid character.
        for _ in set(wanted_pattern) - valid_characters:
            print_error(viewmodel_parser,
                        'Pattern should only contain A, C, G, T')

    genes = [name.upper() for name in args.gene.split(',') if name]
    reference_vntrs = load_unique_vntrs_data()

    results = [
        ref_vntr for ref_vntr in reference_vntrs
        if (not genes or ref_vntr.gene_name in genes) and
        (wanted_pattern is None or ref_vntr.pattern == wanted_pattern)
    ]
    print_models(results)
def create_pacbio_coverage_data_for_3_genes_and_10_cn(
        pacbio_read_dir='../pacbio_coverage_experiment/'):
    """Write ``.fa`` references for the coverage experiment.

    Three genes are configured, but only INS is currently generated: every
    other gene is skipped, so the CSTB and HIC1 ranges below are unused.
    For INS, every repeat count in its range that is divisible by 5 gets a
    reference from ``create_reference_region_with_specific_repeats``
    (flanking argument 3000).

    :param pacbio_read_dir: output prefix; files are written to
        ``<pacbio_read_dir><GENE>/<repeat>.fa``
    """
    reference_vntrs = load_unique_vntrs_data()
    id_to_gene = {1221: 'CSTB', 1216: 'HIC1', 1215: 'INS'}
    repeats = {
        'CSTB': range(2, 42),
        'HIC1': range(2, 22),
        'INS': range(10, 110),
    }

    for vntr_id, gene in id_to_gene.items():
        # Every non-INS gene is skipped, which makes any per-gene sampling
        # rule for CSTB or HIC1 irrelevant here.
        if gene != 'INS':
            continue
        gene_dir = pacbio_read_dir + gene + '/'
        for repeat in repeats[gene]:
            if repeat % 5:
                continue
            outfile = gene_dir + str(repeat) + '.fa'
            create_reference_region_with_specific_repeats(
                reference_vntrs[vntr_id], repeat, outfile, 3000)
def create_pacbio_ru_length_data_for_all_vntrs(
        pacbio_read_dir='../pacbio_ru_data_for_all_vntrs/'):
    """Write ``.fa`` references grouped by repeat-unit (pattern) length.

    VNTR indices listed in ``vntr_complex.txt`` (first whitespace-separated
    field of each line), plus index 0, are excluded. The remaining indices
    are grouped by ``len(pattern)``, keeping at most the first four per
    length, and lengths with fewer than two VNTRs are dropped. For each kept
    VNTR, references are created for every repeat count divisible by 5 in
    ``[start, start + 20]``, where ``start = max(3, original_repeats - 10)``.

    :param pacbio_read_dir: output prefix; files are written to
        ``<pacbio_read_dir><ru>/vntr_id_<index>_<repeat>.fa`` and the
        per-length directory is created if missing
    """
    import os

    reference_vntrs = load_unique_vntrs_data()

    # A set keeps the membership test in the loop below O(1) per VNTR.
    # Blank lines are skipped instead of crashing on split()[0].
    with open('vntr_complex.txt') as infile:
        complex_vntrs = {
            int(line.split()[0]) for line in infile if line.strip()
        }
    complex_vntrs.add(0)

    repeat_units = {}
    for vntr_id in range(len(reference_vntrs)):
        if vntr_id in complex_vntrs:
            continue
        ru = len(reference_vntrs[vntr_id].pattern)
        same_length = repeat_units.setdefault(ru, [])
        if len(same_length) < 4:
            same_length.append(vntr_id)

    for ru, ids_for_length in repeat_units.items():
        if len(ids_for_length) < 2:
            continue
        # The directory depends only on the repeat-unit length, so create it
        # once per length instead of checking before every file.
        ru_dir = pacbio_read_dir + str(ru)
        if not os.path.exists(ru_dir):
            os.makedirs(ru_dir)
        for vntr_id in ids_for_length:
            original_repeats = len(
                reference_vntrs[vntr_id].get_repeat_segments())
            start = max(3, original_repeats - 10)
            for repeat in range(start, start + 21):
                if repeat % 5 != 0:
                    continue
                outfile = ru_dir + '/vntr_id_' + str(vntr_id) + '_' + str(
                    repeat) + '.fa'
                create_reference_region_with_specific_repeats(
                    reference_vntrs[vntr_id], repeat, outfile, 1000)
def create_illumina_copy_number_variation_references(
        illumina_read_dir='../Illumina_copy_number/'):
    """Write one ``.fa`` reference per (gene, repeat count) pair.

    For each of the five hard-coded VNTR ids, a reference is created with
    ``create_reference_region_with_specific_repeats`` (flanking argument
    149) for every repeat count in that gene's range.

    :param illumina_read_dir: output prefix; files are written to
        ``<illumina_read_dir><GENE>/<repeat>.fa``
    """
    reference_vntrs = load_unique_vntrs_data()
    id_to_gene = {
        119: 'DRD4',
        1220: 'GP1BA',
        1221: 'CSTB',
        1214: 'MAOA',
        1219: 'IL1RN',
    }
    repeats = {
        'DRD4': range(1, 12),
        'GP1BA': range(1, 6),
        'CSTB': range(1, 16),
        'MAOA': range(1, 6),
        'IL1RN': range(1, 10),
    }

    for vntr_id, gene in id_to_gene.items():
        gene_dir = illumina_read_dir + gene + '/'
        for repeat in repeats[gene]:
            create_reference_region_with_specific_repeats(
                reference_vntrs[vntr_id], repeat,
                gene_dir + str(repeat) + '.fa', 149)
def genotype(args, genotype_parser):
    """Validate the genotype arguments, configure ``settings`` and run the analysis.

    Steps, in order:

    1. Report invalid argument combinations through ``print_error``
       (presumably this aborts the program -- confirm against its
       definition; the code below does not otherwise stop).
    2. Set ``settings.MAX_ERROR_RATE`` (0.3 for nanopore or pacbio input,
       0.05 otherwise) and ``settings.CORES``.
    3. Configure logging to ``log_<input basename>.log`` in the working
       directory, and redirect ``sys.stdout`` to ``args.outfile`` if given.
    4. Select the trained models file and load the reference VNTRs.
    5. Build a ``GenomeAnalyzer`` and call the method matching the input
       technology and options.

    :param args: parsed command-line arguments
    :param genotype_parser: parser handed to ``print_error``
    """
    if args.alignment_file is None and args.fasta is None:
        print_error(
            genotype_parser,
            'No input specified. Please specify alignment file or fasta file')

    # Nanopore and pacbio inputs share the same, higher, permitted error rate.
    settings.MAX_ERROR_RATE = 0.3 if (args.nanopore or args.pacbio) else 0.05

    if args.threads < 1:
        print_error(genotype_parser, 'threads cannot be less than 1')
    settings.CORES = args.threads

    average_coverage = None
    if args.expansion:
        if args.coverage is None:
            print_error(
                genotype_parser,
                'Please specify the average coverage to identify the expansion')
        average_coverage = args.coverage

    input_file = args.alignment_file or args.fasta
    input_is_alignment_file = input_file.endswith(('bam', 'sam', 'cram'))
    if not input_is_alignment_file:
        print_error(
            genotype_parser,
            "The input file format is not supported. Please use BAM/CRAM files."
        )

    if args.working_directory is None:
        print_error(
            genotype_parser,
            'Please specify working directory by -wd or --working_directory')
    if args.working_directory:
        working_directory = args.working_directory + '/'
    else:
        working_directory = os.path.dirname(input_file) + '/'

    log_name = 'log_%s.log' % os.path.basename(input_file)
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                        filename=working_directory + log_name,
                        level=logging.DEBUG,
                        filemode='w')

    if args.outfile:
        # Everything printed from here on goes to the requested output file.
        sys.stdout = open(args.outfile, 'w')

    models_file = args.models
    if models_file is None:
        if args.pacbio:
            models_file = settings.PACBIO_DEFAULT_MODELS_FILE
        else:
            models_file = settings.ILLUMINA_DEFAULT_MODELS_FILE
    settings.TRAINED_MODELS_DB = models_file
    settings.TRAINED_HMMS_DIR = os.path.dirname(
        os.path.realpath(models_file)) + '/'

    reference_vntrs = load_unique_vntrs_data()
    target_vntrs = [ref_vntr.id for ref_vntr in reference_vntrs]
    if args.vntr_id is not None:
        # An explicit comma-separated id list replaces the "all VNTRs" default.
        target_vntrs = [int(vid) for vid in args.vntr_id.split(',')]
    logging.info('Running adVNTR for %s VNTRs' % len(target_vntrs))

    analyzer = GenomeAnalyzer(reference_vntrs, target_vntrs,
                              working_directory, args.outfmt, args.haploid,
                              args.reference_filename, input_file)
    if args.pacbio:
        if input_is_alignment_file:
            analyzer.find_repeat_counts_from_pacbio_alignment_file(input_file)
        else:
            analyzer.find_repeat_counts_from_pacbio_reads(
                input_file, args.naive)
    elif args.frameshift:
        if valid_vntr_for_frameshift(target_vntrs):
            analyzer.find_frameshift_from_alignment_file(input_file)
        else:
            print_error(genotype_parser,
                        '--frameshift is not available for these VNTRs')
    elif input_is_alignment_file:
        analyzer.find_repeat_counts_from_alignment_file(
            input_file, average_coverage, args.update)
    else:
        analyzer.find_repeat_counts_from_short_reads(input_file)
# Every rpkm entry gets the same threshold, keyed by the part of its name
# before the first '.'.
for rpkm in rpkms:
    thresholds[rpkm.split('.')[0]] = 0.0005

run_permutation_test = False
bootstrapping = False
highest_fs = 0
lowest_p = 1e10

# Best effort: if GeneLocations() raises, the error is swallowed and
# gene_locations_obj is not bound here.
try:
    gene_locations_obj = GeneLocations()
except:
    pass

if __name__ == '__main__':
    # Index the reference VNTRs by id (only when run as a script).
    ref_vntrs = load_unique_vntrs_data(vntr_models_dir)
    reference_vntrs = {}
    for ref_vntr in ref_vntrs:
        reference_vntrs[ref_vntr.id] = ref_vntr


def get_average(lst):
    """Return ``sum(lst) / len(lst)``; an empty ``lst`` raises ZeroDivisionError."""
    return sum(lst) / len(lst)


def get_wgs_id_to_individual_id_map():
    # NOTE(review): only the beginning of this function is visible in this
    # excerpt; what it does with ``lines`` and what it returns cannot be
    # documented from here.
    # Any failure to read wgs_id_gtex_id_file is treated as an empty file.
    try:
        with open(wgs_id_gtex_id_file) as infile:
            lines = infile.readlines()
    except:
        lines = []
def genotype(args, genotype_parser):
    """Validate the genotype arguments, configure ``settings`` and run the analysis.

    Invalid argument combinations are reported through ``print_error``
    (presumably this aborts the program -- confirm against its definition).
    The function then sets the error rate, core count, BLAST temporary
    directory, logging and trained-model paths on ``settings``, loads the
    reference VNTRs, and calls the ``GenomeAnalyzer`` method matching the
    input technology and options.

    Target VNTRs are the ids in ``args.vntr_id`` (comma-separated) when it
    is given, otherwise a fixed list of four ids.

    :param args: parsed command-line arguments
    :param genotype_parser: parser handed to ``print_error``
    """
    if args.alignment_file is None and args.fasta is None:
        print_error(
            genotype_parser,
            'No input specified. Please specify alignment file or fasta file')

    # Nanopore and pacbio inputs share the same, higher, permitted error rate.
    settings.MAX_ERROR_RATE = 0.3 if (args.nanopore or args.pacbio) else 0.05

    if args.threads < 1:
        print_error(genotype_parser, 'threads cannot be less than 1')
    settings.CORES = args.threads

    average_coverage = None
    if args.expansion:
        if args.coverage is None:
            print_error(
                genotype_parser,
                'Please specify the average coverage to identify the expansion')
        average_coverage = args.coverage

    input_file = args.alignment_file or args.fasta
    input_is_alignment_file = input_file.endswith(('bam', 'sam'))

    if args.working_directory:
        working_directory = args.working_directory + '/'
    else:
        # Fall back to the directory that holds the input file.
        working_directory = os.path.dirname(input_file) + '/'
    settings.BLAST_TMP_DIR = working_directory + settings.BLAST_TMP_RELATIVE_DIR

    log_name = 'log_%s.log' % os.path.basename(input_file)
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                        filename=working_directory + log_name,
                        level=logging.DEBUG,
                        filemode='w')

    settings.TRAINED_MODELS_DB = args.models
    settings.TRAINED_HMMS_DIR = os.path.dirname(
        os.path.realpath(settings.TRAINED_MODELS_DB)) + '/'

    reference_vntrs = load_unique_vntrs_data()
    illumina_targets = [532789, 188871, 301645, 600000]

    # NOTE(review): this filtered list is always replaced just below, so its
    # result is never used. It is kept because the two predicate methods are
    # still called on every VNTR -- confirm they are side-effect free before
    # removing it.
    target_vntrs = [
        ref_vntr.id for ref_vntr in reference_vntrs
        if ref_vntr.is_non_overlapping() and not ref_vntr.has_homologous_vntr()
    ]
    if args.vntr_id is None:
        target_vntrs = illumina_targets
    else:
        target_vntrs = [int(vid) for vid in args.vntr_id.split(',')]

    analyzer = GenomeAnalyzer(reference_vntrs,
                              target_vntrs,
                              working_directory,
                              is_haploid=args.haploid)
    if args.pacbio:
        if input_is_alignment_file:
            analyzer.find_repeat_counts_from_pacbio_alignment_file(input_file)
        else:
            analyzer.find_repeat_counts_from_pacbio_reads(
                input_file, args.naive)
    elif args.frameshift:
        if valid_vntr_for_frameshift(target_vntrs):
            analyzer.find_frameshift_from_alignment_file(input_file)
        else:
            print_error(genotype_parser,
                        '--frameshift is not available for these VNTRs')
    elif input_is_alignment_file:
        analyzer.find_repeat_counts_from_alignment_file(
            input_file, average_coverage, args.update)
    else:
        analyzer.find_repeat_counts_from_short_reads(input_file)
# Output: Accuracy for each heterozygous scenario.
from advntr import models
from collections import defaultdict

advntr_db = '/home/jonghun/advntr_vcf/adVNTR/vntr_data/hg38_selected_VNTRs_Illumina.db'

####################################################################
# Get similar VNTR IDs for filtering (one integer id per line)
with open("similar_vntrs_hg38_maxlen1000.txt", "r") as f:
    similar_vntrs_hg38 = {int(line) for line in f}

# Load the reference VNTRs and keep the targets: pattern length within
# [6, 20] and id not listed as similar.
reference_vntrs = models.load_unique_vntrs_data(advntr_db)
ref_vntrs = dict((ref_vntr.id, ref_vntr) for ref_vntr in reference_vntrs)
target_vntrs = [
    ref_vntr for ref_vntr in reference_vntrs
    if 6 <= len(ref_vntr.pattern) <= 20
    and ref_vntr.id not in similar_vntrs_hg38
]
target_vids = {ref_vntr.id for ref_vntr in target_vntrs}

####################################################################
# Calculate accuracy for each scenario
print(
    "This script calculates accuracies for each scenarios for adVNTR-NN and GangSTR"
)

# chromosome + start_point to vid map for GangSTR
chr_start_to_vid = defaultdict(int)