import argparse
import logging
import os
import re
import shutil
import subprocess
import sys
import time
from multiprocessing import Pool

import pysam
from Bio import SeqIO

# NOTE: helpers referenced below (mkdir, extract_reads, format_time, run_wtdbg2,
# parse_input, alignment, sort_index_bam, detect_sv, vcf_parse_filter, find_te,
# generate_output, annotate_contig, write_ins_seqs, create_fa, create_loci_set)
# are defined elsewhere in the TELR codebase.


def prep_assembly(vcf_parsed, out, sample_name, raw_reads, contig_reads_dir):
    """Prepare reads for local assembly"""
    logging.info("Prepare reads for local assembly")
    # extract read IDs
    read_ids = os.path.join(out, sample_name + ".id")
    with open(vcf_parsed, "r") as input, open(read_ids, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            read_list = entry[8].split(",")
            for read in read_list:
                output.write(read + "\n")

    # generate unique ID list
    read_ids_unique = read_ids + ".unique"
    command = "cat " + read_ids + " | sort | uniq"
    with open(read_ids_unique, "w") as output:
        subprocess.call(command, stdout=output, shell=True)

    # filter raw reads using read list
    subset_fa = os.path.join(out, sample_name + ".subset.fa")
    command = "seqtk subseq " + raw_reads + " " + read_ids_unique + " | seqtk seq -a"
    with open(subset_fa, "w") as output:
        subprocess.call(command, stdout=output, shell=True)

    # reorder reads to match the order of loci in the parsed VCF
    subset_fa_reorder = out + "/" + sample_name + ".subset.reorder.fa"
    extract_reads(subset_fa, read_ids, subset_fa_reorder)

    # separate reads into one file per locus, using csplit
    mkdir(contig_reads_dir)
    csplit_prefix = contig_reads_dir + "/contig"
    m = []
    k = 1
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            # each read occupies two FASTA lines (header + sequence)
            k = k + 2 * (len(entry[8].split(",")))
            m.append(k)
    if len(m) == 1:
        subprocess.call(["cp", subset_fa_reorder, contig_reads_dir + "/contig0"])
    elif len(m) == 0:
        print("No insertion detected, exiting...")
    else:
        m = m[:-1]
        index = " ".join(str(i) for i in m)
        command = (
            "csplit -s -f " + csplit_prefix + " -n 1 " + subset_fa_reorder + " " + index
        )
        subprocess.call(command, shell=True)

    # remove tmp files
    os.remove(read_ids)
    os.remove(read_ids_unique)
    os.remove(subset_fa)
    os.remove(subset_fa_reorder)
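
# Illustration (helper sketch, not called by the pipeline): how the csplit
# offsets above are derived. Each read occupies two lines in the reordered
# FASTA (header + sequence), so a locus supported by n reads advances the next
# split point by 2 * n lines; the final offset marks EOF and is dropped,
# mirroring m[:-1] above.
def _csplit_offsets(read_counts):
    """Return the 1-based line offsets at which csplit cuts the FASTA."""
    offsets, k = [], 1
    for n in read_counts:
        k += 2 * n
        offsets.append(k)
    return offsets[:-1]

# _csplit_offsets([3, 2]) -> [7]: the second locus's reads begin at FASTA line 7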
def local_assembly(contig_dir, vcf_parsed, out, sample_name, raw_reads, thread,
                   presets, polish):
    """Perform local assembly using reads from parsed VCF file"""
    # Prepare reads used for local assembly
    contig_reads_dir = os.path.join(out, "contig_reads")
    prep_assembly(vcf_parsed, out, sample_name, raw_reads, contig_reads_dir)

    mkdir(contig_dir)
    if presets == "ont":
        presets_wtdbg2 = "ont"
        presets_minimap2 = "map-ont"
    else:
        presets_wtdbg2 = "rs"
        presets_minimap2 = "map-pb"

    k = 0
    asm_pa_list = []
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            contig_reads = contig_reads_dir + "/contig" + str(k)
            # rename contig reads
            contig_reads_rename = contig_reads_dir + "/" + contig_name + ".reads.fa"
            os.rename(contig_reads, contig_reads_rename)

            thread_asm = 1
            asm_pa = [
                contig_reads_rename,
                contig_dir,
                contig_name,
                thread_asm,
                presets_wtdbg2,
                presets_minimap2,
                polish,
            ]
            asm_pa_list.append(asm_pa)
            k = k + 1

    # run assembly in parallel
    logging.info("Perform local assembly of non-reference TE loci...")
    start_time = time.time()
    try:
        pool = Pool(processes=thread)
        pool.map(run_wtdbg2, asm_pa_list)
        pool.close()
        pool.join()
    except Exception as e:
        print(e)
        print("Local assembly failed, exiting...")
        sys.exit(1)
    proc_time = time.time() - start_time
    logging.info("Local assembly finished in " + format_time(proc_time))
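
# Hypothetical invocation of local_assembly() (paths and values below are
# placeholders, not TELR defaults). prep_assembly() writes one read file per
# candidate locus, after which each locus is assembled by an independent
# single-threaded wtdbg2 worker, so up to `thread` loci run concurrently:
#
# local_assembly(
#     contig_dir="out/intermediate_files/contig_assembly",
#     vcf_parsed="out/intermediate_files/sample.vcf_filtered.tsv",
#     out="out/intermediate_files",
#     sample_name="sample",
#     raw_reads="sample.fastq",
#     thread=8,
#     presets="ont",
#     polish=1,
# )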
def main():
    args = get_args()

    # logging config
    formatstr = "%(asctime)s: %(levelname)s: %(message)s"
    datestr = "%m/%d/%Y %H:%M:%S"
    logging.basicConfig(
        level=logging.DEBUG,
        filename=os.path.join(args.out, "TELR.log"),
        filemode="w",
        format=formatstr,
        datefmt=datestr,
    )
    logging.info("CMD: " + " ".join(sys.argv))
    start_time = time.time()

    # create directory for intermediate files
    tmp_dir = os.path.join(args.out, "intermediate_files")
    mkdir(tmp_dir)

    # parse input
    sample_name = os.path.splitext(os.path.basename(args.reads))[0]
    reads, reference, fasta, skip_alignment = parse_input(
        args.reads, args.reference, sample_name, tmp_dir)

    # alignment
    bam = os.path.join(tmp_dir, sample_name + "_sort.bam")
    if not skip_alignment:
        alignment(bam, fasta, reference, tmp_dir, sample_name, args.thread,
                  args.presets)
    else:
        sort_index_bam(reads, bam, args.thread)

    # initialize loci evaluation file
    loci_eval = os.path.join(args.out, sample_name + ".loci_eval.tsv")
    if os.path.isfile(loci_eval):
        os.remove(loci_eval)

    # detect and parse SVs
    vcf = os.path.join(tmp_dir, sample_name + ".vcf")
    detect_sv(vcf, bam, reference, args.library, tmp_dir, sample_name,
              args.thread)

    # parse SVs and filter for TE candidate loci
    vcf_parsed = os.path.join(tmp_dir, sample_name + ".vcf_filtered.tsv")
    vcf_parse_filter(
        vcf,
        vcf_parsed,
        bam,
        args.library,
        tmp_dir,
        sample_name,
        args.thread,
        loci_eval,
    )

    # local assembly
    contig_dir = os.path.join(tmp_dir, "contig_assembly")
    local_assembly(
        contig_dir,
        vcf_parsed,
        tmp_dir,
        sample_name,
        fasta,
        args.thread,
        args.presets,
        args.polish,
    )

    # annotate TE regions on assembled contigs
    (
        contig_te_annotation,
        contig_rm_annotation,
        te_freq,
        te_fa,
        merge_contigs,
    ) = annotate_contig(
        contig_dir,
        args.library,
        vcf_parsed,
        tmp_dir,
        sample_name,
        args.thread,
        args.presets,
        loci_eval,
    )

    # find TEs
    report_meta = find_te(
        contig_te_annotation,
        contig_rm_annotation,
        te_freq,
        merge_contigs,
        reference,
        tmp_dir,
        sample_name,
        args.gap,
        args.overlap,
        args.presets,
        loci_eval,
    )

    # generate output files
    generate_output(report_meta, te_fa, vcf_parsed, args.out, sample_name,
                    reference)

    # clean tmp files
    if not args.keep_files:
        shutil.rmtree(tmp_dir)

    proc_time = time.time() - start_time
    print("TELR finished!")
    logging.info("TELR finished in " + format_time(proc_time))
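
# Illustrative end-to-end command line for the pipeline driven by main()
# (script and file names are placeholders; see get_args() below for the flag
# definitions):
#
#   python telr.py -i reads.fastq -r reference.fasta -l te_library.fasta \
#       -o out_dir -t 8 -x ont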
def get_args():
    parser = argparse.ArgumentParser(
        description="Script to build a phylogeny from TE sequences")
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")

    # required
    required.add_argument(
        "--family",
        type=str,
        help="TE families (separated by comma)",
        required=True,
    )
    required.add_argument(
        "--telr_dirs",
        type=str,
        help="list of TELR output directories",
        nargs="+",
        required=True,
    )
    required.add_argument(
        "--consensus",
        type=str,
        help="TE consensus sequence",
        required=True,
    )

    # optional
    optional.add_argument(
        "--out",
        type=str,
        help="directory to output data (default = '.')",
        required=False,
    )
    optional.add_argument(
        "--thread",
        type=int,
        help="max cpu threads to use (default = '1')",
        required=False,
    )
    optional.add_argument(
        "--bootstrap",
        type=int,
        help="bootstrap number (only applies when raxml is used for creating phylogeny)",
        required=False,
    )
    optional.add_argument(
        "--method",
        type=str,
        help="method to create phylogeny, raxml/iqtree/both (default: iqtree)",
        required=False,
    )
    optional.add_argument(
        "--add_consensus",  # TODO
        action="store_true",
        help="If provided then add consensus sequence to the phylogeny (default: don't add consensus)",
        required=False,
    )
    optional.add_argument(
        "--allow_nested",  # TODO
        action="store_true",
        help="If provided then allow nested/composite sequences in the phylogeny (default: don't allow)",
        required=False,
    )
    optional.add_argument(
        "--length_filter",
        type=float,
        help="percentage of TE sequence longer or shorter than consensus sequence (default: 10%%)",
        required=False,
    )
    optional.add_argument(
        "--divergence_filter",
        type=float,
        help="percentage of TE sequence divergent from consensus sequence (default: 10%%)",
        required=False,
    )
    parser._action_groups.append(optional)
    args = parser.parse_args()

    # check that input files exist
    try:
        open(args.consensus, "r")
    except Exception as e:
        print(e)
        logging.exception("Cannot open input file: " + args.consensus)
        sys.exit(1)

    # set up out dir variable
    if args.out is None:
        args.out = "."
    args.out = os.path.abspath(args.out)
    mkdir(args.out)

    # set up default values for optional arguments
    if args.thread is None:
        args.thread = 1
    if args.method is None:
        args.method = "iqtree"
    elif args.method != "raxml" and args.method != "iqtree" and args.method != "both":
        print("method not recognized, please check help page")
        sys.exit(1)
    if args.length_filter is None:
        args.length_filter = 1
    if args.divergence_filter is None:
        args.divergence_filter = 0.1
    if args.bootstrap is None:
        args.bootstrap = 5

    return args
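
# Illustrative invocation of the phylogeny script this parser belongs to
# (script name, family, and paths are placeholders):
#
#   python telr_phylogeny.py --family jockey --telr_dirs out1 out2 \
#       --consensus te_consensus.fasta --method iqtree --thread 4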
def prep_assembly(vcf_parsed, out, sample_name, bam, raw_reads, reads_dir,
                  read_type="sv"):
    """Prepare reads for local assembly"""
    # logging.info("Prepare reads for local assembly")
    if read_type == "sv":
        # extract IDs of the reads reported by Sniffles
        read_ids = os.path.join(out, sample_name + ".id")
        with open(vcf_parsed, "r") as input, open(read_ids, "w") as output:
            for line in input:
                entry = line.replace("\n", "").split("\t")
                read_list = entry[8].split(",")
                for read in read_list:
                    output.write(read + "\n")
    else:
        # extract IDs of all reads within a window around each breakpoint
        window = 1000
        samfile = pysam.AlignmentFile(bam, "rb")
        read_ids = os.path.join(out, sample_name + ".id")
        vcf_parsed_new = vcf_parsed + ".new"
        with open(vcf_parsed, "r") as input, open(read_ids, "w") as output, open(
                vcf_parsed_new, "w") as VCF:
            for line in input:
                entry = line.replace("\n", "").split("\t")
                # get sniffles read list
                read_list = entry[8].split(",")
                reads_sniffles = set(read_list)

                ins_chr = entry[0]
                ins_breakpoint = round((int(entry[1]) + int(entry[2])) / 2)
                if ins_breakpoint < window:
                    start = 0
                else:
                    start = ins_breakpoint - window
                end = ins_breakpoint + window
                reads = set()
                # coverage = 0
                for read in samfile.fetch(ins_chr, start, end):
                    reads.add(read.query_name)
                for read in reads:
                    output.write(read + "\n")

                # append the read count as an extra column
                out_line = line.replace("\n", "") + "\t" + str(len(reads))
                VCF.write(out_line + "\n")
        vcf_parsed = vcf_parsed_new

    # generate unique ID list
    read_ids_unique = read_ids + ".unique"
    command = "cat " + read_ids + " | sort | uniq"
    with open(read_ids_unique, "w") as output:
        subprocess.call(command, stdout=output, shell=True)

    # filter raw reads using read list
    subset_fa = os.path.join(out, sample_name + ".subset.fa")
    command = "seqtk subseq " + raw_reads + " " + read_ids_unique + " | seqtk seq -a"
    with open(subset_fa, "w") as output:
        subprocess.call(command, stdout=output, shell=True)

    # reorder reads
    subset_fa_reorder = out + "/" + sample_name + ".subset.reorder.fa"
    extract_reads(subset_fa, read_ids, subset_fa_reorder)

    # separate reads into multiple files, using csplit
    mkdir(reads_dir)
    csplit_prefix = reads_dir + "/contig"
    m = []
    k = 1
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            if read_type == "sv":
                k = k + 2 * (len(entry[8].split(",")))
            else:
                k = k + 2 * int(entry[14])
            m.append(k)
    if len(m) == 1:
        subprocess.call(["cp", subset_fa_reorder, reads_dir + "/contig0"])
    elif len(m) == 0:
        print("No insertion detected, exiting...")
    else:
        m = m[:-1]
        index = " ".join(str(i) for i in m)
        command = ("csplit -s -f " + csplit_prefix + " -n 1 " +
                   subset_fa_reorder + " " + index)
        subprocess.call(command, shell=True)

    # remove tmp files
    os.remove(read_ids)
    os.remove(read_ids_unique)
    os.remove(subset_fa)
    os.remove(subset_fa_reorder)
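
# Sketch (illustrative helper, not called by the pipeline) of the
# breakpoint-window arithmetic in the read_type != "sv" branch above: reads
# overlapping +/- `window` bp around the midpoint of the reported insertion
# interval are collected for assembly, clamped at the start of the chromosome.
def _breakpoint_window(start, end, window=1000):
    """Return the (start, end) samfile.fetch() window around the midpoint."""
    midpoint = round((int(start) + int(end)) / 2)
    return max(midpoint - window, 0), midpoint + window

# _breakpoint_window(1500, 1700) -> (600, 2600)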
def filter_vcf(ins, ins_filtered, te_library, out, sample_name, thread,
               loci_eval):
    """
    Filter insertion sequences from Sniffles VCF by repeatmasking with TE consensus
    """
    # construct fasta from parsed vcf file
    ins_seqs = os.path.join(out, sample_name + ".vcf_ins.fasta")
    write_ins_seqs(ins, ins_seqs)

    # run RepeatMasker on the insertion sequences
    repeatmasker_dir = os.path.join(out, "vcf_ins_repeatmask")
    mkdir(repeatmasker_dir)
    try:
        subprocess.call([
            "RepeatMasker",
            "-dir", repeatmasker_dir,
            "-gff", "-s", "-nolow", "-no_is", "-xsmall",
            "-e", "ncbi",
            "-lib", te_library,
            "-pa", str(thread),
            ins_seqs,
        ])
        ins_repeatmasked = os.path.join(
            repeatmasker_dir, os.path.basename(ins_seqs) + ".out.gff")
        open(ins_repeatmasked, "r")
    except Exception as e:
        print(e)
        print("Repeatmasking VCF insertion sequences failed, exiting...")
        sys.exit(1)

    # extract VCF sequences that contain TEs
    with open(ins_repeatmasked, "r") as input:
        ins_te_loci = {
            line.replace("\n", "").split("\t")[0]
            for line in input if "RepeatMasker" in line
        }

    with open(ins, "r") as input, open(ins_filtered, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            # TODO: maybe add filter for insertion sequences covered by TE?
            if contig_name in ins_te_loci:
                output.write(line)
    os.remove(ins_seqs)

    # report removed loci
    with open(loci_eval, "a") as output:
        for locus in create_loci_set(ins):
            if locus not in ins_te_loci:
                output.write(
                    "\t".join([locus, "VCF sequence not repeatmasked"]) + "\n")
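
# The filter above keeps a locus whenever any line of the RepeatMasker GFF
# names it, i.e. the insertion sequence received at least one TE hit. Minimal
# illustration with a fabricated (abbreviated) GFF line:
#
# gff = ['chr1_100_200\tRepeatMasker\tsimilarity\t1\t480']
# {l.split("\t")[0] for l in gff if "RepeatMasker" in l}  # -> {"chr1_100_200"}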
def annotate_contig(contig_dir, te_library, vcf_parsed, out, sample_name,
                    thread, presets, loci_eval):
    logging.info("Annotate contigs...")
    if presets == "ont":
        presets = "map-ont"
    else:
        presets = "map-pb"

    all_loci = create_loci_set(vcf_parsed)
    assembly_passed_loci = set()
    merge_contigs = os.path.join(out, sample_name + ".contigs.fa")
    with open(merge_contigs, "w") as output:
        for locus in all_loci:
            assembly = os.path.join(contig_dir, locus + ".cns.fa")
            if os.path.isfile(assembly) and os.stat(assembly).st_size > 0:
                assembly_passed_loci.add(locus)
                with open(assembly, "r") as handle:
                    records = SeqIO.parse(handle, "fasta")
                    for record in records:
                        if record.id == "ctg1":
                            record.id = locus
                            record.description = "len=" + str(len(record.seq))
                            SeqIO.write(record, output, "fasta")

    # report assembly-failed loci
    with open(loci_eval, "a") as output:
        for locus in all_loci:
            if locus not in assembly_passed_loci:
                output.write("\t".join([locus, "Contig assembly failed"]) + "\n")

    # map VCF insertion sequences to assembled contigs
    seq2contig_out = os.path.join(out, "seq2contig.paf")
    if os.path.isfile(seq2contig_out):
        os.remove(seq2contig_out)

    # TODO: consider that some contigs might not exist
    seq2contig_passed_loci = set()
    seq2contig_dir = os.path.join(out, "seq2contig")
    seq2contig = os.path.join(out, "seq2contig.paf")
    mkdir(seq2contig_dir)
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            if contig_name in assembly_passed_loci:
                vcf_seq = entry[7]
                query = os.path.join(seq2contig_dir, contig_name + ".seq.fa")
                create_fa(contig_name, vcf_seq, query)
                subject = os.path.join(seq2contig_dir, contig_name + ".contig.fa")
                with open(subject, "w") as output:
                    try:
                        # check_call raises CalledProcessError on failure
                        # (subprocess.call never raises it)
                        subprocess.check_call(
                            ["samtools", "faidx", merge_contigs, contig_name],
                            stdout=output,
                        )
                    except subprocess.CalledProcessError:
                        print(contig_name + ": contig assembly doesn't exist")
                        continue
                seq2contig_output = subprocess.check_output([
                    "minimap2",
                    "-cx", presets,
                    "--secondary=no",
                    "-v", "0",
                    subject,
                    query,
                ])
                seq2contig_output = seq2contig_output.decode("utf-8")
                if seq2contig_output != "":
                    seq2contig_passed_loci.add(contig_name)
                    with open(seq2contig, "a") as output:
                        output.write(seq2contig_output)
                os.remove(query)
                os.remove(subject)
    os.rmdir(seq2contig_dir)

    # convert to bed format
    seq2contig_bed = os.path.join(out, "seq2contig.bed")
    with open(seq2contig, "r") as input, open(seq2contig_bed, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            bed_line = "\t".join(
                [entry[0], entry[7], entry[8], entry[5], entry[11], entry[4]])
            output.write(bed_line + "\n")

    # report loci whose VCF sequence failed to map to the assembled contig
    with open(loci_eval, "a") as output:
        for locus in assembly_passed_loci:
            if locus not in seq2contig_passed_loci:
                output.write("\t".join([
                    locus, "Sniffles VCF sequence not mapped to assembled contig"
                ]) + "\n")

    # map TE library to contigs using minimap2
    # TE-contig alignment
    te2contig_out = os.path.join(out, sample_name + ".te2contig.paf")
    if os.path.isfile(te2contig_out):
        os.remove(te2contig_out)
    for locus in seq2contig_passed_loci:
        contig_fa = os.path.join(out, locus + ".fa")
        with open(contig_fa, "w") as output:
            subprocess.call(["samtools", "faidx", merge_contigs, locus],
                            stdout=output)
        # map TE library to contig using minimap2
        with open(te2contig_out, "a") as output:
            subprocess.call(
                [
                    "minimap2",
                    "-cx", presets,
                    contig_fa,
                    te_library,
                    "-v", "0",
                    "-t", str(thread),
                ],
                stdout=output,
            )
        os.remove(contig_fa)

    # convert to bed format
    te2contig_bed = os.path.join(out, sample_name + ".te2contig.bed")
    with open(te2contig_out, "r") as input, open(te2contig_bed, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            bed_line = "\t".join(
                [entry[5], entry[7], entry[8], entry[0], entry[11], entry[4]])
            output.write(bed_line + "\n")

    # use VCF sequence alignment to filter minimap2 TE-contig alignment
    te2contig_filter_raw = os.path.join(out, sample_name + ".te2contig_filter.tsv")
    with open(te2contig_filter_raw, "w") as output:
        subprocess.call(
            [
                "bedtools",
                "intersect",
                "-a", te2contig_bed,
                "-b", seq2contig_bed,
                "-wao",
            ],
            stdout=output,
        )

    # filter and merge
    # get rid of -1 and make it into bed format
    te2contig_filter_tmp_bed = os.path.join(
        out, sample_name + ".te2contig_filter.tmp.bed")
    with open(te2contig_filter_raw, "r") as input, open(
            te2contig_filter_tmp_bed, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            # the overlap between the VCF sequence alignment and the
            # TE-contig alignment has to be over 10 bp
            if int(entry[12]) > 10:
                out_line = "\t".join(
                    [entry[0], entry[1], entry[2], entry[3], entry[4], entry[5]])
                output.write(out_line + "\n")

    # sort
    te2contig_filter_tmp_sort_bed = (
        out + "/" + sample_name + ".te2contig_filter.tmp.sort.bed")
    command = "bedtools sort -i " + te2contig_filter_tmp_bed
    with open(te2contig_filter_tmp_sort_bed, "w") as output:
        subprocess.call(command, shell=True, stdout=output)

    # find out what's filtered out
    seq_mm2_overlap_loci = set()
    with open(te2contig_filter_tmp_sort_bed, "r") as input:
        for line in input:
            seq_mm2_overlap_loci.add(line.split("\t")[0])
    # seq_mm2_overlap_loci = create_loci_set(te2contig_filter_tmp_sort_bed)
    with open(loci_eval, "a") as output:
        for locus in seq2contig_passed_loci:
            if locus not in seq_mm2_overlap_loci:
                output.write("\t".join(
                    [locus, "VCF sequence doesn't overlap contig annotation"]) +
                    "\n")

    # merge
    contig_te_annotation = out + "/" + sample_name + ".te2contig_filter.bed"
    command = (
        'bedtools merge -d 10000 -c 4,6 -o distinct,distinct -delim "|" -i ' +
        te2contig_filter_tmp_sort_bed)
    with open(contig_te_annotation, "w") as output:
        subprocess.call(command, shell=True, stdout=output)
    # seq_mm2_overlap_merge_loci = create_loci_set(contig_te_annotation)

    # remove tmp files
    os.remove(seq2contig)
    os.remove(te2contig_bed)
    os.remove(te2contig_out)
    os.remove(seq2contig_bed)
    os.remove(te2contig_filter_raw)
    os.remove(te2contig_filter_tmp_bed)
    os.remove(te2contig_filter_tmp_sort_bed)

    # extract TE sequences and run RepeatMasker on them
    te_fa = out + "/" + sample_name + ".te.fa"
    with open(te_fa, "w") as output:
        subprocess.call(
            [
                "bedtools",
                "getfasta",
                "-fi", merge_contigs,
                "-bed", contig_te_annotation,
            ],
            stdout=output,
        )
    repeatmasker_dir = os.path.join(out, "contig_te_repeatmask")
    mkdir(repeatmasker_dir)
    try:
        subprocess.call([
            "RepeatMasker",
            "-dir", repeatmasker_dir,
            "-gff", "-s", "-nolow", "-no_is", "-xsmall",
            "-e", "ncbi",
            "-lib", te_library,
            "-pa", str(thread),
            te_fa,
        ])
        contig_te_repeatmasked = os.path.join(
            repeatmasker_dir, os.path.basename(te_fa) + ".out.gff")
        open(contig_te_repeatmasked, "r")
    except Exception as e:
        print(e)
        print("Repeatmasking contig TE sequences failed, exiting...")
        sys.exit(1)

    # parse and merge
    te2contig_rm = out + "/" + sample_name + ".te2contig_rm.bed"
    with open(contig_te_repeatmasked, "r") as input, open(te2contig_rm,
                                                          "w") as output:
        for line in input:
            if "##" not in line:
                entry = line.replace("\n", "").split("\t")
                contig_name = entry[0].rsplit(":", 1)[0]
                start = entry[0].rsplit(":", 1)[1].split("-")[0]
                end = entry[0].rsplit(":", 1)[1].split("-")[1]
                # contigs = entry[0].replace(':', '-').split("-")
                family = re.sub('Target "Motif:|".*', "", entry[8])
                strand = entry[6]
                score = entry[5]
                out_line = "\t".join(
                    [contig_name, start, end, family, score, strand])
                output.write(out_line + "\n")
    print("Done\n")

    contig_rm_annotation = out + "/" + sample_name + ".te2contig_rm.merge.bed"
    command = 'bedtools merge -c 4,6 -o distinct -delim "|" -i ' + te2contig_rm
    with open(contig_rm_annotation, "w") as output:
        subprocess.call(command, shell=True, stdout=output)
    os.remove(te2contig_rm)

    # seq_mm2_overlap_merge_rm_loci = create_loci_set(te2contig_rm_merge)
    # with open(loci_eval, "a") as output:
    #     for locus in seq_mm2_overlap_merge_loci:
    #         if locus not in seq_mm2_overlap_merge_rm_loci:
    #             print(locus, "contig seq RM failed")

    # build frequency dict
    te_freq = dict()
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            freq = entry[5]
            te_freq[contig_name] = freq

    return contig_te_annotation, contig_rm_annotation, te_freq, te_fa, merge_contigs
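
# Column reference (assumption: standard minimap2 PAF fields) for the
# PAF -> BED conversions in annotate_contig() above. PAF target coordinates
# are 0-based half-open, matching BED, so they transfer without adjustment:
#
#   PAF col 0     query name        seq2contig: BED chrom; te2contig: BED name
#   PAF col 4     strand            -> BED strand
#   PAF col 5     target name       seq2contig: BED name; te2contig: BED chrom
#   PAF cols 7,8  target start/end  -> BED start/end
#   PAF col 11    mapping quality   -> BED score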
def get_args():
    parser = argparse.ArgumentParser(
        description="Script to detect TEs in long read data")
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")

    # required
    required.add_argument(
        "-i", "--reads",
        type=str,
        help="reads in fasta/fastq format or read alignments in bam format",
        required=True,
    )
    required.add_argument(
        "-r", "--reference",
        type=str,
        help="reference genome in fasta format",
        required=True,
    )
    required.add_argument(
        "-l", "--library",
        type=str,
        help="TE consensus sequences in fasta format",
        required=True,
    )

    # optional
    optional.add_argument(
        "-x", "--presets",
        type=str,
        help="parameter presets for different sequencing technologies (default = 'pacbio')",
        required=False,
    )
    optional.add_argument(
        "-p", "--polish",
        type=int,
        help="rounds of contig polishing (default = 1)",
        required=False,
    )
    optional.add_argument(
        "-o", "--out",
        type=str,
        help="directory to output data (default = '.')",
        required=False,
    )
    optional.add_argument(
        "-t", "--thread",
        type=int,
        help="max cpu threads to use (default = '1')",
        required=False,
    )
    optional.add_argument(
        "-g", "--gap",
        type=int,
        help="max gap size for flanking sequence alignment (default = '20')",
        required=False,
    )
    optional.add_argument(
        "-v", "--overlap",
        type=int,
        help="max overlap size for flanking sequence alignment (default = '20')",
        required=False,
    )
    optional.add_argument(
        "-k", "--keep_files",
        action="store_true",
        help="If provided then all intermediate files will be kept (default: remove intermediate files)",
        required=False,
    )
    parser._action_groups.append(optional)
    args = parser.parse_args()

    # check that input files exist
    for input_file in (args.reads, args.reference, args.library):
        try:
            open(input_file, "r")
        except Exception as e:
            print(e)
            logging.exception("Cannot open input file: " + input_file)
            sys.exit(1)

    # set up default values for optional arguments
    if args.presets is None:
        args.presets = "pacbio"

    # set up out dir variable
    if args.out is None:
        args.out = "."
    args.out = os.path.abspath(args.out)
    mkdir(args.out)

    if args.thread is None:
        args.thread = 1
    if args.polish is None:
        args.polish = 1
    if args.gap is None:
        args.gap = 20
    if args.overlap is None:
        args.overlap = 20

    return args
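
# Example command lines this parser accepts (script and file names are
# placeholders):
#
#   telr.py -i reads.fastq -r ref.fa -l te_lib.fa             # PacBio defaults
#   telr.py -i aln.bam -r ref.fa -l te_lib.fa -x ont -t 16    # ONT, 16 threads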
def filter_vcf(ins, ins_filtered, te_library, out, sample_name, thread,
               loci_eval):
    """
    Filter insertion sequences from Sniffles VCF by repeatmasking with TE consensus
    """
    # construct fasta from parsed vcf file
    ins_seqs = os.path.join(out, sample_name + ".vcf_ins.fasta")
    write_ins_seqs(ins, ins_seqs)

    # get the length of each insertion sequence  TODO: this can be generalized
    contig_len = dict()
    if os.path.isfile(ins_seqs):
        with open(ins_seqs, "r") as handle:
            records = SeqIO.parse(handle, "fasta")
            for record in records:
                contig_len[record.id] = len(record.seq)

    # run RepeatMasker on the insertion sequences
    repeatmasker_dir = os.path.join(out, "vcf_ins_repeatmask")
    mkdir(repeatmasker_dir)
    try:
        subprocess.call(
            [
                "RepeatMasker",
                "-dir", repeatmasker_dir,
                "-gff", "-s", "-nolow", "-no_is", "-xsmall",
                "-e", "ncbi",
                "-lib", te_library,
                "-pa", str(thread),
                ins_seqs,
            ]
        )
        ins_repeatmasked = os.path.join(
            repeatmasker_dir, os.path.basename(ins_seqs) + ".out.gff"
        )
        open(ins_repeatmasked, "r")
    except Exception as e:
        print(e)
        print("Repeatmasking VCF insertion sequences failed, exiting...")
        sys.exit(1)

    # merge RepeatMasker GFF intervals
    ins_rm_merge = os.path.join(
        repeatmasker_dir, os.path.basename(ins_seqs) + ".out.merge.bed"
    )
    with open(ins_rm_merge, "w") as output:
        subprocess.call(["bedtools", "merge", "-i", ins_repeatmasked],
                        stdout=output)

    # extract VCF sequences that contain TEs
    ins_te_loci = dict()
    with open(ins_rm_merge, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = entry[0]
            length = int(entry[2]) - int(entry[1])
            ins_te_prop = round(length / contig_len[contig_name], 2)
            if contig_name in ins_te_loci:
                ins_te_loci[contig_name] = ins_te_loci[contig_name] + ins_te_prop
            else:
                ins_te_loci[contig_name] = ins_te_prop

    with open(ins, "r") as input, open(ins_filtered, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            # TODO: maybe add filter for insertion sequences covered by TE?
            if contig_name in ins_te_loci:
                out_line = line.replace("\n", "") + "\t" + str(
                    ins_te_loci[contig_name])
                output.write(out_line + "\n")

    # os.remove(ins_seqs)

    # report removed loci
    with open(loci_eval, "a") as output:
        for locus in create_loci_set(ins):
            if locus not in ins_te_loci:
                output.write(
                    "\t".join([locus, "VCF sequence not repeatmasked"]) + "\n")
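
# Condensed sketch (illustrative, not called by the pipeline) of the coverage
# bookkeeping above: each merged RepeatMasker interval is expressed as a
# rounded fraction of the insertion length, then summed per contig. Interval
# values below are fabricated for a hypothetical 500 bp insertion.
def _te_proportion(intervals, contig_length):
    """Fraction of an insertion covered by merged RepeatMasker intervals."""
    return sum(round((end - start) / contig_length, 2)
               for start, end in intervals)

# _te_proportion([(0, 200), (300, 450)], 500) -> 0.7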