Example #1
def prep_assembly(vcf_parsed, out, sample_name, raw_reads, contig_reads_dir):
    """Prepare reads for local assembly"""
    logging.info("Prepare reads for local assembly")
    # extract read IDs
    read_ids = os.path.join(out, sample_name + ".id")
    with open(vcf_parsed, "r") as input, open(read_ids, "w") as output:
        for line in input:
            entry = line.replace('\n', '').split("\t")
            read_list = entry[8].split(",")
            for read in read_list:
                output.write(read + "\n")

    # generate unique ID list
    read_ids_unique = read_ids + ".unique"
    command = "cat " + read_ids + " | sort | uniq"
    with open(read_ids_unique, "w") as output:
        subprocess.call(command, stdout=output, shell=True)

    # filter raw reads using read list
    subset_fa = os.path.join(out, sample_name + ".subset.fa")
    command = "seqtk subseq " + raw_reads + \
        " " + read_ids_unique + " | seqtk seq -a"
    with open(subset_fa, "w") as output:
        subprocess.call(command, stdout=output, shell=True)

    # reorder reads
    subset_fa_reorder = out + "/" + sample_name + ".subset.reorder.fa"
    extract_reads(subset_fa, read_ids, subset_fa_reorder)

    # separate reads into multiple files, using csplit
    mkdir(contig_reads_dir)
    csplit_prefix = contig_reads_dir + '/contig'
    # compute csplit break points: in the reordered FASTA each read occupies
    # two lines (header + sequence), so advance the running line counter by
    # two per supporting read of every insertion
    m = []
    k = 1
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace('\n', '').split("\t")
            k = k + 2 * len(entry[8].split(","))
            m.append(k)
    if len(m) == 1:
        subprocess.call(
            ["cp", subset_fa_reorder, contig_reads_dir + '/contig0'])
    elif len(m) == 0:
        print("No insertion detected, exiting...")
        sys.exit(0)
    else:
        m = m[:-1]
        index = " ".join(str(i) for i in m)
        command = "csplit -s -f " + csplit_prefix + \
            " -n 1 " + subset_fa_reorder + " " + index
        subprocess.call(command, shell=True)

    # remove tmp files
    os.remove(read_ids)
    os.remove(read_ids_unique)
    os.remove(subset_fa)
    os.remove(subset_fa_reorder)
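
Note: the helper `extract_reads` called above is not shown in this example. A minimal sketch of what it might look like, assuming Biopython is available: the csplit offsets computed above rely on the reordered FASTA holding exactly two lines per read (header plus one sequence line, as `seqtk seq -a` emits) in `read_ids` order, repeating a record each time its ID repeats, so this sketch writes records by hand instead of using SeqIO.write (which wraps long sequences).

from Bio import SeqIO

def extract_reads(fasta, id_list, out_fasta):
    """Sketch: emit records from `fasta` in the order (and multiplicity)
    given by `id_list`, one header line and one sequence line each."""
    index = SeqIO.index(fasta, "fasta")  # random access by read ID
    with open(id_list, "r") as ids, open(out_fasta, "w") as output:
        for line in ids:
            read_id = line.strip()
            if read_id in index:
                output.write(">" + read_id + "\n")
                output.write(str(index[read_id].seq) + "\n")
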
Example #2
def local_assembly(contig_dir, vcf_parsed, out, sample_name, raw_reads, thread,
                   presets, polish):
    """Perform local assembly using reads from parsed VCF file"""

    # Prepare reads used for local assembly
    contig_reads_dir = os.path.join(out, "contig_reads")
    prep_assembly(vcf_parsed, out, sample_name, raw_reads, contig_reads_dir)

    mkdir(contig_dir)

    if presets == "ont":
        presets_wtdbg2 = "ont"
        presets_minimap2 = "map-ont"
    else:
        presets_wtdbg2 = "rs"
        presets_minimap2 = "map-pb"

    # contig read files were written in vcf_parsed order, so contig<k>
    # corresponds to the k-th entry below
    k = 0
    asm_pa_list = []
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace('\n', '').split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            contig_reads = contig_reads_dir + "/contig" + str(k)
            # rename contig reads
            contig_reads_rename = contig_reads_dir + "/" + contig_name + ".reads.fa"
            os.rename(contig_reads, contig_reads_rename)
            thread_asm = 1
            asm_pa = [
                contig_reads_rename, contig_dir, contig_name, thread_asm,
                presets_wtdbg2, presets_minimap2, polish
            ]
            asm_pa_list.append(asm_pa)
            k = k + 1
    # run assembly in parallel
    logging.info("Perform local assembly of non-reference TE loci...")
    start_time = time.time()
    try:
        pool = Pool(processes=thread)
        pool.map(run_wtdbg2, asm_pa_list)
        pool.close()
        pool.join()
    except Exception as e:
        print(e)
        print("Local assembly failed, exiting...")
        sys.exit(1)

    proc_time = time.time() - start_time
    logging.info("Local assembly finished in " + format_time(proc_time))
Example #3
def main():
    args = get_args()
    # logging config
    formatstr = "%(asctime)s: %(levelname)s: %(message)s"
    datestr = "%m/%d/%Y %H:%M:%S"
    logging.basicConfig(
        level=logging.DEBUG,
        filename=os.path.join(args.out, "TELR.log"),
        filemode="w",
        format=formatstr,
        datefmt=datestr,
    )
    logging.info("CMD: " + " ".join(sys.argv))
    start_time = time.time()

    # create directory for intermediate files
    tmp_dir = os.path.join(args.out, "intermediate_files")
    mkdir(tmp_dir)

    # Parse input
    sample_name = os.path.splitext(os.path.basename(args.reads))[0]
    reads, reference, fasta, skip_alignment = parse_input(
        args.reads, args.reference, sample_name, tmp_dir)

    # Alignment
    bam = os.path.join(tmp_dir, sample_name + "_sort.bam")
    if not skip_alignment:
        alignment(bam, fasta, reference, tmp_dir, sample_name, args.thread,
                  args.presets)
    else:
        sort_index_bam(reads, bam, args.thread)

    # initialize loci evaluation file
    loci_eval = os.path.join(args.out, sample_name + ".loci_eval.tsv")
    if os.path.isfile(loci_eval):
        os.remove(loci_eval)

    # Detect and parse SV
    vcf = os.path.join(tmp_dir, sample_name + ".vcf")
    detect_sv(vcf, bam, reference, args.library, tmp_dir, sample_name,
              args.thread)

    # Parse SV and filter for TE candidate locus
    vcf_parsed = os.path.join(tmp_dir, sample_name + ".vcf_filtered.tsv")
    vcf_parse_filter(
        vcf,
        vcf_parsed,
        bam,
        args.library,
        tmp_dir,
        sample_name,
        args.thread,
        loci_eval,
    )

    # Local assembly
    contig_dir = os.path.join(tmp_dir, "contig_assembly")
    local_assembly(
        contig_dir,
        vcf_parsed,
        tmp_dir,
        sample_name,
        fasta,
        args.thread,
        args.presets,
        args.polish,
    )

    # Annotate contig for TE region
    (
        contig_te_annotation,
        contig_rm_annotation,
        te_freq,
        te_fa,
        merge_contigs,
    ) = annotate_contig(
        contig_dir,
        args.library,
        vcf_parsed,
        tmp_dir,
        sample_name,
        args.thread,
        args.presets,
        loci_eval,
    )

    # find TEs
    report_meta = find_te(
        contig_te_annotation,
        contig_rm_annotation,
        te_freq,
        merge_contigs,
        reference,
        tmp_dir,
        sample_name,
        args.gap,
        args.overlap,
        args.presets,
        loci_eval,
    )

    # generate output files
    generate_output(report_meta, te_fa, vcf_parsed, args.out, sample_name,
                    reference)

    # clean tmp files
    if not args.keep_files:
        shutil.rmtree(tmp_dir)

    proc_time = time.time() - start_time
    print("TELR finished!")
    logging.info("TELR finished in " + format_time(proc_time))
Example #4
def get_args():
    parser = argparse.ArgumentParser(
        description="Script to detect build phylogeny from TE sequences"
    )
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")

    # required
    required.add_argument(
        "--family",
        type=str,
        help="TE families (separated by comma)",
        required=True,
    )

    required.add_argument(
        "--telr_dirs",
        type=str,
        help="list of TELR output directories",
        nargs="+",
        required=True,
    )

    required.add_argument(
        "--consensus",
        type=str,
        help="TE consensus sequence",
        required=True,
    )

    # optional
    optional.add_argument(
        "--out",
        type=str,
        help="directory to output data (default = '.')",
        required=False,
    )
    optional.add_argument(
        "--thread",
        type=int,
        help="max cpu threads to use (default = '1')",
        required=False,
    )
    optional.add_argument(
        "--bootstrap",
        type=int,
        help="bootstrap number (only apply when raxml is used for creating phylogeny)",
        required=False,
    )
    optional.add_argument(
        "--method",
        type=str,
        help="method to create phylogeny, raxml/iqtree/both (default: raxml)",
        required=False,
    )
    optional.add_argument(
        "--add_consensus",  # TODO
        action="store_true",
        help="If provided then add consensus sequence to the phylogeny (default: don't add consensus)",
        required=False,
    )
    optional.add_argument(
        "--allow_nested",  # TODO
        action="store_true",
        help="If provided then allow nested/composite sequences in the phylogeny (default: don't allow)",
        required=False,
    )
    optional.add_argument(
        "--length_filter",
        type=float,
        help="percentage of TE sequence longer or shorter than consensus sequence (default: 10%%)",
        required=False,
    )
    optional.add_argument(
        "--divergence_filter",
        type=float,
        help="percentage of TE sequence divergent from consensus sequence (default: 10%%)",
        required=False,
    )
    parser._action_groups.append(optional)
    args = parser.parse_args()

    # check that the input file exists and is readable
    try:
        with open(args.consensus, "r"):
            pass
    except Exception as e:
        print(e)
        logging.exception("Can not open input file: " + args.consensus)
        sys.exit(1)

    # sets up out dir variable
    if args.out is None:
        args.out = "."
    args.out = os.path.abspath(args.out)
    mkdir(args.out)

    # set up default value for optional arguments
    if args.thread is None:
        args.thread = 1

    if args.method is None:
        args.method = "iqtree"
    elif args.method != "raxml" and args.method != "iqtree" and args.method != "both":
        print("method not recognized, please check help page")
        sys.exit(1)

    if args.length_filter is None:
        args.length_filter = 0.1

    if args.divergence_filter is None:
        args.divergence_filter = 0.1

    if args.bootstrap is None:
        args.bootstrap = 5

    return args
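
Note: the `parser._action_groups.pop()` / `append` dance above relies on an argparse implementation detail: the default "optional arguments" group is the last entry in `_action_groups`, so detaching it and re-attaching it after a custom "required arguments" group makes required options print first in `--help`. A standalone demo of the trick:

import argparse

parser = argparse.ArgumentParser(description="demo of the help-ordering trick")
optional = parser._action_groups.pop()  # detach the default optional group
required = parser.add_argument_group("required arguments")
required.add_argument("--input", required=True, help="an input file")
optional.add_argument("--verbose", action="store_true", help="chatty output")
parser._action_groups.append(optional)  # re-attach so it prints last
args = parser.parse_args(["--input", "x"])
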
Example #5
def prep_assembly(vcf_parsed,
                  out,
                  sample_name,
                  bam,
                  raw_reads,
                  reads_dir,
                  read_type="sv"):
    """Prepare reads for local assembly"""
    # logging.info("Prepare reads for local assembly")

    if read_type == "sv":
        # extract read IDs
        read_ids = os.path.join(out, sample_name + ".id")
        with open(vcf_parsed, "r") as input, open(read_ids, "w") as output:
            for line in input:
                entry = line.replace("\n", "").split("\t")
                read_list = entry[8].split(",")
                for read in read_list:
                    output.write(read + "\n")
    else:
        window = 1000
        samfile = pysam.AlignmentFile(bam, "rb")
        read_ids = os.path.join(out, sample_name + ".id")
        vcf_parsed_new = vcf_parsed + ".new"
        with open(vcf_parsed,
                  "r") as input, open(read_ids, "w") as output, open(
                      vcf_parsed_new, "w") as VCF:
            for line in input:
                entry = line.replace("\n", "").split("\t")

                # get sniffles read list
                read_list = entry[8].split(",")
                reads_sniffles = set(read_list)

                ins_chr = entry[0]
                ins_breakpoint = round((int(entry[1]) + int(entry[2])) / 2)
                start = max(0, ins_breakpoint - window)
                end = ins_breakpoint + window

                reads = set()
                # coverage = 0
                for read in samfile.fetch(ins_chr, start, end):
                    reads.add(read.query_name)
                for read in reads:
                    output.write(read + "\n")

                # append the local read count and write out
                out_line = line.replace("\n", "") + "\t" + str(len(reads))
                VCF.write(out_line + "\n")

        # use the augmented file for the steps below
        vcf_parsed = vcf_parsed_new

    # generate unique ID list
    read_ids_unique = read_ids + ".unique"
    command = "cat " + read_ids + " | sort | uniq"
    with open(read_ids_unique, "w") as output:
        subprocess.call(command, stdout=output, shell=True)

    # filter raw reads using read list
    subset_fa = os.path.join(out, sample_name + ".subset.fa")
    command = "seqtk subseq " + raw_reads + " " + read_ids_unique + " | seqtk seq -a"
    with open(subset_fa, "w") as output:
        subprocess.call(command, stdout=output, shell=True)

    # reorder reads
    subset_fa_reorder = out + "/" + sample_name + ".subset.reorder.fa"
    extract_reads(subset_fa, read_ids, subset_fa_reorder)

    # separate reads into multiple files, using csplit
    mkdir(reads_dir)
    csplit_prefix = reads_dir + "/contig"
    m = []
    k = 1
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            if read_type == "sv":
                k = k + 2 * (len(entry[8].split(",")))
            else:
                k = k + 2 * int(entry[14])
            m.append(k)
    if len(m) == 1:
        subprocess.call(["cp", subset_fa_reorder, reads_dir + "/contig0"])
    elif len(m) == 0:
        print("No insertion detected, exiting...")
        sys.exit(0)
    else:
        m = m[:-1]
        index = " ".join(str(i) for i in m)
        command = ("csplit -s -f " + csplit_prefix + " -n 1 " +
                   subset_fa_reorder + " " + index)
        subprocess.call(command, shell=True)

    # remove tmp files
    os.remove(read_ids)
    os.remove(read_ids_unique)
    os.remove(subset_fa)
    os.remove(subset_fa_reorder)
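
Note: the non-"sv" branch above gathers every read aligned within a fixed window of the insertion midpoint. The same idea in isolation, as a sketch built on pysam's `fetch` API; the BAM path and coordinates in the usage line are hypothetical.

import pysam

def reads_near(bam_path, chrom, breakpoint, window=1000):
    """Sketch: names of reads aligned within `window` bp of a breakpoint."""
    start = max(0, breakpoint - window)
    names = set()
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        for read in bam.fetch(chrom, start, breakpoint + window):
            names.add(read.query_name)
    return names

# hypothetical usage:
# support = reads_near("sample_sort.bam", "2L", 1533201)
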
Example #6
def filter_vcf(ins, ins_filtered, te_library, out, sample_name, thread,
               loci_eval):
    """
    Filter insertion sequences from Sniffles VCF by repeatmasking with TE concensus
    """
    # construct FASTA from the parsed VCF file
    ins_seqs = os.path.join(out, sample_name + ".vcf_ins.fasta")
    write_ins_seqs(ins, ins_seqs)

    # run RepeatMasker on the insertion sequences
    repeatmasker_dir = os.path.join(out, "vcf_ins_repeatmask")
    mkdir(repeatmasker_dir)
    try:
        subprocess.call([
            "RepeatMasker",
            "-dir",
            repeatmasker_dir,
            "-gff",
            "-s",
            "-nolow",
            "-no_is",
            "-xsmall",
            "-e",
            "ncbi",
            "-lib",
            te_library,
            "-pa",
            str(thread),
            ins_seqs,
        ])
        ins_repeatmasked = os.path.join(
            repeatmasker_dir,
            os.path.basename(ins_seqs) + ".out.gff")
        open(ins_repeatmasked, "r")
    except Exception as e:
        print(e)
        print("Repeatmasking VCF insertion sequences failed, exiting...")
        sys.exit(1)

    # extract VCF sequences that contain TEs
    with open(ins_repeatmasked, "r") as input:
        ins_te_loci = {
            line.replace("\n", "").split("\t")[0]
            for line in input if "RepeatMasker" in line
        }

    with open(ins, "r") as input, open(ins_filtered, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            # TODO: maybe add filter for insertion sequences covered by TE?
            if contig_name in ins_te_loci:
                output.write(line)
    os.remove(ins_seqs)

    # report removed loci
    with open(loci_eval, "a") as output:
        for locus in create_loci_set(ins):
            if locus not in ins_te_loci:
                output.write(
                    "\t".join([locus, "VCF sequence not repeatmasked"]) + "\n")
Example #7
def annotate_contig(contig_dir, te_library, vcf_parsed, out, sample_name,
                    thread, presets, loci_eval):
    logging.info("Annotate contigs...")
    if presets == "ont":
        presets = "map-ont"
    else:
        presets = "map-pb"

    all_loci = create_loci_set(vcf_parsed)
    assembly_passed_loci = set()
    merge_contigs = os.path.join(out, sample_name + ".contigs.fa")
    with open(merge_contigs, "w") as output:
        for locus in all_loci:
            assembly = os.path.join(contig_dir, locus + ".cns.fa")
            if os.path.isfile(assembly) and os.stat(assembly).st_size > 0:
                assembly_passed_loci.add(locus)
                with open(assembly, "r") as handle:
                    records = SeqIO.parse(handle, "fasta")
                    for record in records:
                        if record.id == "ctg1":
                            record.id = locus
                            record.description = "len=" + str(len(record.seq))
                            SeqIO.write(record, output, "fasta")

    # report assembly failed loci
    with open(loci_eval, "a") as output:
        for locus in all_loci:
            if locus not in assembly_passed_loci:
                output.write("\t".join([locus, "Contig assembly failed"]) +
                             "\n")

    # map sequence to contigs
    seq2contig_out = os.path.join(out, "seq2contig.paf")
    if os.path.isfile(seq2contig_out):
        os.remove(seq2contig_out)

    # TODO: consider that some contigs might not exist
    seq2contig_passed_loci = set()
    seq2contig_dir = os.path.join(out, "seq2contig")
    seq2contig = os.path.join(out, "seq2contig.paf")
    mkdir(seq2contig_dir)
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            if contig_name in assembly_passed_loci:
                vcf_seq = entry[7]
                query = os.path.join(seq2contig_dir, contig_name + ".seq.fa")
                create_fa(contig_name, vcf_seq, query)
                subject = os.path.join(seq2contig_dir,
                                       contig_name + ".contig.fa")
                with open(subject, "w") as output:
                    try:
                        # check_call raises on failure; plain call() would never
                        # trigger the CalledProcessError handler below
                        subprocess.check_call(
                            ["samtools", "faidx", merge_contigs, contig_name],
                            stdout=output,
                        )
                    except subprocess.CalledProcessError:
                        print(contig_name + ":contig assembly doesn't exist")
                        continue
                seq2contig_output = subprocess.check_output([
                    "minimap2",
                    "-cx",
                    presets,
                    "--secondary=no",
                    "-v",
                    "0",
                    subject,
                    query,
                ])
                seq2contig_output = seq2contig_output.decode("utf-8")
                if seq2contig_output != "":
                    seq2contig_passed_loci.add(contig_name)
                    with open(seq2contig, "a") as output:
                        output.write(seq2contig_output)
                os.remove(query)
                os.remove(subject)
    os.rmdir(seq2contig_dir)
    # convert to BED format
    seq2contig_bed = os.path.join(out, "seq2contig.bed")
    with open(seq2contig, "r") as input, open(seq2contig_bed, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            bed_line = "\t".join(
                [entry[0], entry[7], entry[8], entry[5], entry[11], entry[4]])
            output.write(bed_line + "\n")

    # report ins-contig failed loci
    with open(loci_eval, "a") as output:
        for locus in assembly_passed_loci:
            if locus not in seq2contig_passed_loci:
                output.write("\t".join([
                    locus,
                    "Sniffles VCF sequence not mapped to assembled contig"
                ]) + "\n")

    # map TE library to contigs using minimap2
    # TE-contig alignment
    te2contig_out = os.path.join(out, sample_name + ".te2contig.paf")
    if os.path.isfile(te2contig_out):
        os.remove(te2contig_out)
    for locus in seq2contig_passed_loci:
        contig_fa = os.path.join(out, locus + ".fa")
        with open(contig_fa, "w") as output:
            subprocess.call(["samtools", "faidx", merge_contigs, locus],
                            stdout=output)
        # map TE library to contig using minimap2
        with open(te2contig_out, "a") as output:
            subprocess.call(
                [
                    "minimap2",
                    "-cx",
                    presets,
                    contig_fa,
                    te_library,
                    "-v",
                    "0",
                    "-t",
                    str(thread),
                ],
                stdout=output,
            )
        os.remove(contig_fa)
    # convert to bed format
    te2contig_bed = os.path.join(out, sample_name + ".te2contig.bed")
    with open(te2contig_out, "r") as input, open(te2contig_bed, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            bed_line = "\t".join(
                [entry[5], entry[7], entry[8], entry[0], entry[11], entry[4]])
            output.write(bed_line + "\n")

    # Use VCF sequence alignment to filter minimap2 TE-contig alignment
    te2contig_filter_raw = os.path.join(out,
                                        sample_name + ".te2contig_filter.tsv")
    with open(te2contig_filter_raw, "w") as output:
        subprocess.call(
            [
                "bedtools",
                "intersect",
                "-a",
                te2contig_bed,
                "-b",
                seq2contig_bed,
                "-wao",
            ],
            stdout=output,
        )

    # filter and merge
    # get rid of -1 and make it into bed format
    te2contig_filter_tmp_bed = os.path.join(
        out, sample_name + ".te2contig_filter.tmp.bed")
    with open(te2contig_filter_raw,
              "r") as input, open(te2contig_filter_tmp_bed, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            # the overlap between VCF sequence alignment and TE-contig alignment has to be over 10bp
            if int(entry[12]) > 10:
                out_line = "\t".join([
                    entry[0], entry[1], entry[2], entry[3], entry[4], entry[5]
                ])
                output.write(out_line + "\n")
    # sort
    te2contig_filter_tmp_sort_bed = (out + "/" + sample_name +
                                     ".te2contig_filter.tmp.sort.bed")
    command = "bedtools sort -i " + te2contig_filter_tmp_bed
    with open(te2contig_filter_tmp_sort_bed, "w") as output:
        subprocess.call(command, shell=True, stdout=output)

    # find out what's filtered out
    seq_mm2_overlap_loci = set()
    with open(te2contig_filter_tmp_sort_bed, "r") as input:
        for line in input:
            seq_mm2_overlap_loci.add(line.split("\t")[0])
    # seq_mm2_overlap_loci = create_loci_set(te2contig_filter_tmp_sort_bed)
    with open(loci_eval, "a") as output:
        for locus in seq2contig_passed_loci:
            if locus not in seq_mm2_overlap_loci:
                output.write("\t".join([
                    locus, "VCF sequence doesn't overlap contig annotation"
                ]) + "\n")

    # merge
    contig_te_annotation = out + "/" + sample_name + ".te2contig_filter.bed"
    command = (
        'bedtools merge -d 10000 -c 4,6 -o distinct,distinct -delim "|" -i ' +
        te2contig_filter_tmp_sort_bed)
    with open(contig_te_annotation, "w") as output:
        subprocess.call(command, shell=True, stdout=output)

    # seq_mm2_overlap_merge_loci = create_loci_set(contig_te_annotation)

    # remove tmp files
    os.remove(seq2contig)
    os.remove(te2contig_bed)
    os.remove(te2contig_out)
    os.remove(seq2contig_bed)
    os.remove(te2contig_filter_raw)
    os.remove(te2contig_filter_tmp_bed)
    os.remove(te2contig_filter_tmp_sort_bed)

    # extract sequence and RM
    te_fa = out + "/" + sample_name + ".te.fa"
    with open(te_fa, "w") as output:
        subprocess.call(
            [
                "bedtools",
                "getfasta",
                "-fi",
                merge_contigs,
                "-bed",
                contig_te_annotation,
            ],
            stdout=output,
        )
    repeatmasker_dir = os.path.join(out, "contig_te_repeatmask")
    mkdir(repeatmasker_dir)
    try:
        subprocess.call([
            "RepeatMasker",
            "-dir",
            repeatmasker_dir,
            "-gff",
            "-s",
            "-nolow",
            "-no_is",
            "-xsmall",
            "-e",
            "ncbi",
            "-lib",
            te_library,
            "-pa",
            str(thread),
            te_fa,
        ])
        contig_te_repeatmasked = os.path.join(
            repeatmasker_dir,
            os.path.basename(te_fa) + ".out.gff")
        open(contig_te_repeatmasked, "r")
    except Exception as e:
        print(e)
        print("Repeatmasking contig TE sequences failed, exiting...")
        sys.exit(1)

    # parse the RepeatMasker GFF and merge overlapping annotations
    te2contig_rm = out + "/" + sample_name + ".te2contig_rm.bed"
    with open(contig_te_repeatmasked, "r") as input, open(te2contig_rm,
                                                          "w") as output:
        for line in input:
            if "##" not in line:
                entry = line.replace("\n", "").split("\t")
                contig_name = entry[0].rsplit(":", 1)[0]
                start = entry[0].rsplit(":", 1)[1].split("-")[0]
                end = entry[0].rsplit(":", 1)[1].split("-")[1]
                # contigs = entry[0].replace(':', '-').split("-")
                family = re.sub('Target "Motif:|".*', "", entry[8])
                strand = entry[6]
                score = entry[5]
                out_line = "\t".join(
                    [contig_name, start, end, family, score, strand])
                output.write(out_line + "\n")
    print("Done\n")

    contig_rm_annotation = out + "/" + sample_name + ".te2contig_rm.merge.bed"
    command = 'bedtools merge -c 4,6 -o distinct -delim "|" -i ' + te2contig_rm
    with open(contig_rm_annotation, "w") as output:
        subprocess.call(command, shell=True, stdout=output)
    os.remove(te2contig_rm)

    # seq_mm2_overlap_merge_rm_loci = create_loci_set(te2contig_rm_merge)
    # with open(loci_eval, "a") as output:
    #     for locus in seq_mm2_overlap_merge_loci:
    #         if locus not in seq_mm2_overlap_merge_rm_loci:
    #             print(locus, "contig seq RM failed")

    # build frequency dict
    te_freq = dict()
    with open(vcf_parsed, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            freq = entry[5]
            te_freq[contig_name] = freq

    return contig_te_annotation, contig_rm_annotation, te_freq, te_fa, merge_contigs
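
Note: two more helpers used throughout these examples, `mkdir` and `create_fa`, are not shown. Minimal sketches consistent with how they are called (a directory created if missing; a single-record FASTA written from a name and a sequence):

import os

def mkdir(path):
    """Sketch: create a directory, tolerating one that already exists."""
    os.makedirs(path, exist_ok=True)

def create_fa(header, seq, out_fa):
    """Sketch: write one FASTA record to `out_fa`."""
    with open(out_fa, "w") as output:
        output.write(">" + header + "\n" + seq + "\n")
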
Example #8
def get_args():
    parser = argparse.ArgumentParser(
        description="Script to detect TEs in long read data")
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")

    # required
    required.add_argument(
        "-i",
        "--reads",
        type=str,
        help="reads in fasta/fastq format or read alignments in bam format",
        required=True,
    )
    required.add_argument(
        "-r",
        "--reference",
        type=str,
        help="reference genome in fasta format",
        required=True,
    )
    required.add_argument(
        "-l",
        "--library",
        type=str,
        help="TE consensus sequences in fasta format",
        required=True,
    )

    # optional
    optional.add_argument(
        "-x",
        "--presets",
        type=str,
        help=
        "parameter presets for different sequencing technologies (default = 'pacbio')",
        required=False,
    )
    optional.add_argument(
        "-p",
        "--polish",
        type=int,
        help="rounds of contig polishing (default = 1)",
        required=False,
    )
    optional.add_argument(
        "-o",
        "--out",
        type=str,
        help="directory to output data (default = '.')",
        required=False,
    )
    optional.add_argument(
        "-t",
        "--thread",
        type=int,
        help="max cpu threads to use (default = '1')",
        required=False,
    )
    optional.add_argument(
        "-g",
        "--gap",
        type=int,
        help="max gap size for flanking sequence alignment (default = '20')",
        required=False,
    )
    optional.add_argument(
        "-v",
        "--overlap",
        type=int,
        help=
        "max overlap size for flanking sequence alignment (default = '20')",
        required=False,
    )
    optional.add_argument(
        "-k",
        "--keep_files",
        action='store_true',
        help=
        "If provided then all intermediate files will be kept (default: remove intermediate files)",
        required=False,
    )
    parser._action_groups.append(optional)
    args = parser.parse_args()

    # check that the input files exist and are readable
    try:
        with open(args.reads, "r"):
            pass
    except Exception as e:
        print(e)
        logging.exception("Can not open input file: " + args.reads)
        sys.exit(1)

    try:
        with open(args.reference, "r"):
            pass
    except Exception as e:
        print(e)
        logging.exception("Can not open input file: " + args.reference)
        sys.exit(1)

    try:
        with open(args.library, "r"):
            pass
    except Exception as e:
        print(e)
        logging.exception("Can not open input file: " + args.library)
        sys.exit(1)

    if args.presets is None:
        args.presets = "pacbio"

    # sets up out dir variable
    if args.out is None:
        args.out = "."
    args.out = os.path.abspath(args.out)
    mkdir(args.out)

    if args.thread is None:
        args.thread = 1

    if args.polish is None:
        args.polish = 1

    if args.gap is None:
        args.gap = 20

    if args.overlap is None:
        args.overlap = 20

    return args
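
Note: the three existence checks above are identical except for the path; a factored sketch (not TELR's code) that the script could call instead:

import logging
import sys

def check_input(path):
    """Sketch: exit with a logged error if `path` cannot be opened."""
    try:
        with open(path, "r"):
            pass
    except Exception as e:
        print(e)
        logging.exception("Can not open input file: " + path)
        sys.exit(1)

# for path in (args.reads, args.reference, args.library):
#     check_input(path)
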
Example #9
def filter_vcf(ins, ins_filtered, te_library, out, sample_name, thread, loci_eval):
    """
    Filter insertion sequences from Sniffles VCF by repeatmasking with the TE consensus library
    """
    # construct FASTA from the parsed VCF file
    ins_seqs = os.path.join(out, sample_name + ".vcf_ins.fasta")
    write_ins_seqs(ins, ins_seqs)

    # get the length of each insertion sequence (TODO: this can be generalized)
    contig_len = dict()
    if os.path.isfile(ins_seqs):
        with open(ins_seqs, "r") as handle:
            records = SeqIO.parse(handle, "fasta")
            for record in records:
                contig_len[record.id] = len(record.seq)

    # run RepeatMasker on the insertion sequences
    repeatmasker_dir = os.path.join(out, "vcf_ins_repeatmask")
    mkdir(repeatmasker_dir)
    try:
        subprocess.call(
            [
                "RepeatMasker",
                "-dir",
                repeatmasker_dir,
                "-gff",
                "-s",
                "-nolow",
                "-no_is",
                "-xsmall",
                "-e",
                "ncbi",
                "-lib",
                te_library,
                "-pa",
                str(thread),
                ins_seqs,
            ]
        )
        ins_repeatmasked = os.path.join(
            repeatmasker_dir, os.path.basename(ins_seqs) + ".out.gff"
        )
        open(ins_repeatmasked, "r")
    except Exception as e:
        print(e)
        print("Repeatmasking VCF insertion sequences failed, exiting...")
        sys.exit(1)

    # merge RM gff
    ins_rm_merge = os.path.join(
        repeatmasker_dir, os.path.basename(ins_seqs) + ".out.merge.bed"
    )
    with open(ins_rm_merge, "w") as output:
        subprocess.call(["bedtools", "merge", "-i", ins_repeatmasked], stdout=output)

    # extract VCF sequences that contain TEs
    ins_te_loci = dict()
    with open(ins_rm_merge, "r") as input:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = entry[0]
            length = int(entry[2]) - int(entry[1])
            ins_te_prop = round(length / contig_len[contig_name], 2)
            if contig_name in ins_te_loci:
                ins_te_loci[contig_name] = ins_te_loci[contig_name] + ins_te_prop
            else:
                ins_te_loci[contig_name] = ins_te_prop

    with open(ins, "r") as input, open(ins_filtered, "w") as output:
        for line in input:
            entry = line.replace("\n", "").split("\t")
            contig_name = "_".join([entry[0], entry[1], entry[2]])
            # TODO: maybe add filter for insertion sequences covered by TE?
            if contig_name in ins_te_loci:
                out_line = line.replace("\n", "") + "\t" + str(ins_te_loci[contig_name])
                output.write(out_line + "\n")
    # os.remove(ins_seqs)

    # report removed loci
    with open(loci_eval, "a") as output:
        for locus in create_loci_set(ins):
            if locus not in ins_te_loci:
                output.write("\t".join([locus, "VCF sequence not repeatmasked"]) + "\n")