Beispiel #1
0
def generate_surv_params(param_file):
    """
    Invoke ``SURVIVOR simSV`` with ``param_file`` as its only argument.

    The captured stderr and stdout of the command are logged at debug level.
    If the command returns a non-zero code, the failure and its stderr are
    logged as errors and the interpreter exits with that same code.

    :param param_file: path handed to ``SURVIVOR simSV``
    """
    logging.debug("Running SURVIVOR")
    result = cmd_exe(f"SURVIVOR simSV {param_file}")
    for captured in (result.stderr, result.stdout):
        logging.debug(captured)

    # Success: nothing more to do.
    if result.ret_code == 0:
        return

    logging.error("Problem running SURVIVOR")
    logging.error(result.stderr)
    exit(result.ret_code)
Beispiel #2
0
def sim_reads_art(workdir, coverage=30, readlen=150, meanfrag=400, insertsd=50, instrument="HS25"):
    """
    Run art_illumina read simulator

    Changes the process working directory to ``workdir`` and runs
    ``art_illumina`` in paired-end mode (``-p``) against
    ``svteaser.altered.fa`` in that directory, writing outputs with the
    ``art_illumina.simReads`` prefix.

    :param workdir: directory to change into before running the simulator
    :param coverage: value passed to ``-f``
    :param readlen: value passed to ``-l``
    :param meanfrag: value passed to ``-m``
    :param insertsd: value passed to ``-s``
    :param instrument: value passed to ``-ss``

    Exits the interpreter if the executable cannot be found, if ``workdir``
    cannot be entered, or if art_illumina returns a non-zero code.
    """
    ret = cmd_exe("which art_illumina")
    if ret.ret_code != 0:
        logging.error("Cannot find art_illumina executable in the environment")
        # The result attribute is `ret_code` (as used everywhere else);
        # `retcode` raised AttributeError instead of exiting cleanly.
        exit(ret.ret_code)
    try:
        # NOTE: this changes the working directory for the whole process and
        # it is not restored afterwards.
        os.chdir(workdir)
    except OSError:
        logging.error(f"Cannot change into {workdir} directory")
        exit(1)
    alt_ref = 'svteaser.altered.fa'
    ret = cmd_exe((f"art_illumina -ss {instrument} -sam -na -i {alt_ref} -p "
                   f"-l {readlen} -m {meanfrag} -s {insertsd} -f {coverage} -o art_illumina.simReads"))
    if ret.ret_code != 0:
        logging.error("Problem running art_illumina")
        logging.error(ret.stderr)
        logging.error(ret.stdout)
        exit(ret.ret_code)
Beispiel #3
0
    def run_trf(self, altseqs, refseqs=None):
        """
        Runs trf on the ref/alt sequences
        returns {'a': [althitsdict,..], 'r':[refhitsdict]}

        Every sequence in ``altseqs`` is written to ``TRFAnno.FANAME`` under
        the FASTA name ``a`` and every sequence in ``refseqs`` (when given)
        under the name ``r``. ``self.cmd`` is then executed and
        ``TRFAnno.TRNAME`` is parsed into a dict of hit lists.

        :param altseqs: iterable of alternate sequences
        :param refseqs: optional iterable of reference sequences; ``None``
                        (the default) writes no reference entries

        Exits the interpreter with the command's return code if ``self.cmd``
        fails.
        """
        def parse_output():
            """
            Parse the outputs from trf, turn to a dictionary

            Lines starting with '@' set the current sequence name; every
            other line is a space-separated hit record attached to that name.
            """
            hits = defaultdict(list)
            with open(TRFAnno.TRNAME, 'r') as fh:
                name = fh.readline()
                if name == "":  # no hits
                    return hits
                name = name.strip()[1:]
                # If there are multiple, need parameters for 'take best' or take top N or something
                # Will need name now that there's ref/alt seq
                for line in fh:
                    if line.startswith("@"):
                        name = line.strip()[1:]
                        continue
                    # Pair each column definition with its value; columns
                    # whose name starts with "unk" are dropped.
                    data = {
                        col[0]: value
                        for col, value in zip(TRFAnno.TRFCOLS, line.strip().split(' '))
                        if not col[0].startswith("unk")
                    }
                    # don't really need until parallel
                    data["TRF_scores"] = int(data["TRF_scores"])
                    hits[name].append(data)
            return hits

        with open(TRFAnno.FANAME, 'w') as fout:
            for seq in altseqs:
                fout.write(">a\n%s\n" % (seq))
            # refseqs defaults to None; iterating it unguarded raised TypeError.
            if refseqs is not None:
                for seq in refseqs:
                    fout.write(">r\n%s\n" % (seq))

        ret = cmd_exe(self.cmd)
        if ret.ret_code != 0:
            logging.error("Couldn't run trf")
            logging.error(str(ret))
            exit(ret.ret_code)
        return parse_output()
Beispiel #4
0
def pcmd_exe(cmd):
    """
    Run ``cmd`` through cmd_exe with ``set -o pipefail; `` prepended.

    :param cmd: shell command string
    :return: whatever cmd_exe returns for the prefixed command
    """
    pipefail_prefix = "set -o pipefail; "
    guarded_cmd = pipefail_prefix + cmd
    return cmd_exe(guarded_cmd)
Beispiel #5
0
def vcf_compress(fn):
    """
    Run vcftools to sort/compress/index a vcf file

    Pipes ``fn`` through ``vcf-sort`` and ``bgzip`` into ``fn``.gz and then
    indexes the result with ``tabix``.

    :param fn: path to the VCF file to sort, compress and index

    Exits the interpreter with the command's return code if it is non-zero.
    """
    ret = cmd_exe(f"vcf-sort {fn} | bgzip > {fn}.gz && tabix {fn}.gz")
    # The result used to be discarded, so a failed compression/indexing went
    # unnoticed. Handle it the same way the other command wrappers do.
    # NOTE(review): a shell pipeline reports the status of its last command,
    # so a failing vcf-sort may still go undetected unless pipefail is set.
    if ret.ret_code != 0:
        logging.error("Problem compressing/indexing %s", fn)
        logging.error(ret.stderr)
        exit(ret.ret_code)
Beispiel #6
0
def find_survivor():
    """
    Check that SURVIVOR can be run by executing ``SURVIVOR -h``.

    If the command returns a non-zero code, an error is logged and the
    interpreter exits with that code; otherwise the function returns None.
    """
    probe = cmd_exe("SURVIVOR -h")
    if probe.ret_code == 0:
        return
    logging.error("Cannot find SURVIVOR in environment")
    exit(probe.ret_code)
Beispiel #7
0
def process_regions(ref_file, regions, out_dir, param_file):
    """
    Simulate SVs with SURVIVOR in each region and merge the results.

    For every ``(chrom, start, end)`` in ``regions`` the reference sequence
    is fetched, trimmed by a fixed padding on both ends, and handed to
    ``SURVIVOR simSV`` inside a temporary directory. The simulated sequence
    gets the padded tails added back and is appended to
    ``svteaser.altered.fa``; the untouched reference sequence is appended to
    ``svteaser.ref.fa``; the updated variants are appended to
    ``svteaser.sim.vcf``, which is finally passed to ``vcf_compress``.

    :param ref_file: path to the reference FASTA opened with pysam
    :param regions: sized sequence of ``(chrom, start, end)`` tuples
    :param out_dir: directory receiving the merged outputs; a ``temp``
                    subdirectory is created and removed per region
    :param param_file: SURVIVOR parameter file passed to ``simSV``

    Exits the interpreter with SURVIVOR's return code if it fails.
    """
    # Function-scope import (previously re-executed on every loop iteration).
    import shutil

    out_vcf_path = os.path.join(out_dir, "svteaser.sim.vcf")
    out_ref_fa_path = os.path.join(out_dir, "svteaser.ref.fa")
    out_altered_fa_path = os.path.join(out_dir, "svteaser.altered.fa")
    temp_dir = os.path.join(out_dir, "temp")

    # Define padding in reference region where SVs are not to be inserted.
    padding = 800

    # Created lazily: the header comes from the first region's VCF.
    out_vcf_fh = None
    try:
        # Context managers guarantee the handles are closed even when a
        # region fails part-way through.
        with open(out_ref_fa_path, "w+") as out_ref_fa_fh, \
                open(out_altered_fa_path, "w+") as out_altered_fa_fh, \
                pysam.FastaFile(ref_file) as ref:
            for i, (chrom, start, end) in enumerate(regions):
                # Track status.
                if (i + 1) % 50 == 0:
                    logging.info("Processed %d/%d regions...", i + 1, len(regions))

                os.mkdir(temp_dir)
                try:
                    # Extract ref sequence.
                    name = f"{chrom}_{start}_{end}"
                    ref_seq = ref.fetch(chrom, start, end)

                    # Remove some buffer from beginning and ending,
                    # so that the tails do not contain SVs. These will be
                    # added back later on.
                    ref_seq_surv = ref_seq[padding:len(ref_seq) - padding]
                    temp_ref_fa = os.path.join(temp_dir, "temp_ref.fa")
                    with open(temp_ref_fa, "w") as fh:
                        add_fasta_entry(name, ref_seq_surv, fh)

                    # Run SURVIVOR.
                    prefix = os.path.join(temp_dir, "simulated")
                    survivor_cmd = " ".join(["SURVIVOR",
                                             "simSV",
                                             temp_ref_fa,
                                             param_file,
                                             "0.0",
                                             "0",
                                             prefix])
                    ret = cmd_exe(survivor_cmd)
                    if ret.ret_code != 0:
                        logging.error("Problem running SURVIVOR")
                        logging.error(ret.stderr)
                        exit(ret.ret_code)

                    # Read output of SURVIVOR
                    altered_fa_path = f"{prefix}.fasta"
                    insertions_fa_path = f"{prefix}.insertions.fa"
                    sim_vcf = f"{prefix}.vcf"

                    # Update VCF
                    temp_vcf = os.path.join(temp_dir, "temp.vcf")
                    update_vcf(temp_ref_fa, insertions_fa_path, sim_vcf, temp_vcf,
                               pos_padding=padding)

                    # Merge seqs and variants entries into single FA/VCF
                    # files. Add the initial and last `padding` bp back to
                    # the altered fasta.
                    with pysam.FastaFile(altered_fa_path) as altered_fa:
                        altered_seq = altered_fa.fetch(name)
                    altered_seq = update_altered_fa(ref_seq, altered_seq, padding)
                    add_fasta_entry(name, altered_seq, out_altered_fa_fh)

                    add_fasta_entry(name, ref_seq, out_ref_fa_fh)

                    with pysam.VariantFile(temp_vcf) as vcf_reader:
                        if out_vcf_fh is None:
                            out_vcf_fh = pysam.VariantFile(out_vcf_path, 'w',
                                                           header=vcf_reader.header)
                        for record in vcf_reader:
                            out_vcf_fh.write(record)
                finally:
                    # Always remove temporary files; a leftover directory
                    # would make the next os.mkdir fail.
                    shutil.rmtree(temp_dir)
    finally:
        if out_vcf_fh is not None:
            out_vcf_fh.close()

    if out_vcf_fh is None:
        # No region was processed, so no VCF exists to compress.
        logging.warning("No regions processed; %s was not written", out_vcf_path)
        return
    vcf_compress(out_vcf_path)