Esempio n. 1
0
def main() -> None:
    """Run the AF4 benchmark once for every sample in util.TITRATION_SAMPLES.

    Command-line flags:
      --cache_dir   local directory that input files are cached into.
      --result_dir  parent directory; each sample writes to
                    <result_dir>/<sample.name>.

    A sample is skipped when its result directory already contains
    filtered.fa. After each run, files in the cache directory whose name
    matches *rerun* are removed on a best-effort basis.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s: %(message)s")
    p = argparse.ArgumentParser()
    p.add_argument('--cache_dir',
                   default=util.DEFAULT_CACHE_DIR,
                   help='Benchmark cache dir')
    p.add_argument('--result_dir',
                   default=util.DEFAULT_RESULT_DIR,
                   help='Benchmark result dir')
    args = p.parse_args()

    util.s3_cache_files([
        util.REFERENCE_DIR + '/gencode.v26.whole_genes.fa',
        util.REFERENCE_DIR + '/all_pair_art_lod_gpair_merged.txt'
    ], args.cache_dir)

    # Local copies of the two reference files cached above; they do not
    # depend on the sample, so compute them once.
    cached_ref = args.cache_dir + '/gencode.v26.whole_genes.fa'
    cached_cosmic_fusion = args.cache_dir + '/all_pair_art_lod_gpair_merged.txt'

    for sample in util.TITRATION_SAMPLES:
        logging.info('Start benchmark %s', sample.name)
        result_dir = args.result_dir + '/' + sample.name
        try:
            # exist_ok: a rerun must not report an error just because the
            # directory is already there.
            os.makedirs(result_dir, 0o755, exist_ok=True)
        except OSError as e:
            # Best effort: keep going, the run itself will fail loudly if
            # the directory is really unusable.
            logging.error("mkdir %s failed: %s", result_dir, e)
        if os.path.exists(result_dir + "/filtered.fa"):
            logging.info("Skip %s", result_dir)
            continue
        util.s3_cache_files(util.expand_fastq_files(sample.paths),
                            args.cache_dir)
        cached_r1 = ",".join(args.cache_dir + '/' + os.path.basename(fq.r1)
                             for fq in sample.paths)
        cached_r2 = ",".join(args.cache_dir + '/' + os.path.basename(fq.r2)
                             for fq in sample.paths)

        af4_args = [
            str(util.af4_path()),
            f'-log_dir={result_dir}',
            '-pprof=:12345',
            '-mutex-profile-rate=1000',
            '-block-profile-rate=1000',
            f'-r1={cached_r1}',
            f'-r2={cached_r2}',
            f'-fasta-output={result_dir}/all.fa',
            f'-filtered-output={result_dir}/filtered.fa',
            f'-transcript={cached_ref}',
            '-max-genes-per-kmer=2',
            '-max-proximity-distance=1000',
            '-max-proximity-genes=5',
            '-unstranded-prep',
            f'-cosmic-fusion={cached_cosmic_fusion}',
        ]
        util.check_call(af4_args)
        # The previous format string had two placeholders (%d and %s) for a
        # single argument, so the record could not be formatted.
        logging.info('Finished benchmark: %s', sample.name)
        logging.info("Runtime stats: %s", util.run_stats(Path(result_dir)))
        for path in glob.glob(f'{args.cache_dir}/*rerun*'):
            try:
                os.remove(path)
            except OSError as e:
                logging.error("failed to remove %s: %s", path, e)
Esempio n. 2
0
def main() -> None:
    """Benchmark AF4 and/or STAR-Fusion on every sample in util.RNA_SAMPLES.

    The FASTQ files of each sample are cached into --cache_dir, then the
    systems selected with --run (both when the flag is absent) are invoked
    with the locally cached read pairs.
    """
    logging.basicConfig(format="%(asctime)s:%(levelname)s: %(message)s",
                        level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--cache_dir',
                        help='Benchmark cache dir',
                        default=util.DEFAULT_CACHE_DIR)
    parser.add_argument('--result_dir',
                        help='Benchmark result dir',
                        default=util.DEFAULT_RESULT_DIR)
    parser.add_argument(
        '--starfusion_data_dir',
        help='Directory for expanding starfusion plug-n-play files',
        default='/scratch-nvme/starfusion')
    parser.add_argument(
        '--run',
        choices=['af4', 'starfusion'],
        action='append',
        help='List of systems to run. If unset, run all the configured systems')

    # Both STAR-Fusion tarballs default to a location under $HOME.
    home_dir = os.environ['HOME']
    parser.add_argument(
        '--starfusion_plug_n_play_targz',
        help='Tar.gz file of starfusion plug-n-play file. https://github.com/STAR-Fusion/STAR-Fusion/wiki#data-resources-required',
        default=home_dir + '/GRCh38_v27_CTAT_lib_Feb092018.plug-n-play.tar.gz')
    parser.add_argument(
        '--starfusion_targz',
        help='Tar.gz file of starfusion source package. https://github.com/STAR-Fusion/STAR-Fusion/wiki#data-resources-required',
        default=home_dir + '/STAR-Fusion-v1.5.0.FULL.tar.gz')

    args = parser.parse_args()
    # No --run given means "run everything".
    args.run = args.run or ['af4', 'starfusion']

    for sample in util.RNA_SAMPLES:
        remote_fastqs: List[str] = []
        local_pairs: List[util.FASTQPair] = []
        for pair in sample.paths:
            # Sanity check on the sample table: R2 must be R1's mate.
            assert pair.r1.replace("R1", "R2") == pair.r2, pair.r2
            remote_fastqs.extend((pair.r1, pair.r2))
            local_r1 = '/'.join((args.cache_dir, os.path.basename(pair.r1)))
            local_r2 = '/'.join((args.cache_dir, os.path.basename(pair.r2)))
            local_pairs.append(util.FASTQPair(r1=local_r1, r2=local_r2))
        util.s3_cache_files(remote_fastqs, args.cache_dir)

        if 'af4' in args.run:
            run_af4(sample.name, local_pairs, args)
        if 'starfusion' in args.run:
            run_starfusion(sample.name, local_pairs, args)
Esempio n. 3
0
def run_af4(
    sample_name: str,
    cached_file_pairs: List[util.FASTQPair],
    cosmic_fusion_path: str,
    args: Any,
) -> None:
    """Run AF4 on one sample in "denovo" and then "targeted" mode.

    Args:
        sample_name: used (via its basename) to name the result directory,
            <args.result_dir>/<sample_name>-<mode>.
        cached_file_pairs: FASTQ pairs of the sample; only the basename of
            each r1/r2 is used and it is looked up under args.cache_dir.
        cosmic_fusion_path: source path of the fusion gene-pair list. It is
            cached into args.cache_dir and passed to AF4 in targeted mode.
        args: parsed command-line namespace; cache_dir and result_dir are
            read.

    A mode is skipped when its result directory already holds filtered.fa.
    """
    ref_path = "s3://grail-publications/resources/gencode.v26.whole_genes.fa"
    util.s3_cache_files([ref_path, cosmic_fusion_path], args.cache_dir)

    # Previously the targeted run always pointed at
    # all_pair_art_lod_gpair_merged.txt, silently ignoring the file the
    # caller asked for. Assumes s3_cache_files stores each file under its
    # basename in cache_dir, as the r1/r2 handling below also does.
    cached_cosmic_fusion = (args.cache_dir + "/" +
                            os.path.basename(cosmic_fusion_path))

    cached_r1 = ",".join(args.cache_dir + "/" + os.path.basename(fp.r1)
                         for fp in cached_file_pairs)
    cached_r2 = ",".join(args.cache_dir + "/" + os.path.basename(fp.r2)
                         for fp in cached_file_pairs)
    for mode in ["denovo", "targeted"]:
        result_dir = args.result_dir + "/" + os.path.basename(sample_name +
                                                              "-" + mode)
        if os.path.exists(result_dir + "/filtered.fa"):
            logging.info("Skipping benchmark: %s", result_dir)
            continue
        logging.info("Start af4 benchmark: %s", result_dir)
        try:
            # exist_ok: a directory left behind by an interrupted run is
            # not an error.
            os.makedirs(result_dir, 0o755, exist_ok=True)
        except OSError as e:
            # Best effort: AF4 itself will fail if the directory is unusable.
            logging.error("mkdir %s failed: %s", result_dir, e)
        af4_args = [
            str(util.af4_path()),
            f"-log_dir={result_dir}",
            "-pprof=:12345",
            "-mutex-profile-rate=1000",
            "-block-profile-rate=1000",
            f"-r1={cached_r1}",
            f"-r2={cached_r2}",
            "-max-genes-per-kmer=2",
            "-max-proximity-distance=1000",
            "-max-proximity-genes=5",
            f"-fasta-output={result_dir}/all.fa",
            f"-filtered-output={result_dir}/filtered.fa",
            # NOTE(review): this transcript file is not the ref_path cached
            # above and is not cached by this function - confirm it is
            # expected to be present in cache_dir already.
            f"-transcript={args.cache_dir}/gencode.v26.250padded_separate_jns_transcripts_parsed_no_mt_no_overlap_no_pary_no_versioned.fa",
        ]
        if mode == "targeted":
            af4_args.append(f"-cosmic-fusion={cached_cosmic_fusion}")
        util.check_call(af4_args)
        logging.info("Finished benchmark: %s", result_dir)
        logging.info("Runtime stats: %s", util.run_stats(Path(result_dir)))
Esempio n. 4
0
def main() -> None:
    """Score STAR-Fusion predictions against the liu_gpair.txt truth set.

    Walks --starfusion_dir, and for every file whose path contains
    "star-fusion.fusion_predictions.abridged.tsv" prints the path followed
    by the true-positive, false-negative and false-positive counts and the
    F1 score. The truth file is fetched into --cache_dir when it is not
    already there.
    """
    # The format previously named a non-existent field ("messge"), which
    # makes every log record fail to format.
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s: %(message)s")
    p = argparse.ArgumentParser()

    p.add_argument(
        "--starfusion_dir",
        default="/scratch-nvme/xyang/result/",
        help=
        "Starfusion result dir, which then contains individual sample result",
    )
    p.add_argument("--cache_dir",
                   default="/scratch-nvme/xyang/cache/",
                   help="cache dir")

    args = p.parse_args()
    local_truth_gpair = f"{args.cache_dir}/liu_gpair.txt"
    if not os.path.exists(local_truth_gpair):
        s3_cache_files([REFERENCE_DIR + "/liu_gpair.txt"], args.cache_dir)

    # Every file below starfusion_dir, as a full path.
    all_files = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(args.starfusion_dir)
        for filename in filenames
    ]

    # get all relevant starfusion result files
    filtered_results = [
        f for f in all_files
        if "star-fusion.fusion_predictions.abridged.tsv" in f
    ]

    # compare with truth set
    true_gpairs = read_fusion_pair(local_truth_gpair)
    npos = len(true_gpairs)

    for f in filtered_results:
        print(f)
        results = read_fusion_pair(f)

        npredict = len(results)
        tp = len(true_gpairs.intersection(results))
        fn = npos - tp
        fp = npredict - tp

        # Guard every ratio: an empty prediction file, an empty truth set or
        # zero overlap would otherwise raise ZeroDivisionError. The score is
        # reported as 0 in those cases.
        precision = tp / npredict if npredict else 0.0
        recall = tp / npos if npos else 0.0
        pr_sum = precision + recall
        f1 = 2 * precision * recall / pr_sum if pr_sum else 0.0
        print(f"tp={tp}, fn={fn}, fp={fp}, f1={f1:.3f}")
Esempio n. 5
0
def main() -> None:
    """Run STAR-Fusion on every sample in util.SIMULATED_SAMPLES.

    Each sample's r1/r2 files are cached into --cache_dir and the cached
    pair is handed to run_starfusion under the name "<n>_<coverage>".
    The remaining flags are not read here; the whole namespace is passed
    on to run_starfusion.
    """
    logging.basicConfig(
        level=logging.DEBUG, format="%(asctime)s:%(levelname)s: %(message)s"
    )
    p = argparse.ArgumentParser()
    p.add_argument(
        "--cache_dir", default=util.DEFAULT_CACHE_DIR, help="Benchmark cache dir"
    )
    p.add_argument(
        "--result_dir", default=util.DEFAULT_RESULT_DIR, help="Benchmark result dir"
    )
    p.add_argument(
        "--starfusion_data_dir",
        default="/scratch-nvme/starfusion",
        help="Directory for expanding starfusion plug-n-play files",
    )
    p.add_argument(
        "--starfusion_plug_n_play_targz",
        default=os.environ["HOME"]
        + "/GRCh38_v27_CTAT_lib_Feb092018.plug-n-play.tar.gz",
        help="Tar.gz file of starfusion plug-n-play file. https://github.com/STAR-Fusion/STAR-Fusion/wiki#data-resources-required",
    )
    p.add_argument(
        "--starfusion_targz",
        default=os.environ["HOME"] + "/STAR-Fusion-v1.5.0.FULL.tar.gz",
        help="Tar.gz file of starfusion source package. https://github.com/STAR-Fusion/STAR-Fusion/wiki#data-resources-required",
    )

    args = p.parse_args()

    for sample in util.SIMULATED_SAMPLES:
        util.s3_cache_files([sample.path.r1, sample.path.r2], args.cache_dir)

        # A simulated sample has exactly one FASTQ pair; point it at the
        # local copies. (The unused fastq_files list was dropped.)
        cached_file_pairs: List[util.FASTQPair] = [
            util.FASTQPair(
                r1=args.cache_dir + "/" + os.path.basename(sample.path.r1),
                r2=args.cache_dir + "/" + os.path.basename(sample.path.r2),
            )
        ]
        print(cached_file_pairs)
        sample_name = str(sample.n) + "_" + str(sample.coverage)
        run_starfusion(sample_name, cached_file_pairs, args)
Esempio n. 6
0
def run_af4(sample_name: str, cached_file_pairs: List[util.FASTQPair],
            args: Any):
    """Run AF4 (with -umi-in-read) on one sample in "denovo" and "targeted" mode.

    Args:
        sample_name: used (via its basename) to name the result directory,
            <args.result_dir>/<sample_name>-<mode>.
        cached_file_pairs: FASTQ pairs of the sample; only the basename of
            each r1/r2 is used and it is looked up under args.cache_dir.
        args: parsed command-line namespace; cache_dir and result_dir are
            read.

    A mode is skipped when its result directory already holds filtered.fa.
    """
    ref_path = "s3://grail-publications/resources/gencode.v26.whole_genes.fa"
    cosmic_fusion_path = "s3://grail-publications/resources/all_pair_art_lod_gpair_merged.txt"
    util.s3_cache_files([ref_path, cosmic_fusion_path], args.cache_dir)

    cached_r1 = ",".join(args.cache_dir + '/' + os.path.basename(fp.r1)
                         for fp in cached_file_pairs)
    cached_r2 = ",".join(args.cache_dir + '/' + os.path.basename(fp.r2)
                         for fp in cached_file_pairs)
    for mode in ['denovo', 'targeted']:
        result_dir = args.result_dir + '/' + os.path.basename(sample_name +
                                                              '-' + mode)
        if os.path.exists(result_dir + "/filtered.fa"):
            logging.info('Skipping benchmark: %s', result_dir)
            continue
        logging.info('Start af4 benchmark: %s', result_dir)
        try:
            # exist_ok: a directory left behind by an interrupted run is
            # not an error.
            os.makedirs(result_dir, 0o755, exist_ok=True)
        except OSError as e:
            # Best effort: AF4 itself will fail if the directory is unusable.
            logging.error("mkdir %s failed: %s", result_dir, e)
        af4_args = [
            str(util.af4_path()),
            f'-log_dir={result_dir}',
            '-pprof=:12345',
            '-mutex-profile-rate=1000',
            '-block-profile-rate=1000',
            '-umi-in-read',
            f'-r1={cached_r1}',
            f'-r2={cached_r2}',
            f'-fasta-output={result_dir}/all.fa',
            f'-filtered-output={result_dir}/filtered.fa',
            # NOTE(review): this transcript file is not the ref_path cached
            # above and is not cached by this function - confirm it is
            # expected to be present in cache_dir already.
            f'-transcript={args.cache_dir}/gencode.v26.250padded_separate_jns_transcripts_parsed_no_mt_no_overlap_no_pary_no_versioned.fa',
        ]
        if mode == 'targeted':
            af4_args.append(
                f'-cosmic-fusion={args.cache_dir}/all_pair_art_lod_gpair_merged.txt'
            )
        util.check_call(af4_args)
        logging.info('Finished benchmark: %s', result_dir)
        logging.info("Runtime stats: %s", util.run_stats(Path(result_dir)))
Esempio n. 7
0
def main() -> None:
    """Benchmark AF4 on BRCA cell-line data, then AF4/STAR-Fusion on cfRNA.

    Part 1 (AF4 only): for each of the BT474, KPL4, MCF7 and SKBR3
    directories under --brca_data_dir, pair every "*_1*" FASTQ with its
    "_2" mate and call run_af4. Missing sample directories trigger
    download_brca_data.py.

    Part 2: for every sample in util.RNA_SAMPLES, cache the FASTQ files
    into --cache_dir and run the systems selected with --run (both when
    the flag is absent).

    Raises:
        FileNotFoundError: a BRCA "_1" FASTQ has no "_2" mate on disk.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s: %(message)s")
    p = argparse.ArgumentParser()
    p.add_argument("--cache_dir",
                   default=util.DEFAULT_CACHE_DIR,
                   help="Benchmark cache dir")
    p.add_argument("--result_dir",
                   default=util.DEFAULT_RESULT_DIR,
                   help="Benchmark result dir")
    p.add_argument(
        "--starfusion_data_dir",
        default="/scratch-nvme/starfusion",
        help="Directory for expanding starfusion plug-n-play files",
    )
    p.add_argument(
        "--run",
        action="append",
        choices=["af4", "starfusion"],
        help="List of systems to run. If unset, run all the configured systems",
    )
    p.add_argument(
        "--starfusion_plug_n_play_targz",
        default=os.environ["HOME"] +
        "/GRCh38_v27_CTAT_lib_Feb092018.plug-n-play.tar.gz",
        help=
        "Tar.gz file of starfusion plug-n-play file. https://github.com/STAR-Fusion/STAR-Fusion/wiki#data-resources-required",
    )
    p.add_argument(
        "--starfusion_targz",
        default=os.environ["HOME"] + "/STAR-Fusion-v1.5.0.FULL.tar.gz",
        help=
        "Tar.gz file of starfusion source package. https://github.com/STAR-Fusion/STAR-Fusion/wiki#data-resources-required",
    )
    p.add_argument(
        "--brca_data_dir",
        default="/scratch-nvme/xyang/brca_rnaseq_data",
        help="BT474, KPL4, MCF7, SKBR3 Breast cancer data directory",
    )

    args = p.parse_args()
    if not args.run:
        args.run = ["af4", "starfusion"]

    ## brca rna-seq for af4
    # Honor --run: this part only exercises AF4, so skip it (including the
    # download) when the user asked for other systems only.
    if "af4" in args.run:
        brca_samples = [
            os.path.join(args.brca_data_dir, s)
            for s in ["BT474", "KPL4", "MCF7", "SKBR3"]
        ]
        for sample_dir in brca_samples:
            # sample_dir already includes brca_data_dir; joining it a second
            # time produced a wrong path for relative directories.
            if not os.path.exists(sample_dir):
                util.check_call([
                    "download_brca_data.py",
                    "--odir",
                    # Download to where we will look, not to a fixed path.
                    args.brca_data_dir,
                ])

        brca_cosmic_fusion_path = (
            "s3://grail-publications/2019-ISMB/references/all_art_lod_brca.txt")
        for sample_dir in brca_samples:
            brca_pairs: List[util.FASTQPair] = []
            # sorted(): os.listdir order is arbitrary; keep runs reproducible.
            for fq in sorted(os.listdir(sample_dir)):
                if "_1" not in fq:
                    continue
                r1 = os.path.join(sample_dir, fq)
                # Derive the mate from the file name only, so a "_1" inside
                # a directory component is never rewritten.
                r2 = os.path.join(sample_dir, fq.replace("_1", "_2"))
                if not os.path.exists(r2):
                    # Explicit raise: an assert would vanish under python -O.
                    raise FileNotFoundError(
                        f"missing R2 mate for {r1}: expected {r2}")
                brca_pairs.append(util.FASTQPair(r1=r1, r2=r2))
            print(os.path.basename(sample_dir))
            print(brca_pairs)

            run_af4(os.path.basename(sample_dir), brca_pairs,
                    brca_cosmic_fusion_path, args)

    ## cfrna for af4 and starfusion
    cosmic_fusion_path = (
        "s3://grail-publications/2019-ISMB/references/all_pair_art_lod_gpair_merged.txt"
    )
    for sample in util.RNA_SAMPLES:
        fastq_files: List[str] = []
        cached_file_pairs: List[util.FASTQPair] = []
        for fp in sample.paths:
            # Sanity check on the sample table: R2 must be R1's mate.
            assert fp.r1.replace("R1", "R2") == fp.r2, fp.r2
            fastq_files += [fp.r1, fp.r2]
            cached_file_pairs.append(
                util.FASTQPair(
                    r1=args.cache_dir + "/" + os.path.basename(fp.r1),
                    r2=args.cache_dir + "/" + os.path.basename(fp.r2),
                ))
        util.s3_cache_files(fastq_files, args.cache_dir)

        if "af4" in args.run:
            run_af4(sample.name, cached_file_pairs, cosmic_fusion_path, args)
        if "starfusion" in args.run:
            run_starfusion(sample.name, cached_file_pairs, args)
Esempio n. 8
0
def main() -> None:
    """Run AF4 on the simulated samples and print one LaTeX table row each.

    For every mode ("denovo", "targeted") and every sample in
    util.SIMULATED_SAMPLES, AF4 is run into
    <result_dir>/synthetic-<mode>-<n>-<coverage> unless filtered.fa already
    exists there (override with --rerun_af4). The filtered output is then
    compared with liu_gpair.txt via TargetedFusionStats and a row
    "mode & n & coverage & tp & fp & fn\\\\" is printed.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s: %(message)s")

    p = argparse.ArgumentParser()
    p.add_argument('--cache_dir',
                   default=util.DEFAULT_CACHE_DIR,
                   help='Benchmark cache dir')
    p.add_argument('--result_dir',
                   default=util.DEFAULT_RESULT_DIR,
                   help='Benchmark result dir')
    p.add_argument(
        '--rerun_af4',
        action='store_true',
        help='Always run AF4 even if the result file already exists')
    # NOTE(review): --recache_files is accepted but never read in this
    # function - confirm whether it should be forwarded to s3_cache_files.
    p.add_argument(
        '--recache_files',
        action='store_true',
        help=
        'Always copy benchmark data files, even if they already exist locally.'
    )
    args = p.parse_args()

    transcript_name = 'gencode.v26.250padded_separate_jns_transcripts_parsed_no_mt_no_overlap_no_pary_no_versioned.fa'
    util.s3_cache_files([
        util.REFERENCE_DIR + '/' + transcript_name,
        util.REFERENCE_DIR + '/all_pair_art_lod_gpair_merged.txt',
        util.REFERENCE_DIR + '/liu_gpair.txt'
    ], args.cache_dir)
    for mode in ['denovo', 'targeted']:
        for sample in util.SIMULATED_SAMPLES:
            util.s3_cache_files([sample.path.r1, sample.path.r2],
                                args.cache_dir)
            result_dir = f'{args.result_dir}/synthetic-{mode}-{sample.n}-{sample.coverage}'
            try:
                # exist_ok: reruns must not log an error for a directory
                # that is already there.
                os.makedirs(result_dir, 0o755, exist_ok=True)
            except OSError as e:
                # Best effort: AF4 will fail if the directory is unusable.
                logging.error("mkdir %s failed: %s", result_dir, e)
            if not os.path.exists(
                    f'{result_dir}/filtered.fa') or args.rerun_af4:
                logging.info('running benchmark in %s', result_dir)
                # Point AF4 at the cached copies by basename; splicing the
                # full source path after cache_dir does not name the cached
                # file. Assumes s3_cache_files stores each file under its
                # basename in cache_dir - TODO confirm.
                cached_r1 = args.cache_dir + '/' + os.path.basename(
                    sample.path.r1)
                cached_r2 = args.cache_dir + '/' + os.path.basename(
                    sample.path.r2)
                af4_args = [
                    str(util.af4_path()), f'-log_dir={result_dir}',
                    f'-r1={cached_r1}',
                    f'-r2={cached_r2}',
                    f'-fasta-output={result_dir}/all.fa',
                    f'-filtered-output={result_dir}/filtered.fa',
                    '-transcript=' + args.cache_dir + '/' + transcript_name
                ]
                if mode == 'targeted':
                    af4_args.append('-cosmic-fusion=' + args.cache_dir +
                                    '/all_pair_art_lod_gpair_merged.txt')
                util.check_call(af4_args)
                logging.info("Runtime stats: %s",
                             util.run_stats(Path(result_dir)))

            stats = TargetedFusionStats(
                Path(f'{args.cache_dir}/liu_gpair.txt'),
                Path(f'{result_dir}/filtered.fa'))

            s = stats.stats()
            tp = "%d" % (s.tp, )
            fp = "%d" % (s.fp, )
            fn = "%d" % (s.fn, )
            # Trailing "\\" is the LaTeX row terminator.
            print(
                f'{mode} & {sample.n} & {sample.coverage} & {tp} & {fp} & {fn}\\\\'
            )
Esempio n. 9
0
def main() -> None:
    """Run AF4 on the simulated samples and print one LaTeX table row each.

    For every mode ("denovo", "targeted") and every sample in
    util.SIMULATED_SAMPLES, AF4 is run (with the -max-genes-per-kmer and
    -max-proximity-* limits set) into
    <result_dir>/synthetic-<mode>-<n>-<coverage> unless filtered.fa already
    exists there (override with --rerun_af4). The filtered output is then
    compared with liu_gpair.txt via TargetedFusionStats and a row
    "mode & n & coverage & tp & fp & fn\\\\" is printed.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s: %(message)s")

    p = argparse.ArgumentParser()
    p.add_argument("--cache_dir",
                   default=util.DEFAULT_CACHE_DIR,
                   help="Benchmark cache dir")
    p.add_argument("--result_dir",
                   default=util.DEFAULT_RESULT_DIR,
                   help="Benchmark result dir")
    p.add_argument(
        "--rerun_af4",
        action="store_true",
        help="Always run AF4 even if the result file already exists",
    )
    # NOTE(review): --recache_files is accepted but never read in this
    # function - confirm whether it should be forwarded to s3_cache_files.
    p.add_argument(
        "--recache_files",
        action="store_true",
        help=
        "Always copy benchmark data files, even if they already exist locally.",
    )
    args = p.parse_args()

    transcript_name = "gencode.v26.250padded_separate_jns_transcripts_parsed_no_mt_no_overlap_no_pary_no_versioned.fa"
    util.s3_cache_files(
        [
            util.REFERENCE_DIR + "/" + transcript_name,
            util.REFERENCE_DIR + "/all_pair_art_lod_gpair_merged.txt",
            util.REFERENCE_DIR + "/liu_gpair.txt",
        ],
        args.cache_dir,
    )
    for mode in ["denovo", "targeted"]:
        for sample in util.SIMULATED_SAMPLES:
            util.s3_cache_files([sample.path.r1, sample.path.r2],
                                args.cache_dir)
            result_dir = (
                f"{args.result_dir}/synthetic-{mode}-{sample.n}-{sample.coverage}"
            )
            try:
                # exist_ok: reruns must not log an error for a directory
                # that is already there.
                os.makedirs(result_dir, 0o755, exist_ok=True)
            except OSError as e:
                # Best effort: AF4 will fail if the directory is unusable.
                logging.error("mkdir %s failed: %s", result_dir, e)
            if not os.path.exists(
                    f"{result_dir}/filtered.fa") or args.rerun_af4:
                logging.info("running benchmark in %s", result_dir)
                # Point AF4 at the cached copies by basename; splicing the
                # full source path after cache_dir does not name the cached
                # file. Assumes s3_cache_files stores each file under its
                # basename in cache_dir - TODO confirm.
                cached_r1 = args.cache_dir + "/" + os.path.basename(
                    sample.path.r1)
                cached_r2 = args.cache_dir + "/" + os.path.basename(
                    sample.path.r2)
                af4_args = [
                    str(util.af4_path()),
                    f"-log_dir={result_dir}",
                    f"-r1={cached_r1}",
                    f"-r2={cached_r2}",
                    f"-fasta-output={result_dir}/all.fa",
                    f"-filtered-output={result_dir}/filtered.fa",
                    "-max-genes-per-kmer=2",
                    "-max-proximity-distance=1000",
                    "-max-proximity-genes=5",
                    "-transcript=" + args.cache_dir + "/" + transcript_name,
                ]
                if mode == "targeted":
                    af4_args.append("-cosmic-fusion=" + args.cache_dir +
                                    "/all_pair_art_lod_gpair_merged.txt")
                util.check_call(af4_args)
                logging.info("Runtime stats: %s",
                             util.run_stats(Path(result_dir)))

            stats = TargetedFusionStats(
                Path(f"{args.cache_dir}/liu_gpair.txt"),
                Path(f"{result_dir}/filtered.fa"),
            )

            s = stats.stats()
            tp = "%d" % (s.tp, )
            fp = "%d" % (s.fp, )
            fn = "%d" % (s.fn, )
            # Trailing "\\" is the LaTeX row terminator.
            print(
                f"{mode} & {sample.n} & {sample.coverage} & {tp} & {fp} & {fn}\\\\"
            )