コード例 #1
0
ファイル: eclip.py プロジェクト: VanNostrandLab/eclip
def fastq_file(s):
    """Parse a comma-separated FASTQ specification into a two-item list.

    A specification without a comma is padded with a trailing comma, so a
    single file yields ``[path, '']``.

    :param s: ``'read1.fastq.gz'`` or ``'read1.fastq.gz,read2.fastq.gz'``.
    :return: List with two entries; every non-empty entry is passed through
        ``file_path``, empty entries are returned unchanged.

    Logs an error and exits with status 1 when the specification does not
    contain exactly two comma-separated entries.
    """
    s = s if ',' in s else f'{s},'
    paths = s.split(',')
    # Validate the number of comma-separated entries; checking len(s) would
    # measure the character length of the string instead.
    if len(paths) != 2:
        logger.error(
            f'Invalid FASTQ file(s) specified, {len(paths)} files were given while only accepts 1 or 2 files.'
        )
        sys.exit(1)
    return [file_path(p) if p else p for p in paths]
コード例 #2
0
ファイル: eclip_rescue.py プロジェクト: VanNostrandLab/eclip
def rescue_ratio(bed, txt):
    """Write the rescue ratio to ``txt``.

    Two counts are obtained with ``count_lines``: one for ``bed`` and one for
    the ``{key}.ip.pseudo.01.vs.{key}.ip.pseudo.02.reproducible.peaks.bed``
    file inside ``tmp``. The ratio is the larger count divided by the smaller
    one; when that division raises ``ZeroDivisionError`` an error is logged
    and 0 is written instead.

    :param bed: Path to a peaks BED file.
    :param txt: Path of the text file that receives the ratio.
    """
    first = count_lines(bed)
    pseudo_bed = os.path.join(
        tmp,
        f'{key}.ip.pseudo.01.vs.{key}.ip.pseudo.02.reproducible.peaks.bed')
    second = count_lines(pseudo_bed)
    counts = (second, first)
    try:
        ratio = max(counts) / min(counts)
    except ZeroDivisionError:
        ratio = 0
        logger.error('No peaks found in reproducible peaks or pseudo reproducible peaks, return ratio 0.')
    with open(txt, 'w') as handle:
        handle.write(str(ratio) + '\n')
コード例 #3
0
ファイル: eclip.py プロジェクト: VanNostrandLab/eclip
def rescue_ratio(inputs, outputs):
    """Compute the rescue ratio from pseudo-replicates and write it to disk.

    For every pair of samples the IP BAMs and the INPUT BAMs are each merged,
    sorted and split into two pseudo BAMs via ``split_bam``; ``clipper_peaks``
    is run on both pseudo IP BAMs. All collected files are then handed to
    ``peak`` together with the pseudo reproducible peaks BED path, whose line
    count is compared against the actual reproducible peaks BED. The ratio is
    the larger count divided by the smaller one (0, with an error logged, when
    the division raises ``ZeroDivisionError``).

    :param inputs: Not used by this function.
    :param outputs: Path of the ratio file; a second copy is written to the
        same path with the ``{ECLIP}/`` prefix replaced by ``rescue/``.
    """
    def prepare_pseudo_bam(bam1, bam2, basename):
        """Merge two BAMs, sort the merged file and split it in two."""
        merged = f'{basename}.bam'
        unsorted = merged.replace('.bam', '.tmp.bam')
        cmder.run(f'samtools merge {unsorted} {bam1} {bam2}',
                  msg=f'Merging {bam1} and {bam2} ...')
        cmder.run(
            f'samtools sort -@ {options.cores} -m 2G -o {merged} {unsorted}')
        # The unsorted intermediate is no longer needed once sorted.
        cmder.run(f'rm {unsorted}')

        first, second = split_bam(merged,
                                  f'{basename}.pseudo.01.bam',
                                  f'{basename}.pseudo.02.bam')
        return first, second

    pseudo_ip_bams = []
    pseudo_input_bams = []
    pseudo_peak_beds = []
    for left, right in itertools.combinations(SAMPLES, 2):
        ip_pair = prepare_pseudo_bam(
            left.ip_read.bam, right.ip_read.bam,
            f'rescue/{left.ip_read.name}.{right.ip_read.name}')
        pseudo_ip_bams.extend(ip_pair)

        input_pair = prepare_pseudo_bam(
            left.input_read.bam, right.input_read.bam,
            f'rescue/{left.input_read.name}.{right.input_read.name}')
        pseudo_input_bams.extend(input_pair)

        for pseudo_bam in ip_pair:
            pseudo_peak_beds.append(clipper_peaks(pseudo_bam))

    key = ".".join(sample.ip_read.name for sample in SAMPLES)
    pseudo_reproducible_bed = f'rescue/{key}.pseudo.01.vs.{key}.pseudo.02.reproducible.peaks.bed'
    peak(pseudo_ip_bams, pseudo_input_bams, pseudo_peak_beds,
         pseudo_reproducible_bed, 'rescue', cwd=options.outdir)
    pseudo_count = count_lines(pseudo_reproducible_bed)

    key = ".vs.".join(sample.ip_read.name for sample in SAMPLES)
    count = count_lines(f'{ECLIP}/{key}.reproducible.peaks.bed')
    try:
        ratio = max(count, pseudo_count) / min(count, pseudo_count)
    except ZeroDivisionError:
        ratio = 0
        logger.error('No peaks found in reproducible peaks or pseudo reproducible peaks, return ratio 0.')

    line = f'{ratio}\n'
    rescue_copy = outputs.replace(f'{ECLIP}/', 'rescue/')
    with open(outputs, 'w') as primary, open(rescue_copy, 'w') as mirror:
        primary.write(line)
        mirror.write(line)
コード例 #4
0
def rescue_ratio(bed, txt):
    """Write the ratio of the line counts of two reproducible peaks BED files.

    The two files are the matches of ``*.reproducible.peaks.bed`` inside
    ``tmp``. The ratio is ``count(first) / count(second)`` with the files
    taken in sorted path order; when the second count is zero an error is
    logged and 0 is written.

    :param bed: Not used by this function.
    :param txt: Path of the text file that receives the ratio.
    :raises ValueError: If ``tmp`` does not contain exactly two matching files.
    """
    # glob.glob returns matches in arbitrary (filesystem-dependent) order;
    # sorting keeps numerator and denominator stable from run to run.
    beds = sorted(glob.glob(os.path.join(tmp, '*.reproducible.peaks.bed')))
    if len(beds) != 2:
        raise ValueError(
            f'Expected exactly 2 reproducible peaks BED files in "{tmp}", found {len(beds)}.'
        )
    bed1, bed2 = beds
    count1, count2 = count_lines(bed1), count_lines(bed2)
    try:
        ratio = count1 / count2
    except ZeroDivisionError:
        ratio = 0
        logger.error(
            'No peaks found in one of the split reproducible peaks, return ratio 0.'
        )
    with open(txt, 'w') as o:
        o.write(f'{ratio}\n')
コード例 #5
0
ファイル: eclip.py プロジェクト: VanNostrandLab/eclip
def rescue_ratio(inputs, txt):
    """Compute the rescue ratio from pooled pseudo-replicates.

    With a single sample the ``rescue`` directory is removed, a warning is
    logged and an empty string is returned. Otherwise the IP BAMs and the
    INPUT BAMs of all samples are each merged and split into as many pseudo
    BAMs as there are samples, ``clipper_peaks`` is run on every pseudo IP
    BAM, and ``peak`` is called with the pseudo reproducible peaks BED path.
    The ratio written to ``txt`` is the larger of the actual and pseudo
    reproducible peak counts divided by the smaller one (0, with an error
    logged, when the division raises ``ZeroDivisionError``).

    :param inputs: Not used by this function.
    :param txt: Path of the text file that receives the ratio.
    :return: ``''`` when there is only one sample, otherwise ``None``.
    """
    if len(SAMPLES) == 1:
        logger.warning(
            'No enough samples (n = 1 < 2) to calculate rescue ratio!')
        shutil.rmtree('rescue')
        return ''

    def pseudo_replicates(bams, label):
        """Merge ``bams``, split the merged BAM, then delete the merged BAM."""
        merged = merge_bam(bams, os.path.join('rescue', f'{label}.pseudo.bam'))
        parts = split_bam(merged, os.path.join('rescue', f'{label}.pseudo.'),
                          len(bams))
        os.unlink(merged)
        return parts

    ip_bams = [s.ip_bam for s in SAMPLES]
    input_bams = [s.input_bam for s in SAMPLES]
    ip_pseudo_bams = pseudo_replicates(ip_bams, 'ip')
    input_pseudo_bams = pseudo_replicates(input_bams, 'input')

    pseudo_peak_beds = []
    for pseudo_bam in ip_pseudo_bams:
        clusters_bed = pseudo_bam.replace('.bam', '.peak.clusters.bed')
        pseudo_peak_beds.append(clipper_peaks(pseudo_bam, clusters_bed))

    stems = [os.path.basename(b).replace('.bam', '') for b in ip_pseudo_bams]
    pseudo_peak_bed = os.path.join(
        'rescue', f'{".vs.".join(stems)}.reproducible.peaks.bed')
    peak(ip_pseudo_bams, input_pseudo_bams, pseudo_peak_beds, pseudo_peak_bed,
         'rescue')

    pseudo_count = count_lines(pseudo_peak_bed)
    actual_stem = ".vs.".join(f'{name}.ip' for name in options.names)
    actual_count = count_lines(f'{actual_stem}.reproducible.peaks.bed')
    counts = (actual_count, pseudo_count)
    try:
        ratio = max(counts) / min(counts)
    except ZeroDivisionError:
        ratio = 0
        logger.error('No peaks found in reproducible peaks or pseudo reproducible peaks, return ratio 0.')
    with open(txt, 'w') as handle:
        handle.write(f'{ratio}\n')
コード例 #6
0
ファイル: eclip.py プロジェクト: VanNostrandLab/eclip
def consistency_ratio(inputs, txt):
    """Compute the self-consistency ratio from split replicates.

    With a single sample the ``consistency`` directory is removed, a warning
    is logged and an empty string is returned. Otherwise every sample's IP and
    INPUT BAM is split in two with ``split_bam`` and ``clipper_peaks`` is run
    on both IP halves. ``peak`` is then called once for all first halves and
    once for all second halves, and the ratio of the two resulting
    reproducible peak counts (first / second) is written to ``txt``. When the
    second count is zero an error is logged and 0 is written.

    :param inputs: Not used by this function.
    :param txt: Path of the text file that receives the ratio.
    :return: ``''`` when there is only one sample, otherwise ``None``.
    """
    if len(SAMPLES) == 1:
        logger.warning(
            'No enough samples (n = 1 < 2) to calculate self-consistency ratio!'
        )
        shutil.rmtree('consistency')
        return ''

    def reproducible_bed(bams):
        """Build the reproducible peaks BED path for a list of split BAMs."""
        stem = ".vs.".join(
            os.path.basename(b).replace('.bam', '') for b in bams)
        return os.path.join('consistency', f'{stem}.reproducible.peaks.bed')

    ip_bam1, ip_bam2 = [], []
    input_bam1, input_bam2 = [], []
    peak_bed1, peak_bed2 = [], []
    for sample in SAMPLES:
        ip_half1, ip_half2 = split_bam(
            sample.ip_bam,
            os.path.join('consistency', f'{sample.name}.ip.split.'), 2)
        ip_bam1.append(ip_half1)
        ip_bam2.append(ip_half2)

        input_half1, input_half2 = split_bam(
            sample.input_bam,
            os.path.join('consistency', f'{sample.name}.input.split.'), 2)
        input_bam1.append(input_half1)
        input_bam2.append(input_half2)

        clusters1 = clipper_peaks(
            ip_half1, ip_half1.replace('.bam', '.peak.clusters.bed'))
        clusters2 = clipper_peaks(
            ip_half2, ip_half2.replace('.bam', '.peak.clusters.bed'))
        peak_bed1.append(clusters1)
        peak_bed2.append(clusters2)

    split_peak_bed1 = reproducible_bed(ip_bam1)
    peak(ip_bam1, input_bam1, peak_bed1, split_peak_bed1, 'consistency')

    split_peak_bed2 = reproducible_bed(ip_bam2)
    peak(ip_bam2, input_bam2, peak_bed2, split_peak_bed2, 'consistency')

    count1 = count_lines(split_peak_bed1)
    count2 = count_lines(split_peak_bed2)
    try:
        ratio = count1 / count2
    except ZeroDivisionError:
        ratio = 0
        logger.error('No peaks found in one of the split reproducible peaks, return ratio 0.')
    with open(txt, 'w') as handle:
        handle.write(f'{ratio}\n')
コード例 #7
0
ファイル: eclip.py プロジェクト: VanNostrandLab/eclip
def make_bigwig_files(bam, bigwig):
    """Create strand-specific, scaled BigWig files for ``bam``.

    The scale factor is 1,000,000 divided by ``sam.mapped`` as reported by
    pysam (halved first when ``TYPE`` is ``'paired'``). The minus-strand file
    name is derived from ``bigwig`` by replacing ``.plus.bw`` with
    ``.minus.bw``. When the read count is zero an error is logged, ``bigwig``
    is created as an empty file and the function returns early.

    :param bam: Path to the BAM file.
    :param bigwig: Path of the plus-strand BigWig file.
    :return: ``bigwig``.
    """
    def bam_to_bigwig(bam, scale, strand, bw):
        """Run coverage -> sort -> bedGraphToBigWig, then remove temp files."""
        bedgraph = bw.replace('.bw', '.bg')
        sorted_bedgraph = bw.replace('.bw', '.sort.bg')
        cmder.run(
            f'genomeCoverageBed -ibam {bam} -bg -scale {scale} -strand {strand} -du -split > {bedgraph}'
        )
        cmder.run(f'bedSort {bedgraph} {sorted_bedgraph}')
        cmder.run(
            f'bedGraphToBigWig {sorted_bedgraph} {options.genome}/chrNameLength.txt {bw}'
        )
        cmder.run(f'rm {bedgraph} {sorted_bedgraph}')

    message = f'Make BigWig files for {bam} ...'
    start_time = time.perf_counter()
    logger.info(message)

    pos_bw = bigwig
    neg_bw = bigwig.replace('.plus.bw', '.minus.bw')
    with pysam.AlignmentFile(bam, 'rb') as sam:
        total_reads = sam.mapped
    if TYPE == 'paired':
        total_reads = total_reads / 2

    try:
        scale = 1000000.0 / total_reads
    except ZeroDivisionError:
        logger.error(
            f'No reads was found in BAM {bam}, empty BigWig file was created.')
        with open(bigwig, 'w') as empty:
            empty.write('')
        return bigwig

    # Sign of the scale and target file per strand depend on the library type.
    if TYPE == 'single':
        jobs = ((scale, '+', pos_bw), (-1 * scale, '-', neg_bw))
    else:
        jobs = ((-1 * scale, '-', pos_bw), (scale, '+', neg_bw))
    for factor, strand, target in jobs:
        bam_to_bigwig(bam, factor, strand, target)

    run_time = int(time.perf_counter() - start_time)
    elapsed = str(datetime.timedelta(seconds=run_time))
    logger.info(message.replace(' ...', f' completed in [{elapsed}].'))
    return bigwig
コード例 #8
0
ファイル: eclip.py プロジェクト: VanNostrandLab/eclip
def pureclip(bam, bed):
    """Run pureCLIP for the sample whose ``cross_bed`` equals ``bed``.

    The IP/INPUT BAM pair is looked up in ``SAMPLES``; the first three ``@SQ``
    reference names from the IP BAM header are passed to pureCLIP via ``-iv``.
    Any exception raised while running the command is logged, not re-raised.

    :param bam: Not used by this function.
    :param bed: Path of the crosslink sites BED output; the binding regions
        BED and the log file names are derived from it.
    :raises IndexError: If no sample in ``SAMPLES`` has ``cross_bed == bed``.
    """
    matches = [(sample.ip_bam, sample.input_bam)
               for sample in SAMPLES if sample.cross_bed == bed]
    ip_bam, input_bam = matches[0]

    header = cmder.run(f'samtools view -H {ip_bam}').stdout.read()
    names = []
    for line in header.splitlines():
        if line.startswith('@SQ'):
            names.append(line.split()[1].replace('SN:', ''))
    refs = ';'.join(names[:3])

    regions_bed = bed.replace('.crosslink.sites.bed', '.binding.regions.bed')
    log_file = bed.replace('.crosslink.sites.bed', '.pureclip.log')
    cmd = [
        'pureclip',
        '-i', ip_bam,
        '-bai', f'{ip_bam}.bai',
        '-g', f'{options.genome}/genome.fa',
        '-nt', options.cpus,
        '-ibam', input_bam,
        '-ibai', f'{input_bam}.bai',
        '-iv', f'"{refs};"',
        '-o', bed,
        '-or', regions_bed,
        '>', log_file,
    ]
    note = f'Calling peaks from {ip_bam} and {input_bam} using pureCLIP ...'
    try:
        cmder.run(cmd, msg=note)
    except Exception as e:
        logger.error(f'Running pureclip failed: {e}.')
コード例 #9
0
ファイル: eclip.py プロジェクト: VanNostrandLab/eclip
def consistency_ratio(inputs, outputs):
    """Compute the self-consistency ratio and write it to disk.

    Every sample's IP and INPUT BAM is split in two with ``split_bam``,
    ``clipper_peaks`` is run on both IP halves and ``peak`` is called with the
    per-sample reproducible peaks BED path, whose line count is recorded. The
    ratio is the first sample's count divided by the second sample's count
    (0, with an error logged, when the second count is zero).

    :param inputs: Not used by this function.
    :param outputs: Path of the ratio file; a second copy is written to the
        same path with the ``{ECLIP}/`` prefix replaced by ``consistency/``.
    :raises IndexError: If fewer than two samples were processed.
    """
    counts = []
    for sample in SAMPLES:
        ip_name = sample.ip_read.name
        input_name = sample.input_read.name
        split_ip_bams = split_bam(sample.ip_read.bam,
                                  f'consistency/{ip_name}.split.01.bam',
                                  f'consistency/{ip_name}.split.02.bam')
        split_input_bams = split_bam(sample.input_read.bam,
                                     f'consistency/{input_name}.split.01.bam',
                                     f'consistency/{input_name}.split.02.bam')
        split_peak_beds = [
            clipper_peaks(split_ip_bams[half]) for half in (0, 1)
        ]

        bed = f'consistency/{ip_name}.split.01.vs.{ip_name}.split.02.reproducible.peaks.bed'
        peak(split_ip_bams, split_input_bams, split_peak_beds, bed,
             'consistency', cwd=options.outdir)
        counts.append(count_lines(bed))

    try:
        ratio = counts[0] / counts[1]
    except ZeroDivisionError:
        ratio = 0
        logger.error('No peaks found in one of the split reproducible peaks, return ratio 0.')

    line = f'{ratio}\n'
    consistency_copy = outputs.replace(f'{ECLIP}/', 'consistency/')
    with open(outputs, 'w') as primary, open(consistency_copy, 'w') as mirror:
        primary.write(line)
        mirror.write(line)
コード例 #10
0
ファイル: eclip_rescue.py プロジェクト: VanNostrandLab/eclip
# Command line interface of the rescue-ratio script.
parser = argparse.ArgumentParser(description=__doc__, prog='eclip')
parser.add_argument('--names', nargs='+', required=True, help='Shortnames for each sample, e.g., rep1, rep2.')
parser.add_argument('--wd', required=True, help='Path to the work directory that contains eCLIP analysis results.')
parser.add_argument('--species', help="Species name (short name code) the dataset associated with, e.g., hg19, mm10.",
                    default='hg19')
parser.add_argument('--l2fc', type=int, help="Only consider peaks at or above this log2 fold change cutoff.", default=3)
parser.add_argument('--l10p', type=int, help="Only consider peaks at or above this log10 p value cutoff.", default=3)
parser.add_argument('--cpus', type=int, help='Maximum number of CPU cores can be used for your job.', default=16)
parser.add_argument('--dry_run', action='store_true',
                    help='Print out steps and files involved in each step without actually running the pipeline.')
options = parser.parse_args()
try:
    os.chdir(options.wd)
except OSError as e:
    logger.error(e)
    # Every path below is relative to the work directory; continuing from the
    # wrong directory would create 'rescue' and look for BAM files in the
    # wrong place, so stop here.
    raise SystemExit(1)

# Scratch directory (relative to the work directory) for rescue-ratio files.
tmp = 'rescue'
os.makedirs(tmp, exist_ok=True)

# Per-sample BAM file names derived from the sample short names.
ip_bams = [f'{name}.ip.bam' for name in options.names]
input_bams = [f'{name}.input.bam' for name in options.names]

# For every pair of samples: (ip1, ip2, input1, input2), keyed by 'name1.name2'.
files = {
    f'{name1}.{name2}': (f'{name1}.ip.bam', f'{name2}.ip.bam',
                         f'{name1}.input.bam', f'{name2}.input.bam')
    for name1, name2 in itertools.combinations(options.names, 2)
}
key = '.'.join(options.names)


def merge_bam(bam1, bam2, bam):
    if os.path.isfile(bam):
        logger.info(f'BAM file {bam} already exist.')
    else:
コード例 #11
0
def dir_path(p):
    """Return ``p`` when it is an existing directory.

    Otherwise an error is logged and the process exits with status 1.

    :param p: Path to check.
    :return: ``p`` unchanged.
    """
    if os.path.isdir(p):
        return p
    logger.error(f'Path "{p}" may not be a directory or does not exist.')
    sys.exit(1)
コード例 #12
0
def file_path(p):
    """Return ``p`` when it is an existing regular file.

    Otherwise an error is logged and the process exits with status 1.

    :param p: Path to check.
    :return: ``p`` unchanged.
    """
    if os.path.isfile(p):
        return p
    logger.error(f'File "{p}" may not be a file or does not exist.')
    sys.exit(1)
コード例 #13
0
parser.add_argument(
    '--hold_submit',
    action='store_true',
    help=
    'Generate the submit script but hold it without submitting to the job scheduler.'
)

args = parser.parse_args()
# Default to the current directory when --outdir is not given; dir_path
# validates the directory before we change into it.
# NOTE(review): a relative --outdir is joined to the names again below, after
# the chdir - confirm callers always pass an absolute path.
outdir = args.outdir or os.getcwd()
dir_path(outdir)
os.chdir(outdir)

fastq, adapters_fasta, name = args.fastq, args.adapters_fasta, args.name
rtag, gtag = args.repeat_label, args.genome_label
if len(fastq) != len(adapters_fasta):
    logger.error('Number of items for fastq and adapters_fasta are not equal.')
    sys.exit(1)
if name:
    # Explicit names must pair one-to-one with the FASTQ files; abort only
    # when the counts actually differ.
    if len(fastq) != len(name):
        logger.error('Number of items for fastq and name are not equal.')
        sys.exit(1)
else:
    name = [basename(n) for n in fastq]
name = [os.path.join(outdir, n) for n in name]
# Lookup tables: FASTQ path -> output name, output name -> adapters FASTA.
fastq_to_name = dict(zip(fastq, name))
fastq_to_adapters = dict(zip(name, adapters_fasta))


@task(inputs=fastq,
      outputs=lambda i: f'{fastq_to_name[i]}.umi.fastq.gz',
      cmd=['eclip_umi_extract', 'input', '-o', 'output'],
コード例 #14
0
ファイル: eclip.py プロジェクト: VanNostrandLab/eclip
parser.add_argument(
    '--dry_run',
    action='store_true',
    help='Print out steps and files involved in each step without actually running the pipeline.',
)

# Reference point for run-time reporting.
START_TIME = time.perf_counter()

options = parser.parse_args()

# Output directory: default to the current directory, create it if missing,
# then work from inside it.
options.outdir = options.outdir or os.getcwd()
if not os.path.isdir(options.outdir):
    try:
        os.mkdir(options.outdir)
    except OSError as e:
        logger.error(f'Create outdir failed: {e}.')
        sys.exit(1)
os.chdir(options.outdir)

# Adapters FASTA: fall back to the bundled default and validate the file.
adapters = '/storage/vannostrand/software/eclip/data/se.adapters.fasta'
options.adapters_fasta = options.adapters_fasta or adapters
file_path(options.adapters_fasta)

# Repeat elements index: fall back to the default and validate the directory.
options.repeat = (
    options.repeat
    or '/storage/vannostrand/reference_data/hg19/repbase_v2_star_index')
dir_path(options.repeat)

# Genome index: fall back to the default.
options.genome = (
    options.genome
    or '/storage/vannostrand/reference_data/hg19/genome_star_index')
コード例 #15
0
ファイル: eclip.py プロジェクト: VanNostrandLab/eclip
# Reference point for run-time reporting.
START_TIME = time.perf_counter()

options = parser.parse_args()
# Default to the current directory, validate it and work from inside it.
options.outdir = options.outdir or os.getcwd()
dir_path(options.outdir)
os.chdir(options.outdir)

ips, inputs, names = options.ip_fastqs, options.input_fastqs, options.labels
# Every IP FASTQ needs exactly one label.
if len(ips) != len(names):
    logger.error('Number of items in ip_fastqs and names are not equal.')
    sys.exit(1)

# Either one INPUT per IP, or exactly one INPUT for all IPs.
if len(ips) == len(inputs):
    input_type = 'single-input'
else:
    input_type = 'multiple-inputs'
    if len(inputs) != 1:
        logger.error('Wrong number of input_fastqs were provided.')
        sys.exit(1)


class Read:
    """Container describing one sequencing read set (one or two FASTQ files)."""

    def __init__(self, fastq1, fastq2, read_name, read_type):
        """Store the FASTQ paths and derive ``key`` and ``paired``.

        :param fastq1: Path to the first FASTQ file.
        :param fastq2: Path to the second FASTQ file; falsy when absent.
        :param read_name: Name of the read; falsy to derive one from fastq1.
        :param read_type: Type label of the read, stored as given.
        """
        self.fastq1, self.fastq2 = fastq1, fastq2
        self.read_name, self.read_type = read_name, read_type
        # Without an explicit name, use fastq1 with its FASTQ suffix removed.
        self.key = read_name or fastq1.replace('.fastq.gz',
                                               '').replace('.fq.gz', '')
        self.paired = bool(self.fastq2)
コード例 #16
0
# Command line interface: single-end FASTQ -> BAM.
parser = argparse.ArgumentParser(description=__doc__, prog='se_fastq_to_bam')
parser.add_argument('--fastq', required=True, help='Path to a UMI extracted FASTQ file.', type=file_path)
parser.add_argument('--bam', required=True,  help='Path to the output BAM file (must ends with .bam).')
parser.add_argument('--adapters_fasta', help="Path to the fasta file contains adapters and their sequences (for "
                                             "single-end dataset only.", required=True,  type=file_path)
parser.add_argument('--genome', help="Path to STAR reference genome index directory.", type=dir_path)
parser.add_argument('--repeat', help="Path to STAR repeat elements index directory.", type=dir_path)
parser.add_argument('--cpus', type=int, help='Maximum number of CPU cores can be used for your job.', default=16)
parser.add_argument('--dry_run', action='store_true',
                    help='Print out steps and files involved in each step without actually running the pipeline.')


args = parser.parse_args()
fastq, bam = args.fastq, args.bam
if not bam.endswith('.bam'):
    logger.error(f'Output BAM file "{bam}" does not end with .bam extension.')
    sys.exit(1)
# The parser above defines no --name option, so read the attribute
# defensively and fall back to the BAM path without its extension.
name = getattr(args, 'name', None) or bam.replace('.bam', '')
outdir = os.path.dirname(bam) or os.getcwd()
if not os.path.isdir(outdir):
    logger.error(f'Cane not set "{outdir}" as output directory.')
    sys.exit(1)
# The parse result is bound to `args`, not `options`; change into the output
# directory derived from --bam above.
os.chdir(outdir)

# NOTE(review): `options` and the ip_fastqs / input_fastqs / labels attributes
# are not defined by the parser visible in this script - confirm where they
# are expected to come from.
ips, inputs, names = options.ip_fastqs, options.input_fastqs, options.labels
# All three lists must have the same number of items.
if not (len(ips) == len(inputs) == len(names)):
    logger.error('Number of items in ip_fastqs, input_fastqs, and names are not equal.')
    sys.exit(1)