Example #1
def dyad_coverage(samples,
                  genes='genes.txt',
                  selection=None,
                  absolute=False,
                  minp=-75,
                  maxp=75,
                  smoothing=None,
                  suffix=None,
                  index=None):
    '''Finds the distribution of distances between fragments and the dyad.'''
    genes_info = pd.read_csv(genes, sep='\t', comment='#')
    genes_info = genes_info.loc[genes_info[genes_info.columns[6]] != -1]
    if selection:
        selection_genes = Parser.first(selection)
        genes_info = genes_info[genes_info[genes_info.columns[2]].isin(
            selection_genes)]
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        dyad_coverage_sample(sample, genes_info, absolute, minp, maxp, suffix,
                             smoothing)
        splits = sb.splits(sample)
        for split in splits:
            dyad_coverage_sample(split, genes_info, absolute, minp, maxp,
                                 suffix, smoothing)
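Most wrappers in these examples share the same iteration pattern: read the first column of samples.txt, optionally narrow it to a single sample with index, then call a per-sample function (and, as here, repeat for each length-based split). A minimal, self-contained sketch of that pattern, assuming Parser.first simply returns the first tab-separated column of each non-comment line; parse_first_column and for_each_sample are hypothetical stand-ins, not part of the project:

def parse_first_column(path):
    '''Hypothetical stand-in for Parser.first: first tab-separated column per line.'''
    names = []
    with open(path) as infile:
        for line in infile:
            if not line.strip() or line.startswith('#'):
                continue  # assumed: blank lines and comments are skipped
            names.append(line.rstrip('\n').split('\t')[0])
    return names


def for_each_sample(samples_file, process, index=None):
    '''Apply process to every sample, or to a single one when index is given.'''
    sample_names = parse_first_column(samples_file)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        process(sample)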
Example #2
def statistics_samples(samples='samples.txt', datasets='dataset.txt', output='statistics.txt'):
    '''Creates statistics file for samples.'''
    sample_names = Parser.first(samples)
    datasets_names = []
    if os.path.exists(datasets):
        datasets_names = Parser.first(datasets)
    compute_statistics(sample_names, datasets_names, output)
Example #3
def fit_gaussian(samples='samples.txt',
                 absolute=False,
                 components=False,
                 svg=False,
                 verbose=False,
                 center=None,
                 cmin=None,
                 cmax=None,
                 amp=None,
                 amin=None,
                 sigma=None,
                 smin=None,
                 suffix=None,
                 index=None):
    '''Fits a Gaussian curve to dyad coverage.'''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        fit_gaussian_sample(sample, absolute, components, svg, verbose, center,
                            cmin, cmax, amp, amin, sigma, smin, suffix)
        splits = sb.splits(sample)
        for split in splits:
            fit_gaussian_sample(split, absolute, components, svg, verbose,
                                center, cmin, cmax, amp, amin, sigma, smin,
                                suffix)
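A hedged usage sketch: the optional arguments presumably seed and bound the fit (center/cmin/cmax for the peak position, amp/amin for the amplitude, sigma/smin for the width). The values below are illustrative only, and index=0 restricts the run to the first sample in samples.txt:

# Illustrative call; parameter values are assumptions, not project defaults.
fit_gaussian(samples='samples.txt',
             svg=True,
             verbose=True,
             center=0, cmin=-20, cmax=20,
             amp=1.0, amin=0.0,
             sigma=50, smin=10,
             index=0)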
Example #4
def removesecondmate_samples(samples='samples.txt', input_suffix='-dedup', output_suffix='-mate1', threads=None, index=None):
    '''Removes second mate from BAM.'''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        removesecondmate_sample(sample, input_suffix, output_suffix, threads)
Example #5
def split_samples(samples='samples.txt', index=None, binlength=10, binminlength=100, binmaxlength=500):
    '''Splits BED files from samples based on the length of annotations.'''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        split_sample(sample, binlength, binminlength, binmaxlength)
Example #6
def genome_coverage_samples(samples='samples.txt', genome='sacCer3.chrom.sizes', scale=None, strand=None, input_suffix='', output_suffix='-cov', spike_suffix=None, control_suffix=None, index=None, genomecov_args=()):
    '''Compute genome coverage on samples.'''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        sample_splits_genome_coverage(sample, genome, scale, strand, input_suffix, output_suffix, spike_suffix, control_suffix, genomecov_args)
Example #7
def center_annotations_samples(samples='samples.txt', input_suffix='', output_suffix='-forcov', index=None):
    '''Prepare BED file used for genome coverage on samples.'''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        center_annotations_sample_splits(sample, input_suffix, output_suffix)
Example #8
def intersectannotations(input, annotations, output):
    '''Filters a BED file to keep only the annotations listed in the annotations file.'''
    logging.basicConfig(filename='seqtools.log',
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    genes = Parser.first(annotations)
    incolumns = Parser.columns(input)
    with open(output, 'w') as outfile:
        for columns in incolumns:
            if columns[3] in genes:
                outfile.write(str(columns[0]))
                for column in columns[1:]:
                    outfile.write('\t')
                    outfile.write(str(column))
                outfile.write('\n')
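The membership test columns[3] in genes scans a Python list for every input line; with a large annotation file a set lookup is the usual alternative. A self-contained sketch of the same filter under that assumption, using plain tab-separated input and one annotation name per line (intersect_annotations_set is a hypothetical name, not part of the project):

def intersect_annotations_set(input_bed, annotations_file, output_bed):
    '''Keep only BED lines whose name (column 4) appears in the annotations file.'''
    with open(annotations_file) as ann:
        # Load annotation names once into a set for constant-time membership tests.
        names = {line.rstrip('\n').split('\t')[0] for line in ann if line.strip()}
    with open(input_bed) as infile, open(output_bed, 'w') as outfile:
        for line in infile:
            columns = line.rstrip('\n').split('\t')
            if len(columns) > 3 and columns[3] in names:
                outfile.write('\t'.join(columns) + '\n')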
Example #9
def plot2do_samples(file, input_suffix='', index=None, plot2do_args=()):
    '''Run plot2DO on samples.'''
    file_parent = Path(file).parent
    sample_names = Parser.first(file)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        plot2do_sample(str(file_parent / sample), input_suffix, plot2do_args)
Example #10
def test_columns():
    samples = Path(__file__).parent.parent.joinpath('samples.txt')
    columns = p.columns(samples)
    assert columns[0][0] == 'POLR2A'
    assert columns[0][1] == 'SRR8518913'
    assert columns[1][0] == 'ASDURF'
    assert columns[1][1] == 'SRX5322424'
    assert columns[2][0] == 'POLR1C'
    assert columns[2][1] == 'SRR8518915'
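Judging from these assertions, samples.txt is a tab-separated file with the sample name in the first column and its SRA accession in the second, presumably along these lines:

POLR2A	SRR8518913
ASDURF	SRX5322424
POLR1C	SRR8518915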
Example #11
def merge_datasets(datasets='dataset.txt', index=None):
    '''Merge BED files related to samples.'''
    datasets_columns = Parser.columns(datasets)
    if index is not None:
        datasets_columns = [datasets_columns[index]]
    for columns in datasets_columns:
        name = columns[0]
        samples = [sample for sample in columns[1:]]
        merge_dataset(name, samples)
Example #12
def bowtie_samples(samples='samples.txt',
                   threads=None,
                   output_suffix='',
                   index=None,
                   bowtie_args=()):
    '''Align samples using the bowtie2 program.'''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        bowtie_sample(sample, threads, output_suffix, bowtie_args)
Example #13
def merge_datasets(datasets='dataset.txt',
                   sizes='sacCer3.chrom.sizes',
                   index=None):
    '''Merge bigWig files related to samples.'''
    datasets_columns = Parser.columns(datasets)
    if index is not None:
        datasets_columns = [datasets_columns[index]]
    for columns in datasets_columns:
        name = columns[0]
        samples = [sample for sample in columns[1:]]
        merge_dataset(name, samples, sizes)
Example #14
def bwa_samples(samples='samples.txt',
                fasta='sacCer3.fa',
                threads=None,
                output_suffix='',
                index=None,
                bwa_args=()):
    '''Align samples using the bwa program.'''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        bwa_sample(sample, fasta, threads, output_suffix, bwa_args)
Example #15
def shift_annotations_samples(samples='samples.txt',
                              input_suffix='',
                              output_suffix='-forcov',
                              index=None,
                              bedtools_args=()):
    '''Moves annotations contained in BED files.'''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        shift_annotations_sample(sample, input_suffix, output_suffix,
                                 bedtools_args)
Example #16
def test_columns_merge():
    samples = Path(__file__).parent.parent.joinpath('dataset.txt')
    columns = p.columns(samples)
    assert columns[0][0] == 'POLR2A'
    assert columns[0][1] == 'POLR2A_1'
    assert columns[0][2] == 'POLR2A_2'
    assert columns[1][0] == 'ASDURF'
    assert columns[1][1] == 'ASDURF_1'
    assert columns[1][2] == 'ASDURF_2'
    assert columns[2][0] == 'POLR1C'
    assert columns[2][1] == 'POLR1C_1'
    assert columns[2][2] == 'POLR1C_2'
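Likewise, these assertions suggest that dataset.txt lists a dataset name followed by the samples to merge, one tab-separated line per dataset:

POLR2A	POLR2A_1	POLR2A_2
ASDURF	ASDURF_1	ASDURF_2
POLR1C	POLR1C_1	POLR1C_2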
Example #17
def download_samples(samples='samples.txt',
                     fast=True,
                     threads=None,
                     mem='100MB',
                     index=None):
    '''Download reads of all samples.'''
    sample_columns = Parser.columns(samples)
    if index is not None:
        sample_columns = [sample_columns[index]]
    for columns in sample_columns:
        sample = columns[0]
        srr = columns[1] if len(columns) > 1 else None
        download_sample(sample, srr, fast, threads, mem)
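A hedged usage sketch: each samples.txt line supplies a sample name and, optionally, an SRA accession that is passed to download_sample. The call below restricts the run to the second sample; the threads and mem values are illustrative and their exact effect depends on download_sample:

# Illustrative call; index=1 selects the second line of samples.txt.
download_samples(samples='samples.txt', fast=True, threads=4, mem='4GB', index=1)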
Example #18
def filter_bam(samples='samples.txt',
               paired=True,
               dedup=True,
               threads=None,
               input_suffix='',
               output_suffix='',
               index=None):
    '''Filter BAM file to keep only properly paired reads and remove supplementary alignments and duplicates.'''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        filter_bam_sample(sample, paired, dedup, threads, input_suffix,
                          output_suffix)
Example #19
def fit_double_gaussian(samples='samples.txt',
                        absolute=False,
                        components=False,
                        gaussian=False,
                        svg=False,
                        verbose=False,
                        center1=None,
                        cmin1=None,
                        cmax1=None,
                        amp1=None,
                        amin1=None,
                        sigma1=None,
                        smin1=None,
                        center2=None,
                        cmin2=None,
                        cmax2=None,
                        amp2=None,
                        amin2=None,
                        sigma2=None,
                        smin2=None,
                        suffix=None,
                        index=None):
    '''Fits a double Gaussian curve to dyad coverage.'''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        fit_double_gaussian_sample(sample, absolute, components, gaussian, svg,
                                   verbose, center1, cmin1, cmax1, amp1, amin1,
                                   sigma1, smin1, center2, cmin2, cmax2, amp2,
                                   amin2, sigma2, smin2, suffix)
        splits = sb.splits(sample)
        for split in splits:
            fit_double_gaussian_sample(split, absolute, components, gaussian,
                                       svg, verbose, center1, cmin1, cmax1,
                                       amp1, amin1, sigma1, smin1, center2,
                                       cmin2, cmax2, amp2, amin2, sigma2,
                                       smin2, suffix)
Example #20
def merge_dataset(name, samples, sizes):
    '''Merge bigWig files related to samples.'''
    print('Merging samples {} into dataset {}'.format(samples, name))
    sizes_columns = Parser.columns(sizes)
    bws = [pbw.open(sample + '.bw') for sample in samples]
    merge_temp_o, merge_temp = tempfile.mkstemp(suffix='.bed')
    with open(merge_temp_o, 'w') as output:
        output.write('track type=bedGraph name="' + name + '"\n')
        for size_columns in sizes_columns:
            chromosome = size_columns[0]
            size = int(size_columns[1])  # chromosome length must be an int for the list below
            sums = [0] * size
            for bw in bws:
                bw_size = bw.chroms(chromosome) if bw.chroms(chromosome) else 0
                if bw_size == 0:
                    continue
                values = bw.values(chromosome, 0, min(size, bw_size))
                sums = [
                    sums[i] + (values[i] if not math.isnan(values[i]) else 0)
                    for i in range(0, min(size, bw_size))
                ]
            for i in range(0, len(sums)):
                output.write(chromosome)
                output.write('\t')
                output.write(str(i))
                output.write('\t')
                output.write(str(i + 1))
                output.write('\t')
                output.write(str(sums[i]))
                output.write('\n')
    sort_temp_o, sort_temp = tempfile.mkstemp(suffix='.bed')
    Bed.sort(merge_temp, sort_temp)
    merged_bw = name + '.bw'
    Bed.bedgraph_to_bigwig(sort_temp, merged_bw, sizes)
    os.remove(sort_temp)
    os.remove(merge_temp)
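The inner loop treats missing coverage (NaN values from pyBigWig) as zero and only sums positions up to min(size, bw_size), so it effectively assumes every bigWig covers the whole chromosome. A minimal, self-contained illustration of that summation step, where the lists stand in for bw.values output and sums keeps the full chromosome length:

import math

size = 5                                   # chromosome length from the sizes file
tracks = [                                 # per-sample values, as bw.values would return them
    [1.0, float('nan'), 2.0, 0.5],         # this track ends one base early
    [0.0, 1.0, float('nan'), 1.0, 3.0],
]
sums = [0] * size
for values in tracks:
    for i in range(min(size, len(values))):
        v = values[i]
        sums[i] += 0 if math.isnan(v) else v   # NaN means no coverage at this base
print(sums)  # [1.0, 1.0, 2.0, 1.5, 3.0]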
Example #21
def test_first_merge():
    samples = Path(__file__).parent.parent.joinpath('dataset.txt')
    names = p.first(samples)
    assert names == ['POLR2A', 'ASDURF', 'POLR1C']