def headers(samples, datasets):
    '''Build the column headers of the statistics table.

    The fixed columns come first, followed by one column per distinct split
    suffix found for any sample or dataset.

    :param samples: iterable of sample names
    :param datasets: iterable of dataset names; may be None or empty
    :return: tuple ``(headers, splits_headers)`` where ``headers`` is the
        complete list of columns and ``splits_headers`` is the sorted list of
        split columns only
    '''
    columns = ['Sample', 'Total reads', 'Mapped reads', 'Deduplicated reads']
    names = list(samples)
    if datasets:
        names.extend(datasets)
    split_suffixes = set()
    for name in names:
        # Each split is cut after its OWN name plus one separator character.
        # Datasets used to be cut using the last sample's name, which also
        # meant the dataset splits themselves were never looked up.
        prefix_length = len(name) + 1
        split_suffixes.update(
            split[prefix_length:] for split in Split.splits(name))
    splits_headers = sorted(split_suffixes, key=Split.splitkey)
    columns.extend(splits_headers)
    return (columns, splits_headers)
def sample_splits_prepgenomecov(sample):
    '''Run the genome coverage preparation step on a sample and its splits.

    The sample itself is processed first; its splits are listed only
    afterwards, through ``sb.splits``, and then processed one by one.

    :param sample: sample name
    '''
    print('Compute genome coverage on sample {}'.format(sample))
    prepare_genome_coverage_sample(sample)
    for split_name in sb.splits(sample):
        prepare_genome_coverage_sample(split_name)
def sample_splits_genome_coverage(sample, genome, scale=None, strand=None,
                                  input_suffix='', output_suffix='-cov',
                                  spike_suffix=None, control_suffix=None,
                                  genomecov_args=()):
    '''Run ``genome_coverage`` on a sample, then on each of its splits.

    Every argument other than ``sample`` is forwarded unchanged to
    ``genome_coverage``, both for the sample and for each split returned by
    ``Split.splits``.

    :param sample: sample name
    :param genome: forwarded to ``genome_coverage``
    :param scale: forwarded to ``genome_coverage``
    :param strand: forwarded to ``genome_coverage``
    :param input_suffix: forwarded to ``genome_coverage``
    :param output_suffix: forwarded to ``genome_coverage``
    :param spike_suffix: forwarded to ``genome_coverage``
    :param control_suffix: forwarded to ``genome_coverage``
    :param genomecov_args: forwarded to ``genome_coverage``
    '''
    print('Computing genome coverage on sample {}'.format(sample))
    # Same positional arguments for the sample and for all of its splits.
    shared_args = (genome, scale, strand, input_suffix, output_suffix,
                   spike_suffix, control_suffix, genomecov_args)
    genome_coverage(sample, *shared_args)
    for split_name in Split.splits(sample):
        genome_coverage(split_name, *shared_args)
def center_annotations_sample_splits(sample, input_suffix='',
                                     output_suffix='-forcov'):
    '''Run ``center_annotations_sample`` on a sample, then on its splits.

    The sample is processed first; its splits are listed afterwards through
    ``sb.splits`` and processed with the same suffixes.

    :param sample: sample name
    :param input_suffix: forwarded to ``center_annotations_sample``
    :param output_suffix: forwarded to ``center_annotations_sample``
    '''
    print('Center annotations on sample {}'.format(sample))
    center_annotations_sample(sample, input_suffix, output_suffix)
    for split_name in sb.splits(sample):
        center_annotations_sample(split_name, input_suffix, output_suffix)
def dyad_coverage(samples, genes='genes.txt', selection=None, absolute=False,
                  minp=-75, maxp=75, smoothing=None, suffix=None, index=None):
    '''Finds the distribution of distances between fragments and dyad.

    Genes are read from a tab-separated file, filtered, and passed to
    ``dyad_coverage_sample`` for every sample and for every split of every
    sample.

    :param samples: samples file, read with ``Parser.first``
    :param genes: tab-separated genes file; lines starting with '#' are
        treated as comments
    :param selection: optional file read with ``Parser.first``; when given,
        only genes whose third column is in that selection are kept
    :param absolute: forwarded to ``dyad_coverage_sample``
    :param minp: forwarded to ``dyad_coverage_sample``
    :param maxp: forwarded to ``dyad_coverage_sample``
    :param smoothing: forwarded to ``dyad_coverage_sample``
    :param suffix: forwarded to ``dyad_coverage_sample``
    :param index: when not None, process only the sample at this index
    '''
    genes_info = pd.read_csv(genes, sep='\t', comment='#')
    # Drop genes whose seventh column holds -1.
    genes_info = genes_info.loc[genes_info[genes_info.columns[6]] != -1]
    if selection:
        selection_genes = Parser.first(selection)
        genes_info = genes_info[
            genes_info[genes_info.columns[2]].isin(selection_genes)]
    sample_names = Parser.first(samples)
    # Identity test so that index 0 is honoured and the check never depends
    # on a custom __ne__ implementation.
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        # NOTE(review): suffix is passed before smoothing, unlike this
        # function's own parameter order - confirm against the signature of
        # dyad_coverage_sample.
        dyad_coverage_sample(sample, genes_info, absolute, minp, maxp,
                             suffix, smoothing)
        # Splits are listed only after the sample itself has been processed.
        splits = sb.splits(sample)
        for split in splits:
            dyad_coverage_sample(split, genes_info, absolute, minp, maxp,
                                 suffix, smoothing)
def fit_gaussian(samples='samples.txt', absolute=False, components=False,
                 svg=False, verbose=False, center=None, cmin=None, cmax=None,
                 amp=None, amin=None, sigma=None, smin=None, suffix=None,
                 index=None):
    '''Fits gaussian curve to dyad coverage.

    Calls ``fit_gaussian_sample`` for every sample and for every split of
    every sample. All fitting options are forwarded unchanged.

    :param samples: samples file, read with ``Parser.first``
    :param absolute: forwarded to ``fit_gaussian_sample``
    :param components: forwarded to ``fit_gaussian_sample``
    :param svg: forwarded to ``fit_gaussian_sample``
    :param verbose: forwarded to ``fit_gaussian_sample``
    :param center: forwarded to ``fit_gaussian_sample``
    :param cmin: forwarded to ``fit_gaussian_sample``
    :param cmax: forwarded to ``fit_gaussian_sample``
    :param amp: forwarded to ``fit_gaussian_sample``
    :param amin: forwarded to ``fit_gaussian_sample``
    :param sigma: forwarded to ``fit_gaussian_sample``
    :param smin: forwarded to ``fit_gaussian_sample``
    :param suffix: forwarded to ``fit_gaussian_sample``
    :param index: when not None, process only the sample at this index
    '''
    sample_names = Parser.first(samples)
    # Identity test so that index 0 is honoured and the check never depends
    # on a custom __ne__ implementation.
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        fit_gaussian_sample(sample, absolute, components, svg, verbose,
                            center, cmin, cmax, amp, amin, sigma, smin,
                            suffix)
        # Splits are listed only after the sample itself has been processed.
        splits = sb.splits(sample)
        for split in splits:
            fit_gaussian_sample(split, absolute, components, svg, verbose,
                                center, cmin, cmax, amp, amin, sigma, smin,
                                suffix)
def ignore_strand_sample_splits(sample, input_suffix='',
                                output_suffix='-forcov'):
    '''Run ``ignore_strand_sample`` on a sample, then on each of its splits.

    The sample is processed first; its splits are listed afterwards through
    ``sb.splits`` and processed with the same suffixes.

    :param sample: sample name
    :param input_suffix: forwarded to ``ignore_strand_sample``
    :param output_suffix: forwarded to ``ignore_strand_sample``
    '''
    print('Duplicate annotations on other strand on sample {}'.format(sample))
    ignore_strand_sample(sample, input_suffix, output_suffix)
    for split_name in sb.splits(sample):
        ignore_strand_sample(split_name, input_suffix, output_suffix)
def test_splits_2(testdir, mock_testclass):
    '''s.splits lists the four non-adjacent split ranges in ascending order.'''
    sample = 'POLR2A'
    suffixes = ['-100-150', '-200-250', '-300-350', '-400-450']
    # Create one empty BED file per split, relative to the current directory.
    for suffix in suffixes:
        Path(sample + suffix + '.bed').touch()
    splits = s.splits(sample)
    for position, suffix in enumerate(suffixes):
        assert splits[position] == sample + suffix
    assert len(splits) == 4
def test_splits(testdir, mock_testclass):
    '''s.splits lists the four contiguous split ranges in ascending order.'''
    sample = 'POLR2A'
    suffixes = ['-100-110', '-110-120', '-120-130', '-130-140']
    # Create one empty BED file per split, relative to the current directory.
    for suffix in suffixes:
        Path(sample + suffix + '.bed').touch()
    splits = s.splits(sample)
    for position, suffix in enumerate(suffixes):
        assert splits[position] == sample + suffix
    assert len(splits) == 4
def dyadcov(samples, genes, minp, maxp, smoothing, index):
    '''Finds the distribution of distances between fragments and dyad.

    Configures logging to ``debug.log``, reads the genes and samples files,
    then calls ``dyad_coverage_sample`` for every sample and for every split
    of every sample.

    :param samples: tab-separated samples file without header; sample names
        are taken from the first column
    :param genes: tab-separated genes file; lines starting with '#' are
        treated as comments
    :param minp: forwarded to ``dyad_coverage_sample``
    :param maxp: forwarded to ``dyad_coverage_sample``
    :param smoothing: forwarded to ``dyad_coverage_sample``
    :param index: when not None, process only the sample at this index
    '''
    logging.basicConfig(filename='debug.log', level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    genes_info = pd.read_csv(genes, sep='\t', comment='#')
    # Drop genes whose seventh column holds -1.
    genes_info = genes_info.loc[genes_info[genes_info.columns[6]] != -1]
    sample_names = pd.read_csv(samples, header=None, sep='\t',
                               comment='#')[0]
    # Identity test so that index 0 is honoured and the check never depends
    # on a custom __ne__ implementation.
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        dyad_coverage_sample(sample, genes_info, minp, maxp, smoothing)
        # Splits are listed only after the sample itself has been processed.
        splits = sb.splits(sample)
        for split in splits:
            dyad_coverage_sample(split, genes_info, minp, maxp, smoothing)
def fitgaussian(samples, components, svg, verbose, center, cmin, cmax, amp,
                amin, sigma, smin, index):
    '''Fits gaussian curve to dyad coverage.

    Configures logging to ``debug.log``, reads the samples file, then calls
    ``fitgaussian_sample`` for every sample and for every split of every
    sample. All fitting options are forwarded unchanged.

    :param samples: tab-separated samples file without header; sample names
        are taken from the first column
    :param components: forwarded to ``fitgaussian_sample``
    :param svg: forwarded to ``fitgaussian_sample``
    :param verbose: forwarded to ``fitgaussian_sample``
    :param center: forwarded to ``fitgaussian_sample``
    :param cmin: forwarded to ``fitgaussian_sample``
    :param cmax: forwarded to ``fitgaussian_sample``
    :param amp: forwarded to ``fitgaussian_sample``
    :param amin: forwarded to ``fitgaussian_sample``
    :param sigma: forwarded to ``fitgaussian_sample``
    :param smin: forwarded to ``fitgaussian_sample``
    :param index: when not None, process only the sample at this index
    '''
    logging.basicConfig(filename='debug.log', level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    sample_names = pd.read_csv(samples, header=None, sep='\t',
                               comment='#')[0]
    # Identity test so that index 0 is honoured and the check never depends
    # on a custom __ne__ implementation.
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        fitgaussian_sample(sample, components, svg, verbose, center, cmin,
                           cmax, amp, amin, sigma, smin)
        # Splits are listed only after the sample itself has been processed.
        splits = sb.splits(sample)
        for split in splits:
            fitgaussian_sample(split, components, svg, verbose, center, cmin,
                               cmax, amp, amin, sigma, smin)
def fit_double_gaussian(samples='samples.txt', absolute=False,
                        components=False, gaussian=False, svg=False,
                        verbose=False, center1=None, cmin1=None, cmax1=None,
                        amp1=None, amin1=None, sigma1=None, smin1=None,
                        center2=None, cmin2=None, cmax2=None, amp2=None,
                        amin2=None, sigma2=None, smin2=None, suffix=None,
                        index=None):
    '''Fits double gaussian curve to dyad coverage.

    Calls ``fit_double_gaussian_sample`` for every sample and for every split
    of every sample. Every option other than ``samples`` and ``index`` is
    forwarded unchanged; the ``*1`` and ``*2`` parameters are passed in the
    order shown in the signature.

    :param samples: samples file, read with ``Parser.first``
    :param index: when not None, process only the sample at this index
    '''
    sample_names = Parser.first(samples)
    # Identity test so that index 0 is honoured and the check never depends
    # on a custom __ne__ implementation.
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        fit_double_gaussian_sample(sample, absolute, components, gaussian,
                                   svg, verbose, center1, cmin1, cmax1, amp1,
                                   amin1, sigma1, smin1, center2, cmin2,
                                   cmax2, amp2, amin2, sigma2, smin2, suffix)
        # Splits are listed only after the sample itself has been processed.
        splits = sb.splits(sample)
        for split in splits:
            fit_double_gaussian_sample(split, absolute, components, gaussian,
                                       svg, verbose, center1, cmin1, cmax1,
                                       amp1, amin1, sigma1, smin1, center2,
                                       cmin2, cmax2, amp2, amin2, sigma2,
                                       smin2, suffix)
def test_splits_none(testdir, mock_testclass):
    '''s.splits returns an empty result when the test creates no split file.'''
    splits = s.splits('POLR2A')
    assert len(splits) == 0