def test_split_sample(testdir, mock_testclass):
    '''split_sample bins the size-sorted annotations into one BED per bin.

    Bed.sort_bysize, Bed.sort and os.remove are replaced by mocks so the
    temporary files handed to Bed.sort can be inspected before deletion.
    '''
    sample = 'POLR2A'
    bed = sample + '.bed'
    sort = sample + '-sort.bed'
    sort_copy = Path(__file__).parent.joinpath('sample-split.bed')
    copyfile(sort_copy, sort)
    Bed.sort_bysize = MagicMock(side_effect=create_file_sort)
    Bed.sort = MagicMock()
    real_remove = os.remove
    remove_mock = MagicMock()
    os.remove = remove_mock
    try:
        binlength = 10
        binminlength = 100
        binmaxlength = 130
        s.split_sample(sample, binlength, binminlength, binmaxlength)
        Bed.sort_bysize.assert_called_once_with(bed, ANY)
        Bed.sort.assert_any_call(ANY, sample + '-100-110.bed')
        Bed.sort.assert_any_call(ANY, sample + '-110-120.bed')
        Bed.sort.assert_any_call(ANY, sample + '-120-130.bed')
        # The first argument of each Bed.sort call is the unsorted bin file.
        with open(Bed.sort.call_args_list[0].args[0], 'r') as infile:
            assert infile.readline() == 'chr4\t800\t900\ttest4\t4\t+\n'
            assert infile.readline() == ''
        with open(Bed.sort.call_args_list[1].args[0], 'r') as infile:
            assert infile.readline() == 'chr8\t800\t910\ttest8\t4\t-\n'
            assert infile.readline() == ''
        with open(Bed.sort.call_args_list[2].args[0], 'r') as infile:
            assert infile.readline() == 'chr5\t100\t220\ttest5\t1\t-\n'
            assert infile.readline() == 'chr1\t100\t229\ttest1\t1\t+\n'
            assert infile.readline() == ''
    finally:
        # Always put the real os.remove back, even when an assertion fails,
        # so the mock cannot leak into other tests.
        os.remove = real_remove
    # Perform the deletions that were intercepted by the mock.
    for remove_args in remove_mock.call_args_list:
        real_remove(remove_args.args[0])
def test_split_samples(testdir, mock_testclass):
    '''split_samples calls split_sample for every sample with 10, 100, 500.'''
    samples_file = Path(__file__).parent.joinpath('samples.txt')
    s.split_sample = MagicMock()

    s.split_samples(samples_file)

    for expected_sample in ('POLR2A', 'ASDURF', 'POLR1C'):
        s.split_sample.assert_any_call(expected_sample, 10, 100, 500)
def test_split_samples_second_parameters(testdir, mock_testclass):
    '''With index 1 and explicit bins, only 'ASDURF' is split with those bins.'''
    samples_file = Path(__file__).parent.joinpath('samples.txt')
    s.split_sample = MagicMock()
    bins = (20, 200, 400)  # binlength, binminlength, binmaxlength

    s.split_samples(samples_file, 1, *bins)

    s.split_sample.assert_called_once_with('ASDURF', *bins)
def headers(samples, datasets):
    '''Statistics headers.

    Builds the fixed statistics columns followed by one column per split
    suffix found for any sample or dataset.

    :param samples: iterable of sample names passed to Split.splits
    :param datasets: iterable of dataset names passed to Split.splits,
        or a falsy value when there are none
    :return: tuple (all headers, split headers only); split headers are
        ordered using Split.splitkey
    '''
    headers = ['Sample', 'Total reads', 'Mapped reads', 'Deduplicated reads']
    splits_headers = set()
    for sample in samples:
        # Drop the "<sample>" prefix and the separator character after it.
        splits_headers.update(split[len(sample) + 1:]
                              for split in Split.splits(sample))
    if datasets:
        for dataset in datasets:
            # Use the dataset itself here; the previous code reused the last
            # sample, which ignored dataset splits and failed when samples
            # was empty.
            splits_headers.update(split[len(dataset) + 1:]
                                  for split in Split.splits(dataset))
    splits_headers = sorted(splits_headers, key=Split.splitkey)
    headers.extend(splits_headers)
    return (headers, splits_headers)
def fit_gaussian(samples='samples.txt', absolute=False, components=False,
                 svg=False, verbose=False, center=None, cmin=None, cmax=None,
                 amp=None, amin=None, sigma=None, smin=None, suffix=None,
                 index=None):
    '''Fits gaussian curve to dyad coverage.

    Runs fit_gaussian_sample on every sample and then on each of that
    sample's splits (as returned by sb.splits).

    :param samples: samples file read through Parser.first
    :param index: when not None, only the sample at this position is
        processed; index 0 is a valid selection
    All other parameters are forwarded unchanged to fit_gaussian_sample.
    '''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    fit_args = (absolute, components, svg, verbose, center, cmin, cmax, amp,
                amin, sigma, smin, suffix)
    for sample in sample_names:
        fit_gaussian_sample(sample, *fit_args)
        # Splits are looked up only after the sample itself was processed.
        for split in sb.splits(sample):
            fit_gaussian_sample(split, *fit_args)
def test_split_samples_parameters(testdir, mock_testclass):
    '''Keyword bin parameters are forwarded to split_sample for each sample.'''
    samples_file = Path(__file__).parent.joinpath('samples.txt')
    s.split_sample = MagicMock()
    length, minimum, maximum = 20, 200, 400

    s.split_samples(samples_file, binlength=length, binminlength=minimum,
                    binmaxlength=maximum)

    for expected_sample in ('POLR2A', 'ASDURF', 'POLR1C'):
        s.split_sample.assert_any_call(expected_sample, length, minimum,
                                       maximum)
def center_annotations_sample_splits(sample, input_suffix='', output_suffix='-forcov'):
    '''Center annotations of a single sample and of each of its splits.

    Calls center_annotations_sample on the sample first, then on every
    split returned by sb.splits, using the same suffixes.
    '''
    print('Center annotations on sample {}'.format(sample))
    suffixes = (input_suffix, output_suffix)
    center_annotations_sample(sample, *suffixes)
    for split_name in sb.splits(sample):
        center_annotations_sample(split_name, *suffixes)
def dyad_coverage(samples, genes='genes.txt', selection=None, absolute=False,
                  minp=-75, maxp=75, smoothing=None, suffix=None, index=None):
    '''Finds the distribution of distances between fragments and dyad.

    Runs dyad_coverage_sample on every sample and then on each of that
    sample's splits (as returned by sb.splits).

    :param samples: samples file read through Parser.first
    :param genes: tab-separated genes file; lines starting with '#' are
        skipped and rows whose 7th column equals -1 are dropped
    :param selection: optional file read through Parser.first; when given,
        only rows whose 3rd column is in that list are kept
    :param index: when not None, only the sample at this position is
        processed; index 0 is a valid selection
    Remaining parameters are forwarded to dyad_coverage_sample.
    '''
    genes_info = pd.read_csv(genes, sep='\t', comment='#')
    genes_info = genes_info.loc[genes_info[genes_info.columns[6]] != -1]
    if selection:
        selection_genes = Parser.first(selection)
        genes_info = genes_info[
            genes_info[genes_info.columns[2]].isin(selection_genes)]
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    # dyad_coverage_sample takes suffix before smoothing.
    coverage_args = (genes_info, absolute, minp, maxp, suffix, smoothing)
    for sample in sample_names:
        dyad_coverage_sample(sample, *coverage_args)
        for split in sb.splits(sample):
            dyad_coverage_sample(split, *coverage_args)
def sample_splits_prepgenomecov(sample):
    '''Prepare the genome coverage BED for a sample and each of its splits.

    Calls prepare_genome_coverage_sample on the sample first, then on every
    split returned by sb.splits.
    '''
    print('Compute genome coverage on sample {}'.format(sample))
    prepare_genome_coverage_sample(sample)
    for split_name in sb.splits(sample):
        prepare_genome_coverage_sample(split_name)
def sample_splits_genome_coverage(sample, genome, scale=None, strand=None, input_suffix='', output_suffix='-cov', spike_suffix=None, control_suffix=None, genomecov_args=()):
    '''Compute genome coverage for a sample and for each of its splits.

    Calls genome_coverage on the sample first, then on every split returned
    by Split.splits, forwarding all other arguments unchanged.
    '''
    print('Computing genome coverage on sample {}'.format(sample))
    forwarded = (genome, scale, strand, input_suffix, output_suffix,
                 spike_suffix, control_suffix, genomecov_args)
    genome_coverage(sample, *forwarded)
    for split_name in Split.splits(sample):
        genome_coverage(split_name, *forwarded)
def ignore_strand_sample_splits(sample, input_suffix='', output_suffix='-forcov'):
    '''Run ignore_strand_sample on a sample and on each of its splits.

    The sample is processed first, then every split returned by sb.splits,
    using the same suffixes.
    '''
    print('Duplicate annotations on other strand on sample {}'.format(sample))
    suffixes = (input_suffix, output_suffix)
    ignore_strand_sample(sample, *suffixes)
    for split_name in sb.splits(sample):
        ignore_strand_sample(split_name, *suffixes)
def test_splits_2(testdir, mock_testclass):
    '''splits lists existing split BED files, without extension, in order.'''
    sample = 'POLR2A'
    expected = [sample + suffix
                for suffix in ('-100-150', '-200-250', '-300-350', '-400-450')]
    for name in expected:
        Path(name + '.bed').touch()

    splits = s.splits(sample)

    for position, name in enumerate(expected):
        assert splits[position] == name
    assert len(splits) == 4
def test_splits(testdir, mock_testclass):
    '''splits lists adjacent 10-wide split BED files, without extension.'''
    sample = 'POLR2A'
    expected = [sample + suffix
                for suffix in ('-100-110', '-110-120', '-120-130', '-130-140')]
    for name in expected:
        Path(name + '.bed').touch()

    splits = s.splits(sample)

    for position, name in enumerate(expected):
        assert splits[position] == name
    assert len(splits) == 4
def dyadcov(samples, genes, minp, maxp, smoothing, index):
    '''Finds the distribution of distances between fragments and dyad.

    Configures DEBUG logging to 'debug.log', then runs dyad_coverage_sample
    on every sample and on each of that sample's splits (from sb.splits).

    :param samples: tab-separated samples file; names are in the first
        column and lines starting with '#' are skipped
    :param genes: tab-separated genes file; rows whose 7th column equals -1
        are dropped
    :param index: when not None, only the sample at this position is
        processed; index 0 is a valid selection
    minp, maxp and smoothing are forwarded to dyad_coverage_sample.
    '''
    logging.basicConfig(filename='debug.log', level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    genes_info = pd.read_csv(genes, sep='\t', comment='#')
    genes_info = genes_info.loc[genes_info[genes_info.columns[6]] != -1]
    sample_names = pd.read_csv(samples, header=None, sep='\t', comment='#')[0]
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        dyad_coverage_sample(sample, genes_info, minp, maxp, smoothing)
        for split in sb.splits(sample):
            dyad_coverage_sample(split, genes_info, minp, maxp, smoothing)
def fitgaussian(samples, components, svg, verbose, center, cmin, cmax, amp,
                amin, sigma, smin, index):
    '''Fits gaussian curve to dyad coverage.

    Configures DEBUG logging to 'debug.log', then runs fitgaussian_sample on
    every sample and on each of that sample's splits (from sb.splits).

    :param samples: tab-separated samples file; names are in the first
        column and lines starting with '#' are skipped
    :param index: when not None, only the sample at this position is
        processed; index 0 is a valid selection
    All other parameters are forwarded unchanged to fitgaussian_sample.
    '''
    logging.basicConfig(filename='debug.log', level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    sample_names = pd.read_csv(samples, header=None, sep='\t', comment='#')[0]
    if index is not None:
        sample_names = [sample_names[index]]
    fit_args = (components, svg, verbose, center, cmin, cmax, amp, amin,
                sigma, smin)
    for sample in sample_names:
        fitgaussian_sample(sample, *fit_args)
        for split in sb.splits(sample):
            fitgaussian_sample(split, *fit_args)
def fit_double_gaussian(samples='samples.txt', absolute=False,
                        components=False, gaussian=False, svg=False,
                        verbose=False, center1=None, cmin1=None, cmax1=None,
                        amp1=None, amin1=None, sigma1=None, smin1=None,
                        center2=None, cmin2=None, cmax2=None, amp2=None,
                        amin2=None, sigma2=None, smin2=None, suffix=None,
                        index=None):
    '''Fits double gaussian curve to dyad coverage.

    Runs fit_double_gaussian_sample on every sample and then on each of
    that sample's splits (as returned by sb.splits).

    :param samples: samples file read through Parser.first
    :param index: when not None, only the sample at this position is
        processed; index 0 is a valid selection
    All other parameters are forwarded unchanged to
    fit_double_gaussian_sample.
    '''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    # One shared tuple keeps the sample and split calls in sync.
    fit_args = (absolute, components, gaussian, svg, verbose,
                center1, cmin1, cmax1, amp1, amin1, sigma1, smin1,
                center2, cmin2, cmax2, amp2, amin2, sigma2, smin2, suffix)
    for sample in sample_names:
        fit_double_gaussian_sample(sample, *fit_args)
        for split in sb.splits(sample):
            fit_double_gaussian_sample(split, *fit_args)
def test_annotation_length_invalid(testdir, mock_testclass):
    '''A line with only two tab-separated fields yields a length of -1.'''
    incomplete_line = 'chr1\t300'
    assert s.annotation_length(incomplete_line) == -1
def test_splitkey_invalid(testdir, mock_testclass):
    '''splitkey raises AttributeError for a name without any bin bounds.'''
    name_without_bounds = 'POLR2A'
    with pytest.raises(AttributeError):
        s.splitkey(name_without_bounds)
def test_splitkey_2(testdir, mock_testclass):
    '''splitkey returns 350 for 'POLR2A-350-680'.'''
    assert s.splitkey('POLR2A-350-680') == 350
def test_splitkey(testdir, mock_testclass):
    '''splitkey returns 120 for 'POLR2A-120-150'.'''
    assert s.splitkey('POLR2A-120-150') == 120
def test_splits_none(testdir, mock_testclass):
    '''splits returns an empty collection when no split file was created.'''
    found = s.splits('POLR2A')
    assert len(found) == 0
def test_annotation_length(testdir, mock_testclass):
    '''An annotation from 100 to 250 has a length of 150.'''
    bed_line = 'chr1\t100\t250\ttest1'
    assert s.annotation_length(bed_line) == 150
def test_split_samples_second(testdir, mock_testclass):
    '''With index 1, only 'ASDURF' is split, using 10, 100, 500.'''
    samples_file = Path(__file__).parent.joinpath('samples.txt')
    s.split_sample = MagicMock()

    s.split_samples(samples_file, 1)

    s.split_sample.assert_called_once_with('ASDURF', 10, 100, 500)
def test_splitkey_noend(testdir, mock_testclass):
    '''splitkey raises AttributeError when the name has only one bound.'''
    name_without_end = 'POLR2A-350'
    with pytest.raises(AttributeError):
        s.splitkey(name_without_end)
def test_annotation_length_2(testdir, mock_testclass):
    '''An annotation from 300 to 680 has a length of 380.'''
    bed_line = 'chr1\t300\t680\ttest1'
    assert s.annotation_length(bed_line) == 380