Example #1
0
    def produce(self, unknown_args=None):
        """Attach a merged sample collection to the parsed namespace.

        Every entry of ``opts.files`` is loaded through
        ``SampleCollection.from_file`` and the resulting parts are summed
        into a single collection stored as ``opts.sample_collection``.
        When ``opts.files`` is empty or unset the namespace is returned
        without that attribute being assigned.

        ``unknown_args`` is accepted but not used by this method.

        Returns:
            The namespace object (``self.namespace``).
        """
        opts = self.namespace
        name = opts.name or self.name

        # Nothing to load: hand the namespace back untouched.
        if not opts.files:
            return opts

        # A callable header is resolved once per file and the resulting
        # list replaces the callable on the namespace.
        if callable(opts.header):
            opts.header = [opts.header(f) for f in opts.files]

        parts = []
        for i, file_obj in enumerate(opts.files):
            header = opts.header[i]
            # An integer header is passed on as `header_line`;
            # any other value is passed on as `prefix` instead.
            use_header = isinstance(header, int)

            if opts.columns:
                columns_selector = opts.columns[i].get_iterator
            else:
                columns_selector = None

            if opts.samples:
                samples = opts.samples[i]
            else:
                samples = None

            part = SampleCollection.from_file(
                f'Sample collection, part {i} of {name}',
                file_obj,
                columns_selector=columns_selector,
                samples=samples,
                reverse_selection=getattr(opts, 'reverse', False),
                delimiter=opts.delimiter,
                header_line=header if use_header else None,
                use_header=use_header,
                prefix=None if use_header else header,
                description_column=opts.description_column,
            )
            parts.append(part)

        # The sum starts from an empty collection carrying the final name.
        opts.sample_collection = sum(parts, SampleCollection(name))
        return opts
Example #2
0
def minimal_data():
    """Build a minimal data set: two genes and two one-sample collections.

    Returns:
        A tuple ``(tp53, map2k1, case, control)`` where ``case`` and
        ``control`` are collections that each hold a single sample
        labelled ``'1'``.
    """
    tp53, map2k1 = Gene('TP53'), Gene('MAP2K1')

    case_sample = Sample('1', {tp53: 2, map2k1: 1})
    case = SampleCollection('case', [case_sample])

    control_sample = Sample('1', {tp53: 1, map2k1: 1})
    control = SampleCollection('control', [control_sample])

    return tp53, map2k1, case, control
Example #3
0
def test_ttest():
    """ttest on a two-vs-two experiment returns a Series keyed by every gene."""
    tumour_data = {
        'Tumour_1': {'BAD': 1.2345, 'FUCA2': 6.5432},
        'Tumour_2': {'BAD': 2.3456, 'FUCA2': 7.6543},
    }
    normal_data = {
        'Normal_1': {'BAD': 6.3456, 'FUCA2': 11.6543},
        'Normal_2': {'BAD': 7.1111, 'FUCA2': 9.9711},
    }

    tumour_samples = [
        Sample.from_names(label, values)
        for label, values in tumour_data.items()
    ]
    normal_samples = [
        Sample.from_names(label, values)
        for label, values in normal_data.items()
    ]

    experiment = Experiment(
        case=SampleCollection('Tumour', tumour_samples),
        control=SampleCollection('Normal', normal_samples),
    )

    result = ttest(experiment)
    assert isinstance(result, pd.Series)

    # Every gene of the combined collection must appear among the result keys.
    result_keys = list(result.keys())
    assert all(gene in result_keys for gene in experiment.get_all().genes)
Example #4
0
def test_from_csv():
    """from_csv_file loads four samples and warns on a non-comma delimiter."""
    file_name = 'all_samples.csv'

    # Default delimiter: the fixture is expected to yield four samples.
    with temp_text_file(csv_contents) as csv_file:
        loaded = SampleCollection.from_csv_file(file_name, csv_file)
        assert len(loaded.samples) == 4

    # A tab delimiter must trigger the UserWarning below.
    delimiter_warning = (
        'You are using not comma delimiter for what looks like csv file.'
    )
    with temp_text_file(csv_contents) as csv_file:
        with warns(UserWarning, match=delimiter_warning):
            SampleCollection.from_csv_file(
                file_name, csv_file, delimiter='\t'
            )
Example #5
0
def test_init():
    """A SampleCollection keeps its name and exposes Sample instances."""
    expression_by_label = {
        'Tumour_1': {Gene('BAD'): 1.2345, Gene('FUCA2'): 6.5432},
        'Tumour_2': {Gene('BAD'): 2.3456, Gene('FUCA2'): 7.6543},
    }
    members = [
        Sample(label, expression)
        for label, expression in expression_by_label.items()
    ]

    collection = SampleCollection('Tumour', members)

    assert collection.name == 'Tumour'
    for member in collection.samples:
        assert isinstance(member, Sample)
def test_get_all():
    """Experiment.get_all() returns one collection with case and control labels."""
    tumour_samples = [
        Sample.from_names('Tumour_1', {'BAD': 1.2345, 'FUCA2': 6.5432}),
        Sample.from_names('Tumour_2', {'BAD': 2.3456, 'FUCA2': 7.6543}),
    ]
    normal_samples = [Sample.from_names('Normal_1', {'BAD': 3.4567})]

    tumour = SampleCollection('Tumour', tumour_samples)
    normal = SampleCollection('Normal', normal_samples)

    experiment = Experiment(case=tumour, control=normal)
    merged = experiment.get_all()

    assert isinstance(merged, SampleCollection)

    # Labels from both sides must be present in the merged collection.
    assert all(
        label in merged.labels
        for label in tumour.labels + normal.labels
    )
def test_init():
    """Experiment stores the collections passed as case and control."""
    tumour_samples = [
        Sample.from_names('Tumour_1', {'BAD': 1.2345, 'FUCA2': 6.5432}),
        Sample.from_names('Tumour_2', {'BAD': 2.3456, 'FUCA2': 7.6543}),
    ]
    normal_samples = [Sample.from_names('Normal_1', {'BAD': 3.4567})]

    tumour = SampleCollection('Tumour', tumour_samples)
    normal = SampleCollection('Normal', normal_samples)

    experiment = Experiment(case=tumour, control=normal)

    # Type checks first, then equality with the original collections.
    assert isinstance(experiment.case, SampleCollection)
    assert isinstance(experiment.control, SampleCollection)

    assert experiment.case == tumour
    assert experiment.control == normal
Example #8
0
def test_from_gct():
    """from_gct_file warns on a sample-count mismatch and on an old version.

    The first part loads the GCT fixture, expecting a UserWarning about the
    declared sample count, four loaded samples and a fixed label order.
    The second part rewrites the first line of the fixture to ``#1.1`` and
    expects a UserWarning about an unsupported GCT version.
    """
    with temp_text_file(gct_contents) as gct_file:

        # Raw string: the pattern is a regular expression, and `\(` is not
        # a valid escape sequence in a normal string literal.
        expected_warning = r'Samples count \(4\) does not match with the 3 declared in all_samples.gct file.'

        with warns(UserWarning, match=expected_warning):
            collection = SampleCollection.from_gct_file(
                'all_samples.gct', gct_file)

        assert len(collection.samples) == 4

        assert collection.labels == ['NORM-1', 'GBM-1', 'GBM-2', 'OV-1']

    # Replace the version definition (first line) with an older one.
    lines = gct_contents.split('\n')
    lines[0] = '#1.1'
    old_content = '\n'.join(lines)

    with temp_text_file(old_content) as old_gct_file:
        with warns(UserWarning, match='Unsupported version of GCT file'):
            SampleCollection.from_gct_file('Outdated file', old_gct_file)
Example #9
0
def shuffle_and_divide(merged_collection, midpoint):
    """Split the samples of a collection into two randomly ordered parts.

    The shuffle is applied to a copy of ``merged_collection.samples``
    rather than to that attribute itself.

    Args:
        merged_collection: object whose ``samples`` are to be divided.
        midpoint: index at which the shuffled samples are cut.

    Returns:
        A pair of collections, both named ``'Random collection'``: the
        first holds the shuffled samples before ``midpoint``, the second
        holds the rest.
    """
    randomized = copy(merged_collection.samples)
    shuffle(randomized)

    first_part = randomized[:midpoint]
    second_part = randomized[midpoint:]

    first = SampleCollection('Random collection', first_part)
    second = SampleCollection('Random collection', second_part)
    return first, second