Beispiel #1
0
    def setUp(self):
        """Build an eight-sample schema with 'treated' and 'sex' factors.

        The columns are 'gene_id' (role 'feature_id') followed by
        'sample1' .. 'sample8' (role 'sample').  Samples 1-4 are untreated
        and samples 5-8 are treated; within each group the first two are
        male and the last two are female.  The result is stored on
        ``self.schema``.
        """
        sample_names = ["sample" + str(num) for num in range(1, 9)]

        schema = Schema(
            column_names=["gene_id"] + sample_names,
            column_roles=['feature_id'] + ['sample'] * len(sample_names))

        schema.add_factor('treated', [False, True])
        schema.add_factor('sex', ['male', 'female'])

        # Each entry is (sample name, sex, treated).
        assignments = [
            ('sample1', 'male', False),
            ('sample2', 'male', False),
            ('sample3', 'female', False),
            ('sample4', 'female', False),
            ('sample5', 'male', True),
            ('sample6', 'male', True),
            ('sample7', 'female', True),
            ('sample8', 'female', True),
        ]

        for sample_name, sex_value, treated_value in assignments:
            schema.set_factor(sample_name, 'sex', sex_value)
            schema.set_factor(sample_name, 'treated', treated_value)

        self.schema = schema
Beispiel #2
0
    def setUp(self):
        """Build a twelve-sample schema crossing sex, age and treatment.

        The columns are 'gene_id' (role 'feature_id') followed by
        'sample1' .. 'sample12' (role 'sample').  Factor values are assigned
        in nested-loop order: sex varies slowest, then age, then treated
        (True before False).  So 'sample1' is (male, 2, True) and
        'sample12' is (female, 55, False).  The result is stored on
        ``self.schema``.
        """
        sample_nums = range(1, 13)

        colnames = ["gene_id"] + ["sample" + str(x) for x in sample_nums]
        roles = ['feature_id'] + ['sample'] * len(sample_nums)

        schema = Schema(column_names=colnames, column_roles=roles)

        schema.add_factor('age', [2, 20, 55])
        schema.add_factor('sex', ['male', 'female'])
        schema.add_factor('treated', [False, True])

        # Sanity-check the zero-based sample numbering before relying on it.
        # assertEqual is used because the assertEquals alias is deprecated
        # and no longer exists in recent Python versions.
        self.assertEqual(schema.sample_num("sample1"), 0)
        self.assertEqual(schema.sample_num("sample7"), 6)

        counter = 0
        for sex in ['male', 'female']:
            for age in [2, 20, 55]:
                for treated in [True, False]:
                    counter += 1
                    name = "sample" + str(counter)
                    schema.set_factor(name, 'sex', sex)
                    schema.set_factor(name, 'age', age)
                    schema.set_factor(name, 'treated', treated)
        self.schema = schema
Beispiel #3
0
    def schema(self):
        """Build a Schema from the current workflow and this object's data.

        Column names and roles are read from ``current_workflow()``.  Factor
        names and their allowed values are read from ``self.factor_values``.
        The value for each (sample column, factor) pair is read from
        ``self.column_label_form.assignments``, a flat sequence laid out as
        all factors for the first sample column, then all factors for the
        second, and so on.  All names and values are converted to ``str``
        before being handed to the Schema.

        Returns:
            The populated Schema.

        Raises:
            Exception: if the workflow has no column names or roles, or if
                ``assignments`` has fewer entries than there are
                (sample column, factor) pairs.
        """
        wf = current_workflow()
        field_names = wf.field_names
        column_roles = wf.column_roles

        missing_msg = "I can't create a schema without columns or roles"

        # Check for None *before* wrapping in numpy arrays: np.array(None) is
        # a 0-d array rather than None, so a later "is None" test can never
        # fire and len() on it fails with an unrelated TypeError.
        if field_names is None or column_roles is None:
            raise Exception(missing_msg)

        columns = np.array(field_names)
        roles = np.array(column_roles)

        if len(columns) == 0 or len(roles) == 0:
            raise Exception(missing_msg)

        # Materialise real lists so the Schema gets reusable sequences on
        # both Python 2 and Python 3 (where map() is a one-shot iterator).
        schema = Schema([str(c) for c in columns],
                        [str(r) for r in column_roles])

        for factor, values in self.factor_values.items():
            schema.add_factor(str(factor), [str(v) for v in values])

        factors = list(self.factor_values)

        # Running index into the flat assignments sequence.
        counter = 0

        for column in columns[roles == 'sample']:
            for factor in factors:
                try:
                    value = self.column_label_form.assignments[counter].data
                except IndexError:
                    raise Exception("No assignment " + str(counter))
                schema.set_factor(str(column), str(factor), str(value))
                counter += 1

        return schema
Beispiel #4
0
    def test_ignore_columns(self):
        """Columns whose role is None are ignored by the schema.

        Builds eight data columns 'sample_0' .. 'sample_7' where only the
        even-numbered ones have the 'sample' role, then checks that:

        * only the four 'sample' columns are reported as sample columns;
        * setting a factor on an ignored or unknown column raises;
        * assignment lookups index and name only the 'sample' columns;
        * saving, loading and re-saving the schema gives identical output.
        """
        names = ["gene_id"]
        roles = ['feature_id']
        for i in range(8):
            names.append('sample_' + str(i))
            # Even-numbered columns are samples; odd-numbered are ignored.
            roles.append('sample' if i % 2 == 0 else None)

        schema = Schema(column_names=names, column_roles=roles)
        self.assertEqual(len(schema.sample_column_names), 4)

        schema.add_factor('treated', [False, True])

        schema.set_factor('sample_0', 'treated', False)
        schema.set_factor('sample_2', 'treated', False)
        schema.set_factor('sample_4', 'treated', True)
        schema.set_factor('sample_6', 'treated', True)

        # An ignored column (role None) must not accept a factor value.
        # The earlier version built the name from the stale loop variable
        # ('sample_1' + str(7) == 'sample_17'), so it never exercised an
        # ignored column at all.
        with self.assertRaises(Exception):
            schema.set_factor('sample_1', 'treated', True)

        # A column that does not exist must be rejected as well (this is
        # what the earlier version actually checked).
        with self.assertRaises(Exception):
            schema.set_factor('sample_17', 'treated', True)

        self.assertEqual(schema.possible_assignments(['treated']), [
            OrderedDict([('treated', False)]),
            OrderedDict([('treated', True)])
        ])

        # Indexes count sample columns only, so ignored columns leave no gap.
        self.assertEqual(
            schema.indexes_with_assignments(OrderedDict([('treated', False)])),
            [0, 1])

        self.assertEqual(
            schema.indexes_with_assignments(OrderedDict([('treated', True)])),
            [2, 3])

        self.assertEqual(
            schema.samples_with_assignments(OrderedDict([('treated', False)])),
            ['sample_0', 'sample_2'])

        self.assertEqual(
            schema.samples_with_assignments(OrderedDict([('treated', True)])),
            ['sample_4', 'sample_6'])

        # Round trip: save -> load -> save must reproduce the same text.
        out = StringIO()

        schema.save(out)
        loaded = Schema.load(out.getvalue())

        out2 = StringIO()
        loaded.save(out2)

        self.maxDiff = None
        self.assertEqual(out.getvalue(), out2.getvalue())
Beispiel #5
0
def init_schema(infile=None):
    """Create a new schema from the header row of a tab-delimited file.

    The first column is given the role 'feature_id' and every other
    column the role 'sample'.

    The schema is only constructed and returned; it is not saved, and
    nothing is written to the file system.

    """
    reader = csv.DictReader(infile, delimiter="\t")
    header = reader.fieldnames

    # Start with every column as a sample, then mark the first as the id.
    column_roles = ['sample'] * len(list(header))
    column_roles[0] = 'feature_id'

    return Schema(column_names=header, column_roles=column_roles)
Beispiel #6
0
    def test_schemas(self):
        """Schemas can be stored, listed, reloaded and found by input file.

        Two schemas based on the same input file are added to ``self.mdb``.
        The test checks the returned metadata, that both schemas are
        listed, that loading them yields the original column names, and
        that both are found when looking up schemas by input file id.
        """
        rawfile = self.mdb.add_input_file(name="test.txt",
                                          description="Some comments",
                                          stream=StringIO("a\nb\nc\n"))

        schema_a = Schema()
        schema_a.add_factor('treated', [False, True])
        schema_a.set_columns(['id', 'a', 'b'],
                             ['feature_id', 'sample', 'sample'])
        schema_a.set_factor('a', 'treated', False)
        schema_a.set_factor('b', 'treated', True)

        schema_b = Schema()
        schema_b.add_factor('age', ['young', 'old'])
        schema_b.set_columns(['key', 'foo', 'bar'],
                             ['feature_id', 'sample', 'sample'])
        schema_b.set_factor('foo', 'age', 'young')
        schema_b.set_factor('bar', 'age', 'old')

        a = self.mdb.add_schema("First one", "The first one", schema_a,
                                rawfile)
        b = self.mdb.add_schema("Second", "Other", schema_b, rawfile)

        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(a.name, "First one")
        self.assertEqual(a.description, "The first one")

        schemas = self.mdb.all_schemas()
        self.assertEqual(len(schemas), 2)

        self.assertEqual(a.based_on_input_file_id, rawfile.obj_id)

        # Reload every stored schema and collect the column names seen.
        colnames = set()
        for s in schemas:
            schema = s.load()
            colnames.update(schema.column_names)
        self.assertEqual(colnames, {'id', 'a', 'b', 'key', 'foo', 'bar'})

        schema_ids = self.mdb.schemas_based_on_input_file(
            a.based_on_input_file_id)
        # assertIn reports the missing id and the container on failure.
        self.assertIn(a.obj_id, schema_ids)
        self.assertIn(b.obj_id, schema_ids)
Beispiel #7
0
    def test_jobs(self):
        """Jobs can be stored, fetched by id, listed, and looked up.

        Two jobs sharing one raw file and one schema are added to
        ``self.mdb``.  The test checks the returned metadata, the fields
        of a job fetched back by id, the full job listing, and the
        lookups by schema id and by raw file id.
        """
        # Set up the raw file
        raw_file_meta = self.mdb.add_input_file(name="test.txt",
                                                description="Some comments",
                                                stream=StringIO("a\nb\nc\n"))

        schema = Schema()
        schema.add_factor('treated', [False, True])
        schema.set_columns(['id', 'a', 'b'],
                           ['feature_id', 'sample', 'sample'])
        schema.set_factor('a', 'treated', False)
        schema.set_factor('b', 'treated', True)

        schema_meta = self.mdb.add_schema("First one", "The first one", schema,
                                          raw_file_meta)

        a = self.mdb.add_job(name="job1",
                             description="Some job",
                             raw_file_meta=raw_file_meta,
                             schema_meta=schema_meta)
        b = self.mdb.add_job(name="job2",
                             description="Other job",
                             raw_file_meta=raw_file_meta,
                             schema_meta=schema_meta)
        # Make sure it returned the object appropriately
        # (assertEqual replaces the deprecated assertEquals alias).
        self.assertEqual(a.name, "job1")
        self.assertEqual(a.description, "Some job")

        # Fetch the job back by id and check its stored fields.
        a = self.mdb.job(a.obj_id)
        self.assertEqual(a.raw_file_id, raw_file_meta.obj_id)
        self.assertEqual(a.schema_id, schema_meta.obj_id)
        self.assertFalse(a.imported)

        # Make sure we can list all jobs
        jobs = self.mdb.all_jobs()
        self.assertEqual(len(jobs), 2)
        names = {'job1', 'job2'}
        self.assertEqual(names, {x.name for x in jobs})

        # assertIn reports the missing id and the container on failure.
        job_ids = self.mdb.jobs_for_schema(schema_meta.obj_id)
        self.assertIn(a.obj_id, job_ids)
        self.assertIn(b.obj_id, job_ids)

        job_ids = self.mdb.jobs_for_raw_file(raw_file_meta.obj_id)
        self.assertIn(a.obj_id, job_ids)
        self.assertIn(b.obj_id, job_ids)
Beispiel #8
0
    def setup_paired_schema(self):
        """Create ``self.paired_schema`` for a paired design.

        There are three persons ('a', 'b', 'c'), each with a treated ('y')
        and an untreated ('n') sample.  Each sample column is named by the
        person code followed by the treatment code, e.g. 'ay'.
        """
        person_codes = 'abc'
        treated_codes = 'yn'

        # 'ay', 'an', 'by', 'bn', 'cy', 'cn'
        sample_cols = [person + flag
                       for person in person_codes
                       for flag in treated_codes]

        paired = Schema(['id'] + sample_cols,
                        ['feature_id'] + ['sample'] * len(sample_cols))
        self.paired_schema = paired

        paired.add_factor('person', list(person_codes))
        paired.add_factor('treated', list(treated_codes))

        for col in sample_cols:
            # Two-character name: person code, then treatment code.
            person, flag = col
            paired.set_factor(col, 'person', person)
            paired.set_factor(col, 'treated', flag)
Beispiel #9
0
    def setup_three_cond_schema(self):
        """Create ``self.three_cond_schema``: gender x dosage x replicate.

        There is one sample column for every combination of gender
        ('m', 'f'), dosage ('l', 'm', 'h') and replicate number
        ('0' .. '3'), named by concatenating the three codes, e.g. 'ml0'.
        Only 'gender' and 'dosage' are registered as factors; the replicate
        number just distinguishes the columns.
        """
        genders = 'mf'
        dosages = 'lmh'
        replicate_ids = [str(n) for n in range(4)]

        combos = list(product(genders, dosages, replicate_ids))
        sample_cols = ["".join(combo) for combo in combos]

        schema = Schema(['id'] + sample_cols,
                        ['feature_id'] + ['sample'] * len(combos))
        self.three_cond_schema = schema

        schema.add_factor('gender', list(genders))
        schema.add_factor('dosage', list(dosages))

        for col, (gender, dosage, _rep) in zip(sample_cols, combos):
            schema.set_factor(col, 'gender', gender)
            schema.set_factor(col, 'dosage', dosage)
Beispiel #10
0
    def test_model_dummy_vars_1(self):
        """dummy_vars gives the expected table for age x treated at level 2.

        Builds a twelve-sample schema crossing sex, age and treatment.
        Samples are numbered in nested-loop order: sex slowest, then age,
        then treated with True before False.  The test then compares the
        result of ``dummy_vars(schema, ['age', 'treated'], level=2)`` with
        a hand-written DummyVarTable.
        """
        sample_nums = range(1, 13)

        colnames = ["gene_id"] + ["sample" + str(x) for x in sample_nums]
        roles = ['feature_id'] + ['sample'] * len(sample_nums)

        schema = Schema(column_names=colnames, column_roles=roles)

        schema.add_factor('age', [2, 20, 55])
        schema.add_factor('sex', ['male', 'female'])
        schema.add_factor('treated', [False, True])

        # Sanity-check the zero-based sample numbering.  assertEqual
        # replaces the deprecated assertEquals alias.
        self.assertEqual(schema.sample_num("sample1"), 0)
        self.assertEqual(schema.sample_num("sample7"), 6)

        counter = 0
        for sex in ['male', 'female']:
            for age in [2, 20, 55]:
                for treated in [True, False]:
                    counter += 1
                    name = "sample" + str(counter)
                    schema.set_factor(name, 'sex', sex)
                    schema.set_factor(name, 'age', age)
                    schema.set_factor(name, 'treated', treated)

        dummies = dummy_vars(schema, ['age', 'treated'], level=2)

        # First argument: one dict per dummy variable, in bit order.
        # Second argument: one assignment per (age, treated) combination.
        expected = DummyVarTable(({}, {
            'age': 20
        }, {
            'age': 55
        }, {
            'treated': True
        }, {
            'age': 20,
            'treated': True
        }, {
            'age': 55,
            'treated': True
        }), [
            DummyVarAssignment(factor_values=(2, False),
                               bits=(True, False, False, False, False, False),
                               indexes=['sample2', 'sample8']),
            DummyVarAssignment(factor_values=(2, True),
                               bits=(True, False, False, True, False, False),
                               indexes=['sample1', 'sample7']),
            DummyVarAssignment(factor_values=(20, False),
                               bits=(True, True, False, False, False, False),
                               indexes=['sample4', 'sample10']),
            DummyVarAssignment(factor_values=(20, True),
                               bits=(True, True, False, True, True, False),
                               indexes=['sample3', 'sample9']),
            DummyVarAssignment(factor_values=(55, False),
                               bits=(True, False, True, False, False, False),
                               indexes=['sample6', 'sample12']),
            DummyVarAssignment(factor_values=(55, True),
                               bits=(True, False, True, True, False, True),
                               indexes=['sample5', 'sample11'])
        ])

        self.assertEqual(dummies, expected)