Example #1
0
class DataParser(object):
    """
    Load a tabular data file (Excel or CSV) into a pandas DataFrame and
    group its rows by subject ID.

    Experiment metadata (info, fields, ID corrections) is read through a
    DBI connection to 'opexconfig.db' located in the resource directory.
    """

    # Ordinal of 1899-12-30, i.e. the day Excel serial date 0 maps to.
    EXCEL_DATE_OFFSET = 693594

    def __init__(self,
                 datafile=None,
                 sheet=0,
                 skiplines=0,
                 header=None,
                 etype=None):
        """
        :param datafile: full pathname to the data file (.xlsx, .xls or .csv);
                         nothing is loaded when None or empty
        :param sheet: Excel sheet number/name passed to pandas.read_excel
        :param skiplines: number of rows to skip when reading an Excel file
        :param header: header row passed to pandas.read_excel when not None
        :param etype: experiment type used to look up info and fields
        :raises IOError: if the config database cannot be read
        """
        msg = 'DataParser: datafile=%s sheet=%s skiplines=%s header=%s etype=%s' % (
            datafile, str(sheet), str(skiplines), str(header), str(etype))
        print(msg)
        logging.info(msg)
        self.datafile = datafile  # full pathname to data file
        # Always define the load-related attributes so that later methods
        # (e.g. sortSubjects) can test them even when no file was loaded.
        self.data = None
        self.ftype = None
        self.sheet = sheet
        self.skiplines = skiplines
        self.header = header
        self.subjects = None
        self.resource_dir = findResourceDir()
        configdb = join(self.resource_dir, 'opexconfig.db')
        if not access(configdb, R_OK):
            print(configdb)
            raise IOError('Cannot find config database {}'.format(configdb))
        self.dbi = DBI(configdb)
        self.etype = etype
        if etype is not None:
            self.info = self.dbi.getInfo(etype)
            self.fields = self.dbi.getFields(etype)
        else:
            self.info = None
            self.fields = None
        self.incorrect = self.dbi.getIDs()
        if self.datafile:
            self.ftype = splitext(basename(self.datafile))[1]  # extension - xlsx or csv
            self._loadData()

    def getInfoFromFile(self, etype):
        """
        Read experiment info for etype from 'opex.csv' in the resource dir
        :param etype: value matched against the 'Expt' column
        :return: DataFrame of the matching rows
        :raises ValueError: if the file cannot be read or filtered
        """
        try:
            opex = pandas.read_csv(join(self.resource_dir, 'opex.csv'))
            return opex[opex['Expt'] == etype]
        except Exception as e:
            raise ValueError("Unable to get expt info from file", e)

    def getIdsFromFile(self):
        """
        Read 'incorrectIds.csv' from the resource dir
        :return: DataFrame with the file contents
        :raises ValueError: if the file cannot be read
        """
        try:
            return pandas.read_csv(join(self.resource_dir, 'incorrectIds.csv'))
        except Exception as e:
            raise ValueError("Unable to get ids from file", e)

    def __checkSID(self, sid):
        """
        Replace known incorrect IDs from db
        :param sid: subject ID as found in the data
        :return: ID returned by the db lookup, or sid unchanged when there
                 is no db connection
        """
        if self.dbi is None:
            return sid
        rsid = self.dbi.getCorrectID(sid)
        if rsid != sid:
            msg = 'Subject: %s corrected to %s' % (sid, rsid)
            logging.warning(msg)
        return rsid

    def _loadData(self):
        """
        Load self.datafile into self.data according to its extension.

        self.data is left as None when the extension is not supported or
        the requested Excel sheet is missing. Rows that are entirely NaN
        are dropped and the remaining NaNs are replaced by empty strings.
        """
        # Reset first so a failed load never leaves the attribute undefined.
        self.data = None
        ftype = (self.ftype or '').lower()
        if ftype in ('.xlsx', '.xls'):
            # NOTE(review): newer pandas releases may reject
            # `skip_blank_lines`/`encoding` for read_excel - confirm against
            # the installed version before removing them.
            options = dict(skiprows=self.skiplines,
                           sheet_name=self.sheet,
                           skip_blank_lines=True,
                           encoding='utf-8')
            if self.header is not None:
                options['header'] = self.header
            try:
                self.data = pandas.read_excel(self.datafile, **options)
            except IndexError as e:
                # str(e) rather than e.message, which Python 3 does not have
                msg = 'Excel sheet number/name expected was {} but not found: {}'.format(
                    self.sheet, e)
                logging.error(msg)
                print(msg)
        elif ftype == '.csv':
            self.data = pandas.read_csv(self.datafile, skip_blank_lines=True)

        if self.data is None:
            print('No data to load')
            return
        msg = 'Data loaded from %s' % self.datafile
        logging.info(msg)
        print(msg)
        # cleanup if rows are all NaN
        self.data.dropna(how="all", axis=0, inplace=True)
        # replace remaining NaNs with empty string; fillna returns a new
        # frame, so the result has to be assigned back
        self.data = self.data.fillna("")

    def sortSubjects(self, subjectfield='ID'):
        '''
            Sort data into subjects by participant ID
             - this should be overwritten if the data is organized differently

            Only IDs whose string form is 6 characters long are kept; each
            is passed through the incorrect-ID lookup before being used as
            the key in self.subjects.
            :param subjectfield: name of the column holding the subject ID
            :raises ValueError: if subjectfield is not a column of the data
        '''
        self.subjects = dict()
        if self.data is None:
            return
        if subjectfield not in self.data.columns:
            raise ValueError(
                'Subject ID field not present: {}'.format(subjectfield))
        column = self.data[subjectfield]
        for sid in column.unique():
            if len(str(sid)) != 6:
                continue
            sidkey = self.__checkSID(sid)
            self.subjects[sidkey] = self.data[column == sid]
        msg = 'Subjects loaded=%d' % len(self.subjects)
        print(msg)

    def _fromExcelSerial(self, orig):
        """Convert an Excel date number (int part only) to a datetime."""
        return datetime.fromordinal(self.EXCEL_DATE_OFFSET + int(orig))

    def formatDobNumber(self, orig):
        """
        Reformats DOB string from Excel data float to yyyy-mm-dd
        """
        return self._fromExcelSerial(orig).strftime("%Y-%m-%d")

    def formatCondensedDate(self, orig):
        """
        Reformats date number from Excel to yyyymmdd
        """
        return self._fromExcelSerial(orig).strftime("%Y%m%d")

    def getPrefix(self):
        """
        :return: the 'prefix' entry of the experiment info, or None when
                 no info was loaded
        """
        if self.info is None:
            return None
        return self.info['prefix']

    def getxsd(self):
        """
        :return: the 'xsitype' entry of the experiment info, or None when
                 no info was loaded
        """
        if self.info is None:
            return None
        return self.info['xsitype']
class TestDBquery(unittest.TestCase):
    """
    Tests for the DBI query methods, run against 'opexconfig_test.db'
    in the resource directory.
    """

    def setUp(self):
        """Open a connection to the test config database."""
        self.resourcedir = findResourceDir()
        configdb = join(self.resourcedir, 'opexconfig_test.db')
        self.dbi = DBI(configdb)
        self.dbi.getconn()

    def tearDown(self):
        """Close the connection opened in setUp."""
        self.dbi.conn.close()

    def test_getIDs(self):
        data = self.dbi.getIDs()
        self.assertGreater(len(data), 0)

    def test_updateIDs(self):
        """addIDs should report one row per (INCORRECT, CORRECT) pair."""
        # Use the resource dir found in setUp instead of a path relative to
        # the current working directory, so the test runs from anywhere.
        df = pandas.read_csv(join(self.resourcedir, 'incorrectIds.csv'))
        idlist = [(row['INCORRECT'], row['CORRECT'])
                  for _, row in df.iterrows()]
        cnt = self.dbi.addIDs(idlist)
        self.assertEqual(len(idlist), cnt)

    def test_getRunOptions(self):
        data = self.dbi.getRunOptions()
        self.assertGreater(len(data), 0)

    def test_getFields(self):
        etype = 'CANTAB MOT'
        expected = [u'MOTML', u'MOTSDL', u'MOTTC', u'MOTTE']
        data = self.dbi.getFields(etype)
        print(etype, ": ", data)
        self.assertGreater(len(data), 0)
        self.assertListEqual(expected, data)

    def test_getInfo(self):
        etype = 'MULTIPLEX'
        expected = {'prefix': u'MPX', 'xsitype': u'opex:bloodMultiplexData'}
        data = self.dbi.getInfo(etype)
        print(etype, ": ", data)
        self.assertGreater(len(data), 0)
        self.assertDictEqual(expected, data)

    def test_getInfo_missing(self):
        """An unknown experiment type is expected to give None."""
        etype = 'CANTAB'
        data = self.dbi.getInfo(etype)
        print(etype, ": ", data)
        self.assertIsNone(data)

    def test_getCorrectID(self):
        incorrectid = '1040DR'
        correctid = '1040DA'
        cid = self.dbi.getCorrectID(incorrectid)
        self.assertEqual(correctid, cid)

    def test_getCorrectID_missing(self):
        """An ID with no correction on record is expected back unchanged."""
        incorrectid = '1020HC'
        cid = self.dbi.getCorrectID(incorrectid)
        self.assertEqual(incorrectid, cid)

    def test_getDatelessExpts(self):
        data = self.dbi.getDatelessExpts()
        self.assertGreater(len(data), 0)

    def test_getExpts(self):
        data = self.dbi.getExpts()
        self.assertGreater(len(data), 0)

    def test_getXsitypeFromPrefix(self):
        prefix = 'MPX'
        expected = 'opex:bloodMultiplexData'
        data = self.dbi.getXsitypeFromPrefix(prefix)
        self.assertEqual(expected, data)

    def test_getTotal(self):
        expt = 'GODIN'
        expected = 5
        data = self.dbi.getTotal(expt)
        self.assertEqual(expected, data)

    def test_getInterval(self):
        expt = 'GODIN'
        expected = 3
        data = self.dbi.getInterval(expt)
        self.assertEqual(expected, data)

    def test_getInfo_TASK(self):
        """ Checking taskret and taskencode """
        cases = (('TASKRET', 'opex:fmritaskret'),
                 ('TASKENCODE', 'opex:fmritaskencode'))
        for expt, expected in cases:
            data = self.dbi.getInfo(expt)
            self.assertEqual(expected, data['xsitype'])
            fields = self.dbi.getFields(expt)
            self.assertGreater(len(fields), 0)