Beispiel #1
0
    def test_end2end_known_test_data(self):
        """Check per-region accuracy of the ``/classify`` endpoint on labelled reports.

        The labelled cases are read from the CSV named by the ``LABEL_FILE``
        config entry, partitioned with ``wrangle.partion`` and the second
        partition of the positive and negative cases is used as the test set
        (presumably the 20% hold-out, given ``ratios=[0.8, 0.2]`` -- confirm
        against ``wrangle.partion``).  The matching report texts are posted to
        ``/classify`` twice: once as one request per report and once as a
        single batch request.  In both modes the fraction of correct
        predictions for each region must exceed the ``MIN_ACCURACY`` config
        value.

        The test is reported as skipped (rather than silently passing) when
        the ``RUN_TESTS`` config entry is falsy.
        """
        if not rs.app.config['RUN_TESTS']:
            self.skipTest('RUN_TESTS is disabled in the application config')

        label_data = pd.read_csv(rs.app.config['LABEL_FILE'])
        report_path = rs.app.config['TEXT_REPORT_DIR']
        min_acc = float(rs.app.config['MIN_ACCURACY'])
        # Position in this list is also the index into each prediction
        # sequence returned by the service.
        region_labels = ['inner', 'middle', 'outer', 'mastoid']

        # Fixed seed so the shuffled test order is reproducible.
        randstate = RandomState(987654321)

        # Partition the data and keep only the test split.
        pos_cases, neg_cases = wrangle.partion(label_data['doc_norm'] == 1, label_data, ratios=[0.8, 0.2])
        test_mask = np.concatenate((pos_cases[1], neg_cases[1]))
        randstate.shuffle(test_mask)
        test_labels = label_data.iloc[test_mask]
        # Fail with a clear message instead of a ZeroDivisionError below.
        self.assertGreater(len(test_labels), 0, 'No test cases were selected from the label file')

        pids = list(test_labels['pid'])
        test_reports = [self.load_report('{0}/{1}.txt'.format(report_path, pid)) for pid in pids]

        def classify(payload):
            # POST a {pid: report text} mapping and return the decoded JSON reply.
            rv = self.app.post('/classify', data=json.dumps(payload), content_type='application/json')
            return json.loads(rv.data)

        def region_accuracy(predictions):
            # `predictions` holds one prediction sequence per report, in the
            # same order as the rows of `test_labels`.
            hits = [0] * len(region_labels)
            for idx, predicted in enumerate(predictions):
                for jdx, label in enumerate(region_labels):
                    if test_labels[label].iloc[idx] == predicted[jdx]:
                        hits[jdx] += 1
            return [count / float(len(test_labels)) for count in hits]

        # Send reports individually as multiple requests.
        accuracy = region_accuracy(
            [classify({pid: report})[pid] for pid, report in zip(pids, test_reports)])
        for v in accuracy:
            self.assertGreater(v, min_acc, 'Failed accuracy on individual post test {0}'.format(accuracy))

        # Send all reports in one batch request.
        rdata = classify(dict(zip(pids, test_reports)))
        accuracy = region_accuracy([rdata[pid] for pid in pids])
        for v in accuracy:
            self.assertGreater(v, min_acc, 'Failed accuracy on batch post test {0}'.format(accuracy))
Beispiel #2
0
    report_path = './data/input/SDS_PV2_combined/reports_single_find_impr'


    #load test set data - same set used for ML tests
    seed = 987654321
    # set the numpy random seed so results are reproducible
    rs = RandomState(987654321)

    # set common path variables
    label_file = './data/input/SDS_PV2_combined/SDS_PV2_class_labels.txt'

    # read data
    label_data = pd.read_csv(label_file)

    # partition the data
    pos_cases, neg_cases = wrangle.partion(label_data['doc_norm']==1, label_data, ratios=[0.8,0.2])
    train_mask = np.concatenate((pos_cases[0], neg_cases[0]))
    test_mask = np.concatenate((pos_cases[1], neg_cases[1]))
    rs.shuffle(train_mask)
    rs.shuffle(test_mask)
    train_labels = label_data.iloc[train_mask]
    test_labels = label_data.iloc[test_mask]
    # read in the text reports
    train_reports = [load_report('{0}/{1}_fi.txt'.format(report_path, pid)) for pid in train_labels['pid']]
    test_reports = [load_report('{0}/{1}_fi.txt'.format(report_path, pid)) for pid in test_labels['pid']]

    #import keywords
    keywords = {}
    with open(keyword_file, 'r') as f:
        key = ""
        for line in f.readlines():
Beispiel #3
0
    def test_end2end_known_test_data(self):
        """Verify ``/classify`` accuracy per region against known labels.

        Reads the label CSV configured as ``LABEL_FILE``, splits it with
        ``wrangle.partion`` and takes the second partition of both the
        positive and the negative cases as the test set (assumed to be the
        20% share implied by ``ratios=[0.8, 0.2]`` -- TODO confirm in
        ``wrangle.partion``).  Each test report is loaded from
        ``TEXT_REPORT_DIR`` and sent to the ``/classify`` endpoint, first one
        report per request and then all reports in one request.  For every
        region the share of matching predictions must be greater than the
        configured ``MIN_ACCURACY``.

        When the ``RUN_TESTS`` config entry is falsy the test is marked as
        skipped instead of passing without checking anything.
        """
        if not rs.app.config['RUN_TESTS']:
            self.skipTest('RUN_TESTS is disabled in the application config')

        label_data = pd.read_csv(rs.app.config['LABEL_FILE'])
        report_path = rs.app.config['TEXT_REPORT_DIR']
        min_acc = float(rs.app.config['MIN_ACCURACY'])
        # The list position doubles as the index into a returned prediction.
        region_labels = ['inner', 'middle', 'outer', 'mastoid']

        # Seed the generator so the shuffle below is reproducible.
        randstate = RandomState(987654321)

        # Partition the data; only the test split is needed here.
        pos_cases, neg_cases = wrangle.partion(label_data['doc_norm'] == 1,
                                               label_data,
                                               ratios=[0.8, 0.2])
        test_mask = np.concatenate((pos_cases[1], neg_cases[1]))
        randstate.shuffle(test_mask)
        test_labels = label_data.iloc[test_mask]
        # An empty split would otherwise surface as a ZeroDivisionError.
        self.assertGreater(len(test_labels), 0,
                           'No test cases were selected from the label file')

        pids = list(test_labels['pid'])
        test_reports = [
            self.load_report('{0}/{1}.txt'.format(report_path, pid))
            for pid in pids
        ]

        def post_reports(payload):
            # Send a {pid: report text} mapping and decode the JSON answer.
            rv = self.app.post('/classify',
                               data=json.dumps(payload),
                               content_type='application/json')
            return json.loads(rv.data)

        def score(predictions):
            # One prediction sequence per report, ordered like `test_labels`.
            correct = [0] * len(region_labels)
            for idx, predicted in enumerate(predictions):
                for jdx, label in enumerate(region_labels):
                    if test_labels[label].iloc[idx] == predicted[jdx]:
                        correct[jdx] += 1
            return [n / float(len(test_labels)) for n in correct]

        # Send reports individually as multiple requests.
        accuracy = score([
            post_reports({pid: report})[pid]
            for pid, report in zip(pids, test_reports)
        ])
        for v in accuracy:
            self.assertGreater(
                v, min_acc,
                'Failed accuracy on individual post test {0}'.format(
                    accuracy))

        # Send all reports in one batch request.
        rdata = post_reports(dict(zip(pids, test_reports)))
        accuracy = score([rdata[pid] for pid in pids])
        for v in accuracy:
            self.assertGreater(
                v, min_acc,
                'Failed accuracy on batch post test {0}'.format(accuracy))