Example #1
0
 def test_imports_script_src(self, mock_db):
     """JS function calls work when the script is loaded via <script src=""/>."""
     self._extractor_script = ImportHandler(self._plan_for_script, PARAMS)
     record = self._extractor_script.next()
     self.assertTrue(mock_db.called)
     # Both the inline-script and script-tag variants must evaluate to 99.
     for field in ('test_script', 'test_script_tag'):
         self.assertEqual(record[field], 99)
Example #2
0
    def test_pig_datasource(self, sleep_mock, sqoop_mock):
        """Import via the pig datasource with AWS, sqoop and DB all mocked.

        Only the iterator wiring is exercised: boto3 calls replay canned
        placebo responses, the sqoop subprocess pretends to exit cleanly,
        and psycopg2 is patched out entirely.
        """
        # Amazon mock: replay recorded boto3 responses via placebo/pill.
        self.pill.attach(
            self.session,
            os.path.abspath(
                os.path.join(os.path.dirname(__file__),
                             'placebo_responses/importhandler/pigxml')))
        self.pill.playback()

        self._plan = ExtractionPlan(
            os.path.join(BASEDIR, 'extractorxml',
                         'pig-train-import-handler.xml'))

        # Sqoop import subprocess mock: clean exit code, no stdout.
        process_mock = Mock()
        attrs = {'wait.return_value': 0, 'stdout.readlines.return_value': []}
        process_mock.configure_mock(**attrs)
        sqoop_mock.return_value = process_mock

        with patch('psycopg2.extras.DictCursor.execute'):
            with patch('psycopg2.connect'):
                self._extractor = ImportHandler(self._plan, PARAMS)

        # Was an unused local (`pig_ds = ...['pig']`); make the implicit
        # "datasource exists" check explicit instead.
        self.assertIn('pig', self._extractor.plan.datasources)
        # Checking iterator (assertEquals is a deprecated alias).
        row = self._extractor.next()
        self.assertEqual(row['opening_id'], 57)
Example #3
0
 def test_store_data_csv(self, mock_db):
     """store_data_csv writes the extracted rows to a CSV file on disk."""
     self._extractor = ImportHandler(self._plan, PARAMS)
     self._extractor.store_data_csv("data.csv.bak")
     self.assertTrue(os.path.isfile("data.csv.bak"))
     try:
         with open("data.csv.bak") as fp:
             # Exactly two CSV rows are expected for the mocked data.
             rows = list(csv.reader(fp))
             self.assertEqual(len(rows), 2)
     finally:
         # Remove the artifact even when an assertion above fails,
         # so a failing run doesn't leave stale files behind.
         os.remove("data.csv.bak")
Example #4
0
 def composite_test(self, mock_db):
     # NOTE(review): the name doesn't match unittest's default "test*"
     # discovery pattern, so this method is never run automatically --
     # confirm whether that is intentional.
     self._extractor = ImportHandler(self._plan, {
         'start': '2012-12-03',
         'end': '2012-12-04',
     })
     row = self._extractor.next()
     # Composite field: both country values joined with a comma.
     self.assertEqual(row['country_pair'], 'Australia,Philippines')
     # Nested composite value: exam name -> score mapping.
     self.assertEqual(
         row['tsexams']['English Spelling Test (U.S. Version)'], 5)
Example #5
0
    def test_validate_input_params(self):
        """process_input_params rejects incomplete or absent parameters."""
        self._extractor = ImportHandler(self._plan, PARAMS)
        # A partial dict, an empty dict and None must all be rejected
        # with the same "Missing input parameters" error.
        for bad_params in ({'end': '2013-01-30'}, {}, None):
            with self.assertRaisesRegexp(ImportHandlerException,
                                         "Missing input parameters"):
                self._extractor.process_input_params(bad_params)
Example #6
0
    def test_store_data_json(self, mock_db):
        """store_data_json dumps rows to JSON, optionally gzip-compressed."""
        self._extractor = ImportHandler(self._plan, PARAMS)
        self._extractor.store_data_json("data.json.bak")
        self.assertTrue(os.path.isfile("data.json.bak"))
        try:
            with open("data.json.bak") as fp:
                data = json.loads(fp.read())
                # assertEquals is a deprecated alias of assertEqual.
                self.assertEqual(data['application_id'], 555)
        finally:
            # Clean up even when an assertion above fails.
            os.remove("data.json.bak")

        # Second positional argument enables compressed output.
        self._extractor.store_data_json("data.gz.bak", True)
        self.assertTrue(os.path.isfile("data.gz.bak"))
        os.remove("data.gz.bak")
Example #7
0
def main(argv=None):
    parser = create_parser()
    args = parser.parse_args(argv)
    init_logging(args.debug)

    try:
        transformer = Transformer(args.path)
    except (TransformerSchemaException, IOError) as e:
        logging.warn('Invalid feature model: %s' % e.message)
        print_exception(e)
        return INVALID_TRANSFORMER_CONFIG

    try:
        if args.input is not None:
            file_format = os.path.splitext(args.input)[1][1:]
            with open(args.input, 'r') as train_fp:
                transformer.train(
                    streamingiterload(train_fp, source_format=file_format))
        elif args.extraction is not None:
            train_context = list_to_dict(args.train_params)

            try:
                plan = ExtractionPlan(args.extraction)
                train_handler = ImportHandler(plan, train_context)
            except ImportHandlerException, e:
                logging.warn('Invalid extraction plan: %s' % e.message)
                print_exception(e)
                return INVALID_EXTRACTION_PLAN

            logging.info('Starting training with params:')
            for key, value in train_context.items():
                logging.info('%s --> %s' % (key, value))
            transformer.train(train_handler)
        else:
Example #8
0
class InputDatasourceTest(unittest.TestCase):
    """Tests for the input datasource: parameters passed as raw JSON text."""

    def setUp(self):
        from cloudml.importhandler.importhandler import ExtractionPlan
        self._plan = ExtractionPlan(
            os.path.join(BASEDIR, 'extractorxml',
                         'input-datasource-handler.xml'))

    def test_json(self):
        from cloudml.importhandler.importhandler import ImportHandler
        # 'contractor_info' is a JSON-encoded string; the resulting row is
        # expected to expose the skill names as one comma-separated value.
        # NOTE: the literal below is byte-exact test data (incl. escaped
        # slashes and continuation-line spacing) -- do not reformat.
        self._extractor = ImportHandler(
            self._plan, {
                'contractor_info':
                '{ "skills":[{"skl_status":"0","ts_tests_count"\
:"0","skl_name":"microsoft-excel","skl_external_link":"http:\/\/en.wikipedia.\
org\/wiki\/Microsoft_Excel","skl_has_tests":"1","skl_pretty_name":"Microsoft\
 Excel","skill_uid":"475721704063008779","skl_rank":"1","skl_description":\
 "Microsoft Excel is a proprietary commercial spreadsheet application written\
 and distributed by Microsoft for Microsoft Windows and Mac OS X. It features\
 calculation, graphing tools, pivot tables, and a macro programming language\
 called Visual Basic for Applications."},{"skl_status":"0","ts_tests_count":\
 "0","skl_name":"microsoft-word","skl_external_link":"http:\/\/en.wikipedia.\
 org\/wiki\/Microsoft_Word","skl_has_tests":"1","skl_pretty_name":"Microsoft\
  Word","skill_uid":"475721704071397377","skl_rank":"2","skl_description":\
  "Microsoft Office Word is a word processor designed by Microsoft."}]}',
            })
        row = self._extractor.next()
        self.assertEqual(row['contractor.skills'],
                         'microsoft-excel,microsoft-word')
Example #9
0
def main(argv=None):
    parser = create_parser()
    args = parser.parse_args(argv)
    init_logging(args.debug)

    try:
        if args.user_params is not None:
            param_list = [x.split('=', 1) for x in args.user_params]
            context = dict((key, value) for (key, value) in param_list)
        else:
            context = {}

        logging.info('User-defined parameters:')
        for key, value in context.items():
            logging.info('%s --> %s' % (key, value))

        try:
            plan = ExtractionPlan(args.path)
            extractor = ImportHandler(plan, context)

        except ImportHandlerException, e:
            logging.warn('Invalid extraction plan: {}'.format(e.message))
            print_exception(e)
            return INVALID_EXTRACTION_PLAN

        if args.output is not None:
            logging.info('Storing data to %s...' % args.output)
            getattr(extractor, 'store_data_{}'.format(args.format),
                    extractor.store_data_json)(args.output)

            logging.info('Total %s lines' % (extractor.count, ))
            logging.info('Ignored %s lines' % (extractor.ignored, ))
Example #10
0
class HttpXMLPlanTest(unittest.TestCase):
    """Import handler tests backed by a mocked HTTP datasource."""

    def setUp(self):
        plan_path = os.path.join(BASEDIR, 'extractorxml',
                                 'http-train-import-handler.xml')
        self._plan = ExtractionPlan(plan_path)

    def test_http_datasource(self):
        # Default query hits the mocked endpoint for application 123456.
        with HTTMock(http_mock):
            self._extractor = ImportHandler(self._plan, PARAMS)
            first = self._extractor.next()
            self.assertEqual(first['application_id'], 123456)

    def test_http_query(self):
        # Overriding the entity query fetches a different record.
        with HTTMock(http_mock):
            self._plan.entity.query = '/some/other/path.json'
            self._extractor = ImportHandler(self._plan, PARAMS)
            first = self._extractor.next()
            self.assertEqual(first['application_id'], 78910)
Example #11
0
    def test_imports(self, mock_db):
        """Import one row and verify every field-processing feature:
        type casts, jsonpath/join, regex/split, JS scripts, dataFormat,
        templates and nested datasources. DB access is mocked, so the
        assertions pin the transformation logic only.
        """
        self._extractor = ImportHandler(self._plan, PARAMS)
        row = self._extractor.next()
        self.assertTrue(mock_db.called)

        # Checking types
        self.assertEqual(row['check_float'], float(ROW["float_field"]))
        self.assertEqual(row['check_string'], ROW["float_field"])
        self.assertEqual(row['check_int'], int(ROW["int_field"]))
        self.assertEqual(row['check_boolean'], True)
        # Non-integral float coerced to integer yields None.
        self.assertEqual(row['check_integer_with_float'], None)
        self.assertEqual(row['check_json'], ROW["json_field"])
        self.assertEqual(row['check_json_jsonpath'], "Professional and \
experienced person")

        # Checking subentries as json datasources
        self.assertEqual(row['employer.country'], 'Philippines')

        # Checking jsonpath and join
        self.assertEqual(row['autors'], 'Nigel and Evelyn')

        # Checking regex and split
        self.assertEqual(row['say_hello'], 'hello')
        self.assertEqual(row['words'], ['Words', 'words', 'words'])

        # Checking javascript func
        self.assertEqual(row['test_script'], 99)
        self.assertEqual(row['test_script_tag'], 99)

        # Checking dataFormat
        self.assertEqual(row['date'], datetime(2014, 6, 1, 13, 33))

        # Checking template
        self.assertEqual(row['template'],
                         "Greatings: hello and hi and pruvit.")

        # Checking global nested datasources: the second mocked DB call
        # (call_args_list[1]) must be the nested application-title query.
        self.assertEqual(row['application_title'], 'Application Title')
        self.assertEqual(
            mock_db.call_args_list[1][0][0],
            "SELECT title FROM applications where id==%s;" %
            ROW['application'])
Example #12
0
class CsvXMLPlanTest(unittest.TestCase):
    """Import handler tests for the CSV-file datasource."""

    def setUp(self):
        plan_file = os.path.join(BASEDIR, 'extractorxml',
                                 'csv-train-import-handler.xml')
        self._plan = ExtractionPlan(plan_file)

    def test_csv_datasource(self):
        self._extractor = ImportHandler(self._plan, PARAMS)
        first = self._extractor.next()
        # First record: classification label and numeric amount.
        self.assertEqual(first['class'], 'hire')
        self.assertEqual(first['money'], 10)
Example #13
0
class CompositeTypeTest(unittest.TestCase):
    """Tests for composite field types built from multiple source fields."""

    def setUp(self):
        self._plan = ExtractionPlan(
            os.path.join(BASEDIR, 'extractorxml',
                         'composite-type-import-handler.xml'))

    @patch('cloudml.importhandler.datasources.DbDataSource._get_iter',
           return_value=db_row_iter_mock())
    def test_composite(self, mock_db):
        # Renamed from 'composite_test': unittest only auto-discovers
        # methods matching the "test*" prefix, so the old name never ran.
        self._extractor = ImportHandler(self._plan, {
            'start': '2012-12-03',
            'end': '2012-12-04',
        })
        row = self._extractor.next()
        # Composite field: both country values joined with a comma.
        self.assertEqual(row['country_pair'], 'Australia,Philippines')
        # Nested composite value: exam name -> score mapping.
        self.assertEqual(
            row['tsexams']['English Spelling Test (U.S. Version)'], 5)
Example #14
0
class PigXMLPlanTest(unittest.TestCase):
    """Pig datasource tests with AWS (placebo), sqoop and psycopg2 mocked."""

    # Dotted path of the datasource under test (used with patching).
    PIG_DS = 'cloudml.importhandler.datasources.PigDataSource'

    def setUp(self):
        super(PigXMLPlanTest, self).setUp()
        self.pill = StreamPill(debug=True)
        self.session = boto3.session.Session()
        boto3.DEFAULT_SESSION = self.session

    @patch('subprocess.Popen')
    @patch('time.sleep', return_value=None)
    def test_pig_datasource(self, sleep_mock, sqoop_mock):
        """Only the iterator wiring is exercised; all I/O is mocked."""
        # Amazon mock: replay recorded boto3 responses via placebo/pill.
        self.pill.attach(
            self.session,
            os.path.abspath(
                os.path.join(os.path.dirname(__file__),
                             'placebo_responses/importhandler/pigxml')))
        self.pill.playback()

        self._plan = ExtractionPlan(
            os.path.join(BASEDIR, 'extractorxml',
                         'pig-train-import-handler.xml'))

        # Sqoop import subprocess mock: clean exit code, no stdout.
        process_mock = Mock()
        attrs = {'wait.return_value': 0, 'stdout.readlines.return_value': []}
        process_mock.configure_mock(**attrs)
        sqoop_mock.return_value = process_mock

        with patch('psycopg2.extras.DictCursor.execute'):
            with patch('psycopg2.connect'):
                self._extractor = ImportHandler(self._plan, PARAMS)

        # Was an unused local (`pig_ds = ...['pig']`); make the implicit
        # "datasource exists" check explicit instead.
        self.assertIn('pig', self._extractor.plan.datasources)
        # Checking iterator (assertEquals is a deprecated alias).
        row = self._extractor.next()
        self.assertEqual(row['opening_id'], 57)
Example #15
0
def main(argv=None):
    parser = create_parser()
    args = parser.parse_args(argv)
    init_logging(args.debug)

    try:
        with open(args.path, 'r') as fp:
            trainer = load_trainer(fp)
    except (IOError, InvalidTrainerFile) as exc:
        logging.warn('Invalid trainer file: {0!s}'.format(exc))
        print_exception(exc)
        return INVALID_TRAINER

    try:
        iterator = None
        if args.input is not None:
            # Read evaluation data from file.
            eval_fp = open(args.input, 'r')
            file_format = determine_data_format(args.input)
            iterator = streamingiterload(eval_fp, source_format=file_format)
        elif args.extraction is not None:
            # Use import handler
            try:
                eval_context = list_to_dict(args.eval_params)
                plan = ExtractionPlan(args.extraction)
                eval_handler = ImportHandler(plan, eval_context)
            except ImportHandlerException, e:
                logging.warn('Invalid extraction plan: %s' % e.message)
                print_exception(e)
                return INVALID_EXTRACTION_PLAN

            logging.info('Starting training with params:')
            for key, value in eval_context.items():
                logging.info('%s --> %s' % (key, value))

            iterator = eval_handler
        else:
Example #16
0
 def test_csv_datasource(self):
     """First CSV record exposes the expected label and numeric amount."""
     self._extractor = ImportHandler(self._plan, PARAMS)
     first = self._extractor.next()
     self.assertEqual(first['class'], 'hire')
     self.assertEqual(first['money'], 10)
Example #17
0
class ImportHandlerTest(unittest.TestCase):
    """End-to-end ImportHandler tests against a mocked DB datasource."""

    def setUp(self):
        self._plan = ExtractionPlan(
            os.path.join(BASEDIR, 'extractorxml', 'train-import-handler.xml'))
        self._plan_for_script = ExtractionPlan(
            os.path.join(BASEDIR, 'extractorxml',
                         'train-import-handler-script-file.xml'))

    @patch('cloudml.importhandler.datasources.DbDataSource._get_iter',
           return_value=db_iter_mock())
    def test_imports(self, mock_db):
        """Import one row and verify every field-processing feature."""
        self._extractor = ImportHandler(self._plan, PARAMS)
        row = self._extractor.next()
        self.assertTrue(mock_db.called)

        # Checking types
        self.assertEqual(row['check_float'], float(ROW["float_field"]))
        self.assertEqual(row['check_string'], ROW["float_field"])
        self.assertEqual(row['check_int'], int(ROW["int_field"]))
        self.assertEqual(row['check_boolean'], True)
        self.assertEqual(row['check_integer_with_float'], None)
        self.assertEqual(row['check_json'], ROW["json_field"])
        self.assertEqual(row['check_json_jsonpath'], "Professional and \
experienced person")

        # Checking subentries as json datasources
        self.assertEqual(row['employer.country'], 'Philippines')

        # Checking jsonpath and join
        self.assertEqual(row['autors'], 'Nigel and Evelyn')

        # Checking regex and split
        self.assertEqual(row['say_hello'], 'hello')
        self.assertEqual(row['words'], ['Words', 'words', 'words'])

        # Checking javascript func
        self.assertEqual(row['test_script'], 99)
        self.assertEqual(row['test_script_tag'], 99)

        # Checking dataFormat
        self.assertEqual(row['date'], datetime(2014, 6, 1, 13, 33))

        # Checking template
        self.assertEqual(row['template'],
                         "Greatings: hello and hi and pruvit.")

        # Checking global nested datasources: second mocked DB call must
        # be the nested application-title query.
        self.assertEqual(row['application_title'], 'Application Title')
        self.assertEqual(
            mock_db.call_args_list[1][0][0],
            "SELECT title FROM applications where id==%s;" %
            ROW['application'])

    @patch('cloudml.importhandler.datasources.DbDataSource._get_iter',
           return_value=db_iter_mock())
    def test_imports_script_src(self, mock_db):
        # Checking js functions calls work from <script src=""/>
        self._extractor_script = ImportHandler(self._plan_for_script, PARAMS)
        row = self._extractor_script.next()
        self.assertTrue(mock_db.called)
        self.assertEqual(row['test_script'], 99)
        self.assertEqual(row['test_script_tag'], 99)

    @patch('cloudml.importhandler.datasources.DbDataSource._get_iter',
           return_value=db_iter_mock())
    def test_store_data_json(self, mock_db):
        """store_data_json dumps rows to JSON, optionally gzip-compressed."""
        self._extractor = ImportHandler(self._plan, PARAMS)
        self._extractor.store_data_json("data.json.bak")
        self.assertTrue(os.path.isfile("data.json.bak"))
        try:
            with open("data.json.bak") as fp:
                data = json.loads(fp.read())
                # assertEquals is a deprecated alias of assertEqual.
                self.assertEqual(data['application_id'], 555)
        finally:
            # Clean up even when an assertion above fails.
            os.remove("data.json.bak")

        # Second positional argument enables compressed output.
        self._extractor.store_data_json("data.gz.bak", True)
        self.assertTrue(os.path.isfile("data.gz.bak"))
        os.remove("data.gz.bak")

    @patch('cloudml.importhandler.datasources.DbDataSource._get_iter',
           return_value=db_iter_mock())
    def test_store_data_csv(self, mock_db):
        """store_data_csv writes the extracted rows to a CSV file."""
        self._extractor = ImportHandler(self._plan, PARAMS)
        self._extractor.store_data_csv("data.csv.bak")
        self.assertTrue(os.path.isfile("data.csv.bak"))
        try:
            with open("data.csv.bak") as fp:
                rows = list(csv.reader(fp))
                self.assertEqual(len(rows), 2)
        finally:
            # Clean up even when an assertion above fails.
            os.remove("data.csv.bak")

    @patch('cloudml.importhandler.datasources.DbDataSource._get_iter',
           return_value=db_iter_mock())
    def test_store_data_csv_compressed(self, mock_db):
        """store_data_csv with compress=True produces an output file."""
        self._extractor = ImportHandler(self._plan, PARAMS)
        self._extractor.store_data_csv("data.gz.bak", True)
        self.assertTrue(os.path.isfile("data.gz.bak"))
        os.remove("data.gz.bak")

    def test_validate_input_params(self):
        """process_input_params rejects incomplete or absent parameters."""
        self._extractor = ImportHandler(self._plan, PARAMS)
        # A partial dict, an empty dict and None must all be rejected.
        for bad_params in ({'end': '2013-01-30'}, {}, None):
            with self.assertRaisesRegexp(ImportHandlerException,
                                         "Missing input parameters"):
                self._extractor.process_input_params(bad_params)
Example #18
0
 def test_http_query(self):
     """Overriding the entity query fetches a different application."""
     with HTTMock(http_mock):
         self._plan.entity.query = '/some/other/path.json'
         self._extractor = ImportHandler(self._plan, PARAMS)
         first = self._extractor.next()
         self.assertEqual(first['application_id'], 78910)
Example #19
0
 def test_http_datasource(self):
     """Default HTTP query yields the mocked application record."""
     with HTTMock(http_mock):
         self._extractor = ImportHandler(self._plan, PARAMS)
         first = self._extractor.next()
         self.assertEqual(first['application_id'], 123456)
Example #20
0
 def test_store_data_csv_compressed(self, mock_db):
     """store_data_csv with the compress flag set produces an output file."""
     output = "data.gz.bak"
     self._extractor = ImportHandler(self._plan, PARAMS)
     self._extractor.store_data_csv(output, True)
     self.assertTrue(os.path.isfile(output))
     os.remove(output)
Example #21
0
                        trainer.test(
                            streamingiterload(test_fp,
                                              source_format=file_format),
                            test_percent)

            if args.test is not None and args.skip_tests is False:
                file_format = os.path.splitext(args.test)[1][1:]
                with open(args.test, 'r') as test_fp:
                    trainer.test(
                        streamingiterload(test_fp, source_format=file_format))

        elif args.extraction is not None:
            train_context = list_to_dict(args.train_params)
            try:
                plan = ExtractionPlan(args.extraction)
                train_handler = ImportHandler(plan, train_context)
            except ImportHandlerException, e:
                logging.warn('Invalid extraction plan: %s' % e.message)
                print_exception(e)
                return INVALID_EXTRACTION_PLAN

            logging.info('Starting training with params:')
            for key, value in train_context.items():
                logging.info('%s --> %s' % (key, value))

            trainer.train(train_handler, test_percent)

            if args.skip_tests is False:
                if test_percent != 0:
                    if args.test_params is None:
                        test_handler = ImportHandler(plan, train_context)