예제 #1
0
def main(argv=None):
    parser = create_parser()
    args = parser.parse_args(argv)
    init_logging(args.debug)

    try:
        transformer = Transformer(args.path)
    except (TransformerSchemaException, IOError) as e:
        logging.warn('Invalid feature model: %s' % e.message)
        print_exception(e)
        return INVALID_TRANSFORMER_CONFIG

    try:
        if args.input is not None:
            file_format = os.path.splitext(args.input)[1][1:]
            with open(args.input, 'r') as train_fp:
                transformer.train(
                    streamingiterload(train_fp, source_format=file_format))
        elif args.extraction is not None:
            train_context = list_to_dict(args.train_params)

            try:
                plan = ExtractionPlan(args.extraction)
                train_handler = ImportHandler(plan, train_context)
            except ImportHandlerException, e:
                logging.warn('Invalid extraction plan: %s' % e.message)
                print_exception(e)
                return INVALID_EXTRACTION_PLAN

            logging.info('Starting training with params:')
            for key, value in train_context.items():
                logging.info('%s --> %s' % (key, value))
            transformer.train(train_handler)
        else:
예제 #2
0
    def testLoadMultipleCSVSingleFile(self):
        """Check that ``stream.data.csv`` is loaded as four CSV records.

        The file is read from ``BASEDIR`` and parsed with
        ``streamingiterload(..., source_format='csv')``; the test then checks
        the ``id`` and ``class`` fields of each record, in file order.
        """
        # The context manager closes the file even if parsing raises.
        with open(os.path.join(BASEDIR, 'stream.data.csv')) as f:
            csv_objects = list(
                streamingiterload(f.readlines(), source_format='csv'))

        # Report the number of records actually loaded; the previous message
        # printed a counter that was never incremented.
        self.assertEqual(
            4, len(csv_objects),
            'Should have loaded 4 items from file (loaded %s)' %
            (len(csv_objects), ))

        # (ordinal used in the failure message, expected id, expected class)
        expected = (
            ('first', '1', 'hire'),
            ('second', '2', 'hire'),
            ('third', '3', 'nohire'),
            ('fourth', '4', 'hire'),
        )
        for (ordinal, obj_id, obj_class), obj in zip(expected, csv_objects):
            self.assertEqual(obj_id, obj['id'],
                             'Invalid id for %s CSV object' % ordinal)
            self.assertEqual(obj_class, obj['class'],
                             'Invalid class for %s CSV object' % ordinal)
예제 #3
0
    def testLoadMultipleJSONSingleFile(self):
        """Check that ``stream.data.json`` is loaded as four JSON records.

        The file is read from ``BASEDIR`` and parsed with
        ``streamingiterload`` using its default source format; the test then
        checks the ``id`` and ``class`` fields of each record, in file order.
        """
        # The context manager closes the file even if parsing raises.
        with open(os.path.join(BASEDIR, 'stream.data.json')) as f:
            json_objects = list(streamingiterload(f.readlines()))

        # Report the number of records actually loaded; the previous message
        # printed a counter that was never incremented.
        self.assertEqual(
            4, len(json_objects),
            'Should have loaded 4 items from file (loaded %s)' %
            (len(json_objects), ))

        # (ordinal used in the failure message, expected id, expected class)
        expected = (
            ('first', 1, 'hire'),
            ('second', 2, 'hire'),
            ('third', 3, 'nohire'),
            ('fourth', 4, 'hire'),
        )
        for (ordinal, obj_id, obj_class), obj in zip(expected, json_objects):
            self.assertEqual(obj_id, obj['id'],
                             'Invalid id for %s JSON object' % ordinal)
            self.assertEqual(obj_class, obj['class'],
                             'Invalid class for %s JSON object' % ordinal)
예제 #4
0
def main(argv=None):
    parser = create_parser()
    args = parser.parse_args(argv)
    init_logging(args.debug)

    try:
        with open(args.path, 'r') as fp:
            trainer = load_trainer(fp)
    except (IOError, InvalidTrainerFile) as exc:
        logging.warn('Invalid trainer file: {0!s}'.format(exc))
        print_exception(exc)
        return INVALID_TRAINER

    try:
        iterator = None
        if args.input is not None:
            # Read evaluation data from file.
            eval_fp = open(args.input, 'r')
            file_format = determine_data_format(args.input)
            iterator = streamingiterload(eval_fp, source_format=file_format)
        elif args.extraction is not None:
            # Use import handler
            try:
                eval_context = list_to_dict(args.eval_params)
                plan = ExtractionPlan(args.extraction)
                eval_handler = ImportHandler(plan, eval_context)
            except ImportHandlerException, e:
                logging.warn('Invalid extraction plan: %s' % e.message)
                print_exception(e)
                return INVALID_EXTRACTION_PLAN

            logging.info('Starting training with params:')
            for key, value in eval_context.items():
                logging.info('%s --> %s' % (key, value))

            iterator = eval_handler
        else:
예제 #5
0
 def _get_iterator(self, fmt='json'):
     """Load the ``train.data.<fmt>`` fixture and cache it on ``self._data``.

     The file is looked up under ``BASEDIR/transformers`` and parsed with
     ``streamingiterload`` using ``fmt`` as the source format.  The parsed
     records are stored as a list in ``self._data``, which is also returned.
     """
     fixture_name = 'train.data.{}'.format(fmt)
     fixture_path = os.path.join(BASEDIR, 'transformers', fixture_name)
     with open(fixture_path) as stream:
         lines = stream.readlines()
         # Materialise the records while the file is still open.
         self._data = list(streamingiterload(lines, source_format=fmt))
     return self._data
예제 #6
0
def get_iterator(dirname, filename, fmt='json'):
    """Return the records of ``BASEDIR/<dirname>/<filename>.<fmt>`` as a list.

    The file is read in full and parsed with ``streamingiterload`` using
    ``fmt`` as the source format.
    """
    from cloudml.trainer.streamutils import streamingiterload

    basename = '{0}.{1}'.format(filename, fmt)
    with open(os.path.join(BASEDIR, dirname, basename)) as source:
        return list(streamingiterload(source.readlines(), source_format=fmt))
예제 #7
0
 def get_iterator(self, stream):
     """Pass ``stream`` to ``streamingiterload`` and return the result.

     ``self.format`` is forwarded as the ``source_format`` argument.
     """
     from cloudml.trainer.streamutils import streamingiterload

     source_format = self.format
     return streamingiterload(stream, source_format=source_format)
예제 #8
0
    try:
        trainer = Trainer(model)

        if args.transformer_path is not None:
            # defines pretrained transformers path
            trainer.set_transformer_getter(
                transformer_getter(args.transformer_path))

        test_percent = parse_percent(args.test_percent)
        if args.input is not None:
            # Read training data from file
            file_format = determine_data_format(args.input)
            with open(args.input, 'r') as train_fp:
                logging.info("Training the model using input file dataset.")
                trainer.train(streamingiterload(train_fp,
                                                source_format=file_format),
                              test_percent,
                              store_vect_data=args.store_train_vect
                              is not None)

                if args.store_train_vect is not None:
                    logging.info('Storing train vectorized data to %s' %
                                 args.store_train_vect)
                    trainer.vect_data2csv(args.store_train_vect)

                if test_percent != 0 and args.skip_tests is False \
                   and args.test is None:
                    with open(args.input, 'r') as test_fp:
                        trainer.test(
                            streamingiterload(test_fp,
                                              source_format=file_format),