def main(argv=None):
    parser = create_parser()
    args = parser.parse_args(argv)
    init_logging(args.debug)
    try:
        transformer = Transformer(args.path)
    except (TransformerSchemaException, IOError) as e:
        logging.warn('Invalid feature model: %s' % e.message)
        print_exception(e)
        return INVALID_TRANSFORMER_CONFIG

    try:
        if args.input is not None:
            # Derive the stream format from the input file's extension.
            file_format = os.path.splitext(args.input)[1][1:]
            with open(args.input, 'r') as train_fp:
                transformer.train(
                    streamingiterload(train_fp, source_format=file_format))
        elif args.extraction is not None:
            train_context = list_to_dict(args.train_params)
            try:
                plan = ExtractionPlan(args.extraction)
                train_handler = ImportHandler(plan, train_context)
            except ImportHandlerException as e:
                logging.warn('Invalid extraction plan: %s' % e.message)
                print_exception(e)
                return INVALID_EXTRACTION_PLAN
            logging.info('Starting training with params:')
            for key, value in train_context.items():
                logging.info('%s --> %s' % (key, value))
            transformer.train(train_handler)
        else:
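# The train_context above is built by list_to_dict(args.train_params). A
# hedged sketch of what that helper presumably does; the real one ships with
# cloudml, and this stand-in exists only to document the expected mapping:
def _list_to_dict_sketch(params):
    # ['start=2012-12-01', 'end=2012-12-04'] -> {'start': ..., 'end': ...}
    context = {}
    for item in params or []:
        key, _, value = item.partition('=')  # split on the first '=' only
        context[key] = value
    return context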
def testLoadMultipleCSVSingleFile(self):
    f = open(os.path.join(BASEDIR, 'stream.data.csv'))
    csv_objects = []
    for o in streamingiterload(f.readlines(), source_format='csv'):
        csv_objects.append(o)
    f.close()
    self.assertEquals(
        4, len(csv_objects),
        'Should have loaded 4 items from file (loaded %s)'
        % (len(csv_objects), ))
    self.assertEquals('1', csv_objects[0]['id'],
                      'Invalid id for first CSV object')
    self.assertEquals('hire', csv_objects[0]['class'],
                      'Invalid class for first CSV object')
    self.assertEquals('2', csv_objects[1]['id'],
                      'Invalid id for second CSV object')
    self.assertEquals('hire', csv_objects[1]['class'],
                      'Invalid class for second CSV object')
    self.assertEquals('3', csv_objects[2]['id'],
                      'Invalid id for third CSV object')
    self.assertEquals('nohire', csv_objects[2]['class'],
                      'Invalid class for third CSV object')
    self.assertEquals('4', csv_objects[3]['id'],
                      'Invalid id for fourth CSV object')
    self.assertEquals('hire', csv_objects[3]['class'],
                      'Invalid class for fourth CSV object')
def testLoadMultipleJSONSingleFile(self):
    f = open(os.path.join(BASEDIR, 'stream.data.json'))
    json_objects = []
    for o in streamingiterload(f.readlines()):
        json_objects.append(o)
    f.close()
    self.assertEquals(
        4, len(json_objects),
        'Should have loaded 4 items from file (loaded %s)'
        % (len(json_objects), ))
    self.assertEquals(1, json_objects[0]['id'],
                      'Invalid id for first JSON object')
    self.assertEquals('hire', json_objects[0]['class'],
                      'Invalid class for first JSON object')
    self.assertEquals(2, json_objects[1]['id'],
                      'Invalid id for second JSON object')
    self.assertEquals('hire', json_objects[1]['class'],
                      'Invalid class for second JSON object')
    self.assertEquals(3, json_objects[2]['id'],
                      'Invalid id for third JSON object')
    self.assertEquals('nohire', json_objects[2]['class'],
                      'Invalid class for third JSON object')
    self.assertEquals(4, json_objects[3]['id'],
                      'Invalid id for fourth JSON object')
    self.assertEquals('hire', json_objects[3]['class'],
                      'Invalid class for fourth JSON object')
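# Note on the two tests above: the CSV loader yields string values ('1',
# 'hire') while the JSON loader preserves native types (1, 'hire'), and
# streamingiterload falls back to JSON when no source_format is given.
# A minimal standalone sketch of the same call pattern; the inline sample
# records are made up for illustration:
from cloudml.trainer.streamutils import streamingiterload

sample_lines = ['{"id": 1, "class": "hire"}', '{"id": 2, "class": "nohire"}']
for obj in streamingiterload(sample_lines):  # source_format defaults to JSON
    print('%s -> %s' % (obj['id'], obj['class']))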
def main(argv=None):
    parser = create_parser()
    args = parser.parse_args(argv)
    init_logging(args.debug)
    try:
        with open(args.path, 'r') as fp:
            trainer = load_trainer(fp)
    except (IOError, InvalidTrainerFile) as exc:
        logging.warn('Invalid trainer file: {0!s}'.format(exc))
        print_exception(exc)
        return INVALID_TRAINER

    try:
        iterator = None
        if args.input is not None:
            # Read evaluation data from file.
            eval_fp = open(args.input, 'r')
            file_format = determine_data_format(args.input)
            iterator = streamingiterload(eval_fp, source_format=file_format)
        elif args.extraction is not None:
            # Use the import handler to pull evaluation data.
            try:
                eval_context = list_to_dict(args.eval_params)
                plan = ExtractionPlan(args.extraction)
                eval_handler = ImportHandler(plan, eval_context)
            except ImportHandlerException as e:
                logging.warn('Invalid extraction plan: %s' % e.message)
                print_exception(e)
                return INVALID_EXTRACTION_PLAN
            logging.info('Starting evaluation with params:')
            for key, value in eval_context.items():
                logging.info('%s --> %s' % (key, value))
            iterator = eval_handler
        else:
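# determine_data_format above is assumed to map the input path's extension to
# a stream format, mirroring the inline os.path.splitext call used by the
# transformer entry point earlier in this section. A sketch under that
# assumption (not necessarily the shipped implementation):
import os

def _determine_data_format_sketch(path):
    # 'eval.data.csv' -> 'csv'; 'eval.data.json' -> 'json'
    return os.path.splitext(path)[1][1:]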
def _get_iterator(self, fmt='json'):
    with open(os.path.join(BASEDIR, 'transformers',
                           'train.data.{}'.format(fmt))) as fp:
        self._data = list(streamingiterload(
            fp.readlines(), source_format=fmt))
    return self._data
def get_iterator(dirname, filename, fmt='json'):
    from cloudml.trainer.streamutils import streamingiterload
    with open(os.path.join(BASEDIR, dirname,
                           '{0}.{1}'.format(filename, fmt))) as fp:
        data = list(streamingiterload(fp.readlines(), source_format=fmt))
    return data
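# Example call, reusing the fixture layout seen in _get_iterator above
# (directory and base name are taken from that helper):
# data = get_iterator('transformers', 'train.data', fmt='csv')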
def get_iterator(self, stream):
    from cloudml.trainer.streamutils import streamingiterload
    return streamingiterload(stream, source_format=self.format)
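# Usage sketch for get_iterator: any holder with a `format` attribute can
# delegate parsing this way. The wrapper class below is hypothetical, made up
# only to exercise the method against an in-memory stream:
class _StreamSource(object):
    def __init__(self, fmt):
        self.format = fmt

    def get_iterator(self, stream):
        from cloudml.trainer.streamutils import streamingiterload
        return streamingiterload(stream, source_format=self.format)

for item in _StreamSource('json').get_iterator(['{"id": 1}', '{"id": 2}']):
    print(item)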
try:
    trainer = Trainer(model)
    if args.transformer_path is not None:
        # Defines pretrained transformers path.
        trainer.set_transformer_getter(
            transformer_getter(args.transformer_path))
    test_percent = parse_percent(args.test_percent)
    if args.input is not None:
        # Read training data from file.
        file_format = determine_data_format(args.input)
        with open(args.input, 'r') as train_fp:
            logging.info("Training the model using input file dataset.")
            trainer.train(
                streamingiterload(train_fp, source_format=file_format),
                test_percent,
                store_vect_data=args.store_train_vect is not None)
        if args.store_train_vect is not None:
            logging.info('Storing train vectorized data to %s'
                         % args.store_train_vect)
            trainer.vect_data2csv(args.store_train_vect)
        if test_percent != 0 and args.skip_tests is False \
                and args.test is None:
            with open(args.input, 'r') as test_fp:
                trainer.test(
                    streamingiterload(test_fp, source_format=file_format),