def main():
    parser = argparse.ArgumentParser(description='Feature Generator')
    parser.add_argument('-m', '--module_properties', help='module properties file', required=True)
    parser.add_argument('-d', '--db_properties', help='database connection properties file', required=True)
    parser.add_argument('-s', '--sql_statements', help='sql statements file', required=True)
    args = parser.parse_args()

    module_properties = Utils.read_properties(args.module_properties)
    db_properties = Utils.read_properties(args.db_properties)
    sql_statements = Utils.read_properties(args.sql_statements)

    dao = DB(db_properties, sql_statements)
    feature_generator = FeatureGenerator(dao, module_properties)
    feature_generator.run()
def main():
    parser = argparse.ArgumentParser(description='Learner')
    parser.add_argument('-m', '--module_properties', help='module properties file', required=True)
    parser.add_argument('-d', '--db_properties', help='database connection properties file', required=True)
    parser.add_argument('-s', '--sql_statements', help='sql statements file', required=True)
    args = parser.parse_args()

    module_properties = Utils.read_properties(args.module_properties)
    db_properties = Utils.read_properties(args.db_properties)
    sql_statements = Utils.read_properties(args.sql_statements)

    logging.basicConfig(filename=module_properties['log_file'], filemode='w',
                        format='%(message)s', level=logging.DEBUG)

    dao = DB(db_properties, sql_statements)
    learner = LearnerFactory.new_learner(module_properties, dao)
    learner.run()
def main():
    parser = argparse.ArgumentParser(description='Registry')
    parser.add_argument('-m', '--module_properties', help='module properties file', required=True)
    args = parser.parse_args()

    module_properties = Utils.read_properties(args.module_properties)

    registry = Registry(module_properties)
    registry.run()
def main():
    parser = argparse.ArgumentParser(description='Deployer')
    parser.add_argument('-m', '--module_properties', help='module properties file', required=True)
    args = parser.parse_args()

    module_properties = Utils.read_properties(args.module_properties)

    logging.basicConfig(filename=module_properties['log_file'], filemode='w',
                        format='%(message)s', level=logging.DEBUG)

    deployer = Deployer(module_properties)
    deployer.run()
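# All four entry points above load their configuration through
# Utils.read_properties, whose implementation is not shown here. A minimal
# sketch, assuming plain "key=value" lines with '#' comments (the real parser
# may differ, e.g. it may coerce numeric values):
def read_properties(path):
    props = {}
    with open(path) as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            key, _, value = line.partition('=')
            props[key.strip()] = value.strip()
    return props

# A hypothetical invocation of the learner module with such files would be:
#   python learner.py -m learner.properties -d db.properties -s sql.properties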
def parse_line(self, line):
    # strip the trailing newline
    line = line.splitlines()[0]
    timestamp, rest = line.split('\t')
    fields = rest.split('|')

    clicked = int(fields[0])
    # depth of the session
    depth = int(fields[1])
    position = int(fields[2])
    userid = int(fields[3])

    # user gender indicator (-1 for male, 1 for female)
    gender = int(fields[4])
    if gender != 0:
        gender = int((gender - 1.5) * 2)

    # user age indicator:
    # '1' for (0, 12], '2' for (12, 18], '3' for (18, 24],
    # '4' for (24, 30], '5' for (30, 40], and '6' for greater than 40.
    age = int(fields[5])

    # list of token ids
    tokens = [int(xx) for xx in fields[6].split(',')]

    # hash tokens
    hashed_tokens = {}
    for token in tokens:
        Utils.update_feature(token, 1, hashed_tokens, self._m)
        if userid > 0:
            user_token = str(token) + '_' + str(userid)
            Utils.update_feature(user_token, 1, hashed_tokens, self._m)

    hashed_tokens_arr = []
    for key in sorted(hashed_tokens):
        hashed_tokens_arr.append('%s:%s' % (key + self._regular_features, hashed_tokens[key]))
    tokens_as_str = ' '.join(hashed_tokens_arr)

    # timestamp, label, depth, position, gender, age, categorical_features
    parsed_line = '%s %s 0:%s 1:%s 2:%s 3:%s %s' % (timestamp, clicked, depth,
                                                    position, gender, age, tokens_as_str)
    return parsed_line
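# For illustration only: a made-up input record in the tab/pipe-delimited
# layout parse_line expects (timestamp, then click, depth, position, user id,
# gender, age and a comma-separated token list), and the general shape of the
# line it emits. The hashed token indices depend on Utils.update_feature and
# self._m, so they are shown as placeholders rather than real output.
sample_input = '20120701000000\t1|2|1|12345|1|3|100,200,300'
# parse_line(sample_input) would return something of the form:
#   '20120701000000 1 0:2 1:1 2:-1 3:3 <hash+offset>:<count> <hash+offset>:<count> ...'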
def __init__(self, module_properties, dao, clf):
    self._dao = dao
    self._id = module_properties['id']
    self._warmup_examples = module_properties['warmup_examples']
    self._checkpoint = module_properties['checkpoint']
    self._is_prequential = module_properties['eval_mode'] == 'prequential'
    self._parser = ParserFactory.new_parser(module_properties)
    self._evaluator = EvaluatorFactory.new_evaluator(module_properties)
    self._offline_test = None
    if not self._is_prequential:
        self._offline_test = self._parser.parse(module_properties['offline_test'])
    self._classes = np.array(map(int, module_properties['classes'].split(',')))
    self._server_port = module_properties['server_port']

    ##### Recovery and adding learners on the fly #########
    self._clf, timestamp = self._dao.get_model(self._id)
    self._checkpointed = None
    # if there was a checkpoint, see if there are historical points beyond it and train on them
    if self._clf:
        self._checkpointed = True
        examples = self._dao.get_examples_greater_than(timestamp)
        if examples:
            print 'catching up checkpointed model with historical points...'
            X, y, timestamps = self._parser.parse_feature(examples)
            self._clf.partial_fit(X, y)
        else:
            print 'no historical points to catch up the checkpointed model'
            # will use the last metric saved
    else:
        self._checkpointed = False
        self._clf = clf
        examples = self._dao.get_examples()
        if examples:
            print 'catching up new model with historical points'
            X, y, timestamps = self._parser.parse_feature(examples)
            self._clf.partial_fit(X, y, self._classes)
        else:
            print 'no historical points to catch up the new model'
    #######################################

    self._registry = RegistryClient(module_properties['registry'])
    hostname = socket.gethostname()
    address = Utils.get_address(hostname, self._server_port)
    self._stream_client_address = self._registry.reg(ComponentType.LEARNER, address)[0]
    self._handler = LearnerHandler(self._stream_client_address, self._registry, self._parser,
                                   self._evaluator, self._dao, self._clf, self._classes,
                                   self._warmup_examples, self._id, self._checkpoint,
                                   self._is_prequential, self._checkpointed, self._offline_test)
    self._processor = StreamService.Processor(self._handler)
    self._stream_server = Server(self._processor, self._server_port, module_properties['multi_threading'])
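# The clf argument is used here only through partial_fit, with the full class
# list supplied on the first call, which matches scikit-learn's incremental
# learning API. A minimal sketch of a classifier that LearnerFactory could
# hand to this constructor, assuming SGDClassifier (the real factory may pick
# a different estimator or hyperparameters):
import numpy as np
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier()
classes = np.array([0, 1])
X_batch = np.array([[0.0, 1.0], [1.0, 0.0]])
y_batch = np.array([0, 1])
# the first partial_fit call must announce every class the stream can produce
clf.partial_fit(X_batch, y_batch, classes=classes)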
def __init__(self, dao, module_properties):
    self._dao = dao
    self._parser = ParserFactory.new_parser(module_properties)
    self._historical_batches = module_properties["historical_batches"]
    self._registry = RegistryClient(module_properties["registry"])
    self._server_port = module_properties["server_port"]

    hostname = socket.gethostname()
    address = Utils.get_address(hostname, self._server_port)
    self._stream_clients_addresses = self._registry.reg(ComponentType.FEATGEN, address)

    self._processor = StreamService.Processor(
        FeatureGeneratorHandler(
            self._registry,
            self._parser,
            self._dao,
            self._historical_batches,
            self._stream_clients_addresses
        )
    )
    self._stream_server = Server(self._processor, self._server_port, module_properties["multi_threading"])
def get_model(self, id):
    print 'retrieving model %s' % id
    cur = self._conn.cursor()
    cur.execute(self._sql['get_model'], {'id': id})
    tupl = cur.fetchone()
    model = None
    timestamp = None
    if tupl:
        timestamp = tupl[0]
        model = Utils.deserialize(tupl[1])
        print 'retrieved model %s' % id
    else:
        print 'model %s not found' % id
    cur.close()
    return model, timestamp
def run(self):
    hostname = socket.gethostname()
    address = Utils.get_address(hostname, self._server_port)
    self._registry.reg(ComponentType.DEPLOYER, address)
    self._stream_server.start()
def __init__(self, address):
    host, port = Utils.get_address_components(address)
    self._host = host
    self._port = port
    self._address = address
def update_model(self, id, timestamp, model):
    print 'updating model %s' % id
    cur = self._conn.cursor()
    cur.execute(self._sql['update_model'],
                {'id': id, 'timestamp': timestamp, 'data': Utils.serialize(model), 'cond': id})
    self._conn.commit()
    cur.close()
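# get_model and update_model read their SQL from the statements file loaded in
# main(); the statements themselves are not shown. A plausible sketch, assuming
# a single models table and psycopg2-style named placeholders (%(name)s) to
# match the dict parameters passed to cur.execute (the real statements, table
# layout and driver may differ):
SQL = {
    'get_model':
        'SELECT timestamp, data FROM models WHERE id = %(id)s',
    'update_model':
        'UPDATE models SET id = %(id)s, timestamp = %(timestamp)s, '
        'data = %(data)s WHERE id = %(cond)s',
}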