def __init__(self, mongo_connection, models_dir):
    """
    Wire together the repository, the text pre-processor and the models.

    :param mongo_connection: value handed to MongoRepository
    :param models_dir: value handed to ModelFacade together with the repository
    """
    # Repository first: its documents are loaded before the facade is built.
    repository = MongoRepository(mongo_connection)
    self.mongo_repository = repository
    repository.load_all_documents()

    # The NLP wrapper is only needed to build the feature processor.
    self.feature_processor = FeatureProcessor(NlpWrapper())

    facade = ModelFacade(repository, models_dir)
    self.model_facade = facade
    facade.load_models()
    facade.tf2wv.load_weighted_vector()
def __init__(self, server, competition, competition_config):
    """
    Construct the DatastreamServicer Class.

    Creates the Kafka producers, derives the topic names from the competition
    name, registers an (initially empty) data document for the competition in
    MongoDB and loads the gRPC modules generated for this competition.

    :param server: Kafka server IP address
    :param competition: Competition object
    :param competition_config: Competition configuration object
    """
    # Local import so this method does not depend on a module-level import.
    import logging

    self.server = server
    self.producer = ProducerToMongoSink(server)
    self.predictions_producer = ProducerToMongoSink(server)
    self.kafka_producer = Producer({'bootstrap.servers': server})
    self.consumers_dict = {}
    self.repo = MongoRepository(_MONGO_HOST)
    self.competition = competition

    # Three topics share one base name (lower-cased competition name, no spaces):
    #   input_topic  = base
    #   output_topic = base + 'data'
    #   spark_topic  = base + 'predictions'
    topic_base = competition.name.lower().replace(" ", "")
    self.input_topic = topic_base
    self.output_topic = topic_base + 'data'
    self.spark_topic = topic_base + 'predictions'

    try:
        # Document with the fields competition_id and dataset,
        # inserted into db 'data', collection 'data'.
        data_object = {
            'competition_id': str(self.competition.competition_id),
            'dataset': [],
        }
        self.repo.insert_document('data', 'data', data_object)
    except Exception:
        # Still best-effort (construction continues), but no longer silent.
        logging.exception("Could not insert the initial data document for competition %s",
                          getattr(self.competition, 'name', None))

    # Import the right gRPC module
    # file_path: ../local/data/uploads/competition_generated_code/competition_name -> file_pb2.py
    generated_dir = os.path.join(_UPLOAD_REPO, _COMPETITION_GENERATED_CODE, self.competition.name)
    pb2_file_path = os.path.join(generated_dir, 'file_pb2.py')
    # grpc file path: ../local/data/uploads/competition_generated_code/competition_name -> file_pb2_grpc.py
    pb2_grpc_file_path = os.path.join(generated_dir, 'file_pb2_grpc.py')

    # import parent modules
    self.file_pb2 = imp.load_source('file_pb2', pb2_file_path)
    self.file_pb2_grpc = imp.load_source('file_pb2_grpc', pb2_grpc_file_path)
    # NOTE(review): the next two calls load the same source files again under
    # dotted names; imp.load_source returns a module, not a class - confirm intent.
    self.DataStreamer = imp.load_source('file_pb2_grpc.DataStreamerServicer', pb2_grpc_file_path)
    self.Message = imp.load_source('file_pb2.Message', pb2_file_path)

    # Target names are the configuration keys with spaces removed.
    self.targets = [str(key).replace(' ', '') for key in competition_config.keys()]

    # NOTE(review): this only sets an instance attribute named __bases__;
    # it does not change the class hierarchy - confirm whether it is needed.
    self.__bases__ = (self.DataStreamer,)  # ??
def __init__(self, kafka_server, prediction_topic, golden_topic, measures_topic, competition, configuration):
    """
    Set up the Kafka consumer and the MongoDB handles.

    :param kafka_server: value used for 'bootstrap.servers'
    :param prediction_topic: first subscribed topic
    :param golden_topic: second subscribed topic
    :param measures_topic: third subscribed topic
    :param competition: object providing initial_training_time
    :param configuration: stored unchanged as self.config
    """
    self.competition = competition
    self.config = configuration
    self.prediction_topic = prediction_topic
    self.golden_topic = golden_topic
    self.measures_topic = measures_topic

    consumer_settings = {
        'group.id': 'spark_measures',
        'bootstrap.servers': kafka_server,
        # session timeout is derived from the competition's initial training time
        'session.timeout.ms': competition.initial_training_time * 10000,
        'auto.offset.reset': 'earliest',
        'allow.auto.create.topics': True,
    }
    self.consumer = Consumer(consumer_settings)
    subscribed_topics = [prediction_topic, golden_topic, measures_topic]
    self.consumer.subscribe(subscribed_topics)

    self.mongo_repository = MongoRepository(_MONGO_HOST)
    self.db_evaluations = self.mongo_repository.client['evaluation_measures']
    self.db_data = self.mongo_repository.client['data']
def __init__(self, kafka_server, topic, competition):
    """
    Create a Kafka consumer subscribed to a single topic plus a Mongo repository.

    :param kafka_server: value used for 'bootstrap.servers'
    :param topic: the one topic the consumer subscribes to
    :param competition: object providing initial_training_time
    """
    session_timeout_ms = competition.initial_training_time * 10000
    self.consumer = Consumer({
        'bootstrap.servers': kafka_server,
        'group.id': 'data',
        'session.timeout.ms': session_timeout_ms,
        'auto.offset.reset': 'earliest',
    })
    self.consumer.subscribe([topic])
    self.mongo_repository = MongoRepository(_MONGO_HOST)
import yaml
from repository import MongoRepository

if __name__ == '__main__':
    # Dump the three question collections to text files under data/.

    # Context manager so the config file handle is closed right after parsing.
    with open("config.yml") as config_file:
        config = yaml.safe_load(config_file)
    # Not used below; still read so a missing key fails exactly as before.
    data_dir = config['data_dir']
    mongo_connection = config['mongo_connection']
    mongo_repository = MongoRepository(mongo_connection)

    # (output file, repository attribute holding the collection), in write order.
    exports = (
        ('data/questions.txt', 'questions'),
        ('data/preprocessed_questions.txt', 'preprocessed_questions'),
        ('data/processed_questions.txt', 'processed_questions'),
    )
    for output_path, collection_attr in exports:
        with open(output_path, 'w') as f:
            f.writelines(
                mongo_repository.iterate_questions(
                    collection=getattr(mongo_repository, collection_attr),
                    separator=True))
def _create_evaluation_spark(kafka_server, competition, competition_config):
    """
    Creates new Spark session to handle given competition.

    Builds the message schemas and the list of columns to aggregate from the
    competition configuration and the measures stored in MongoDB
    (db 'evaluation_measures', collection 'standard_measures'), then hands
    everything to sparkEvaluator.evaluate. The session is stopped afterwards,
    also when evaluation raises.

    :param kafka_server: IP address and port of the Kafka server to read from and write to
    :param competition: Competition object
    :param competition_config: Competition config; keys are target names, values are
        iterables of measure names
    :return: None
    """
    # Create Spark Session for online evaluation job
    spark_context = (
        SparkSession.builder
        .appName("Kafka_structured_streaming")
        .master(spark_master)
        .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0')
        .config('spark.driver.host', SPARK_DRIVER_HOST)
        .config('spark.driver.port', SPARK_DRIVER_PORT)
        .config('spark.blockManager.port', SPARK_BLOCKMANAGER_PORT)
        .config('spark.executor.memory', '2g')
        .config('spark.network.timeout', 800)
        .config('spark.cleaner.referenceTracking.cleanCheckpoints', "true")
        .config('spark.shuffle.compress', 'true')
        .config('spark.checkpoint.compress', 'true')
        .config('spark.sql.shuffle.partitions', 60)
        .getOrCreate()
    )

    try:
        mongo = MongoRepository(_MONGO_HOST)
        collection = mongo.client['evaluation_measures']['standard_measures']

        # Names of the known measures, split by measure type.
        regression_measures = []
        classification_measures = []
        for m in collection.find({}):
            if m['type'] == 'regression':
                regression_measures.append(m['name'])
            if m['type'] == 'classification':
                classification_measures.append(m['name'])

        # (target name without spaces, measures) pairs. The measures are looked up
        # with the ORIGINAL key, so keys containing spaces no longer raise KeyError.
        target_measures = [
            (str(key).replace(' ', ''), competition_config[key])
            for key in competition_config.keys()
        ]
        targets = [target for target, _ in target_measures]

        # Fields for published message
        train_schema = StructType() \
            .add("Deadline", StringType(), False) \
            .add("Released", StringType(), False) \
            .add("competition_id", IntegerType(), False) \
            .add("rowID", IntegerType(), False)

        # Fields for prediction
        prediction_schema = StructType() \
            .add("rowID", IntegerType(), False) \
            .add("submitted_on", StringType(), False) \
            .add("prediction_competition_id", IntegerType(), False) \
            .add("user_id", IntegerType(), False)

        for target, measures in target_measures:
            prediction_field = "prediction_" + target
            for measure in measures:
                # Decide whether it is regression or classification
                if measure in regression_measures:
                    prediction_type = FloatType()
                elif measure in classification_measures:
                    prediction_type = StringType()
                else:
                    # Unknown measure: contributes no schema field.
                    continue
                if target not in train_schema.fieldNames():
                    train_schema.add(target, StringType(), False)
                # Guard on the name that is actually added, so a target with
                # several measures gets exactly one prediction field.
                if prediction_field not in prediction_schema.fieldNames():
                    prediction_schema.add(prediction_field, prediction_type, False)

        # Time window duration for watermarking
        window_duration = str(competition.predictions_time_interval) + " " + "seconds"
        prediction_window_duration = str(
            12 * competition.predictions_time_interval) + " " + "seconds"

        # Column names on which aggregations are applied: the fixed ones plus one
        # "<measure>_<target>" column per configured measure, e.g. "MAPE_Valeurs".
        columns_to_sum = ["latency", "num_submissions", "penalized", "total_num"]
        for target, measures in target_measures:
            for measure in measures:
                measure_col = str(measure) + "_" + target
                if measure_col not in columns_to_sum:
                    columns_to_sum.append(measure_col)

        checkpoint_base = "/tmp/" + competition.name.lower().replace(" ", "")
        checkpoint_locations = [
            checkpoint_base + "prediction_checkpoint",
            checkpoint_base + "training_checkpoint",
            checkpoint_base + "measure_checkpoint",
        ]

        sparkEvaluator.evaluate(
            spark_context=spark_context,
            kafka_broker=kafka_server,
            competition=competition,
            competition_config=competition_config,
            window_duration=window_duration,
            prediction_window_duration=prediction_window_duration,
            train_schema=train_schema,
            prediction_schema=prediction_schema,
            columns_to_sum=columns_to_sum,
            checkpoints=checkpoint_locations,
            targets=targets)
    finally:
        # Release the session even when schema building or evaluation fails.
        spark_context.stop()
class QueryExecutor:
    """
    Answers free-text queries using the stored questions and the loaded models.
    """

    def __init__(self, mongo_connection, models_dir):
        """
        Load the documents and the models needed to answer queries.

        :param mongo_connection: value handed to MongoRepository
        :param models_dir: value handed to ModelFacade
        """
        self.mongo_repository = MongoRepository(mongo_connection)
        self.mongo_repository.load_all_documents()
        nlp = NlpWrapper()
        self.feature_processor = FeatureProcessor(nlp)
        self.model_facade = ModelFacade(self.mongo_repository, models_dir)
        self.model_facade.load_models()
        self.model_facade.tf2wv.load_weighted_vector()

    def process_input(self, text):
        """
        Retrieve answers for *text* and print the matching documents and similar words.

        :param text: raw query text
        """
        answers = self.retrieve_answers(text, threshold=0.78)
        self.print_documents_for(answers["scores"])
        self.print_similar_words(answers["token_map"])

    def retrieve_answers(self, text, threshold=0.78, topn=20):
        """
        Pre-process *text*, score the documents and look up similar words.

        :param text: raw query text
        :param threshold: passed to the similar-word lookup
        :param topn: passed to the similar-word lookup
        :return: dict with the keys "scores", "token_map" and "tokens_not_found"
        """
        logging.info("Retrieving answers for {}".format(text))
        text = self.feature_processor(text)
        tokens = text.lower().split()
        logging.info("Tokens after preprocessing : {}".format(tokens))
        trigrams, scores = self.model_facade.similar_doc(tokens)

        # No scored document: nothing to expand, every trigram counts as not found.
        if not len(scores) > 0:
            return {
                "scores": [],
                "token_map": {},
                "tokens_not_found": trigrams
            }

        token_map = self.model_facade.doc2vecFacade.retrieve_similar_words(
            trigrams, threshold=threshold, topn=topn)
        tokens_not_found = [word for word in trigrams if word not in token_map]
        return {
            "scores": scores,
            "token_map": token_map,
            "tokens_not_found": tokens_not_found
        }

    def retrieve_similar_words(self, tokens, threshold=0.78, topn=30):
        """
        Return the similar-word map for *tokens*.

        :param tokens: tokens to look up
        :param threshold: passed through to the lookup
        :param topn: passed through to the lookup
        :return: whatever doc2vecFacade.retrieve_similar_words returns
        """
        return self.model_facade.doc2vecFacade.retrieve_similar_words(
            tokens, threshold=threshold, topn=topn)

    def retrieve_documents(self, all_scores, page_id):
        """
        Build the result entries for one page of scores.

        :param all_scores: sequence of (id, score, tfidf_score, wv_score) tuples
        :param page_id: zero-based page index; a page holds PER_PAGE entries
        :return: list of dicts with question, answer and the three scores
        """
        all_documents = []
        page_scores = all_scores[PER_PAGE * page_id:PER_PAGE * (page_id + 1)]
        for doc_id, score, tfidf_score, wv_score in page_scores:
            mongo_document = self.mongo_repository.get_preprocessed_question(doc_id)
            # First line is the question, the remaining lines are the answer.
            lines_answer = mongo_document.split('\n')
            all_documents.append({
                "question": lines_answer[0],
                "answer": "\n".join(lines_answer[1:]),
                "score": score,
                "tfidf_score": tfidf_score,
                "wv_score": wv_score
            })
        return all_documents

    def print_documents_for(self, scores):
        """
        Print the stored document for every score entry.

        :param scores: iterable of tuples whose first element is the document id;
            extra elements are ignored, so both (id, score) pairs and the
            4-tuples consumed by retrieve_documents are accepted
        """
        for doc_id, *_ in scores:
            mongo_document = self.mongo_repository.get_preprocessed_question(doc_id)
            print("======" + str(doc_id) + "============================")
            print(mongo_document)

    def print_similar_words(self, tokenmap):
        """
        Print every token followed by the first element of each of its entries.

        :param tokenmap: mapping of token -> iterable of indexable entries
        """
        print(" ============= SIMILAR WORDS ======W===============")
        for key, values in tokenmap.items():
            print(key, ", ".join([v[0] for v in values]))
class DataStreamerServicer:
    """
    Datastream Servicer handles the communication with users. Sends the datastream records and handles the
    predictions that are sent by users. It creates the topics. Starts Kafka producers and loads the data structures
    for communication with users, generated from .proto file.
    """

    def __init__(self, server, competition, competition_config):
        """
        Construct the DatastreamServicer Class.

        Creates the Kafka producers, derives the topic names from the competition
        name, registers an (initially empty) data document for the competition in
        MongoDB and loads the gRPC modules generated for this competition.

        :param server: Kafka server IP address
        :param competition: Competition object
        :param competition_config: Competition configuration object
        """
        self.server = server
        self.producer = ProducerToMongoSink(server)
        self.predictions_producer = ProducerToMongoSink(server)
        self.kafka_producer = Producer({'bootstrap.servers': server})
        self.consumers_dict = {}
        self.repo = MongoRepository(_MONGO_HOST)
        self.competition = competition

        # Three topics share one base name (lower-cased competition name, no spaces):
        #   input_topic  = base
        #   output_topic = base + 'data'
        #   spark_topic  = base + 'predictions'
        topic_base = competition.name.lower().replace(" ", "")
        self.input_topic = topic_base
        self.output_topic = topic_base + 'data'
        self.spark_topic = topic_base + 'predictions'

        try:
            # Document with the fields competition_id and dataset,
            # inserted into db 'data', collection 'data'.
            data_object = {
                'competition_id': str(self.competition.competition_id),
                'dataset': [],
            }
            self.repo.insert_document('data', 'data', data_object)
        except Exception:
            # Still best-effort (construction continues), but no longer silent.
            logging.exception("Could not insert the initial data document for competition %s",
                              getattr(self.competition, 'name', None))

        # Import the right gRPC module
        # file_path: ../local/data/uploads/competition_generated_code/competition_name -> file_pb2.py
        generated_dir = os.path.join(_UPLOAD_REPO, _COMPETITION_GENERATED_CODE, self.competition.name)
        pb2_file_path = os.path.join(generated_dir, 'file_pb2.py')
        # grpc file path: ../local/data/uploads/competition_generated_code/competition_name -> file_pb2_grpc.py
        pb2_grpc_file_path = os.path.join(generated_dir, 'file_pb2_grpc.py')

        # import parent modules
        self.file_pb2 = imp.load_source('file_pb2', pb2_file_path)
        self.file_pb2_grpc = imp.load_source('file_pb2_grpc', pb2_grpc_file_path)
        # NOTE(review): the next two calls load the same source files again under
        # dotted names; imp.load_source returns a module, not a class - confirm intent.
        self.DataStreamer = imp.load_source('file_pb2_grpc.DataStreamerServicer', pb2_grpc_file_path)
        self.Message = imp.load_source('file_pb2.Message', pb2_file_path)

        # Target names are the configuration keys with spaces removed.
        self.targets = [str(key).replace(' ', '') for key in competition_config.keys()]

        # NOTE(review): this only sets an instance attribute named __bases__;
        # it does not change the class hierarchy - confirm whether it is needed.
        self.__bases__ = (self.DataStreamer,)  # ??

    def sendData(self, request_iterator, context):
        """
        After the user has initialized the communication with the server.
        It checks user's credentials and starts sending the data records.

        This is a generator: it yields one Message per record polled from the
        input topic. When a credential check fails, the gRPC status code and
        details are set on the context and the stream ends without yielding.
        A background thread running receive_predictions consumes
        request_iterator while records are being sent.

        :param request_iterator: Sent by the user through gRPC/Protobuf protocol
        :param context: gRPC context; its invocation metadata must contain
            'authorization', 'user_id' and 'competition_id'
        :return: generator of Message objects
        """
        metadata = dict(context.invocation_metadata())
        token = metadata['authorization']
        user_id = metadata['user_id']
        competition_code = metadata['competition_id']

        # Each repository is created right before use and cleaned up right after,
        # so an early return below does not leave unused repositories open.
        user_repo = UserRepository(_SQL_HOST, _SQL_DBNAME)
        user = user_repo.get_user_by_email(user_id)
        user_repo.cleanup()
        if user is None:
            context.set_code(grpc.StatusCode.PERMISSION_DENIED)
            context.set_details('You are not registered, please register on the website')
            return self.file_pb2.Message()

        competition_repo = CompetitionRepository(_SQL_HOST, _SQL_DBNAME)
        competition = competition_repo.get_competition_by_code(competition_code)
        competition_repo.cleanup()
        if competition is None:
            context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
            context.set_details('Unknown competition, please refer to the website')
            return self.file_pb2.Message()

        # TODO : for Subscription Data
        subscription_repo = SubscriptionRepository(_SQL_HOST, _SQL_DBNAME)
        subscription = subscription_repo.get_subscription(competition.competition_id, user.user_id)
        subscription_repo.cleanup()
        if subscription is None:
            # TODO : Should close connection
            context.set_code(grpc.StatusCode.PERMISSION_DENIED)
            context.set_details('You are not allowed to participate, please subscribe to the competition on website')
            return self.file_pb2.Message()

        # TODO : check secret token
        decoded_token = decode_subscription_token(token)
        if decoded_token is None:
            print('Wrong token')
            # TODO : should close connection
            context.set_code(grpc.StatusCode.UNAUTHENTICATED)
            context.set_details('Please check your authentication credentials, Wrong Token!')
            return self.file_pb2.Message()

        # The payload is the second element of the decoded token.
        decoded_token = decoded_token[1]
        token_competition_id = decoded_token['competition_id']
        token_user_id = decoded_token['user_id']
        if int(token_competition_id) != int(competition.competition_id) or token_user_id != user_id:
            # TODO : should close channel
            print('Using wrong token for this competition')
            context.set_code(grpc.StatusCode.UNAUTHENTICATED)
            context.set_details('Please check your authentication token, the secret key does not match')
            return self.file_pb2.Message()

        # Streaming stops five prediction intervals after the competition end date.
        end_date = self.competition.end_date + 5 * datetime.timedelta(
            seconds=self.competition.predictions_time_interval)

        # One Kafka consumer per user, reused across calls.
        if user_id in self.consumers_dict:
            consumer = self.consumers_dict[user_id]
        else:
            consumer = Consumer({'group.id': user_id,
                                 'bootstrap.servers': self.server,
                                 'session.timeout.ms': competition.initial_training_time * 10000,
                                 'auto.offset.reset': 'latest'})  # 172.22.0.2:9092
            consumer.subscribe([self.input_topic])
            self.consumers_dict[user_id] = consumer

        # The receiver thread reads this flag through the 'stop' callable.
        stop_thread = False
        receiver = None
        try:
            receiver = threading.Thread(target=receive_predictions,
                                        kwargs={'predictions': request_iterator,
                                                'competition_id': self.competition.competition_id,
                                                'user_id': user.user_id,
                                                'end_date': end_date,
                                                'kafka_producer': self.kafka_producer,
                                                'spark_topic': self.spark_topic,
                                                'targets': self.targets,
                                                'stop': lambda: stop_thread})  # use default name
            receiver.start()
        except Exception as e:
            print(str(e))
            # Nothing to join later if the thread could not be started.
            receiver = None

        try:
            while context.is_active():
                message = consumer.poll(timeout=0)
                if message is not None:
                    try:
                        values = orjson.loads(message.value())
                        json_string = json.dumps(values, default=json_util.default)
                        outgoing = self.file_pb2.Message()
                        # Parse fills `outgoing` in place.
                        json_format.Parse(json_string, outgoing, ignore_unknown_fields=True)
                        time.sleep(0.01)
                        if context.is_active():
                            yield outgoing
                        else:
                            break
                    except Exception:
                        # A bad record must not end the stream, but is no longer dropped silently.
                        logging.exception("Could not forward a record from topic %s", self.input_topic)
                # Checked on every iteration, so an idle stream also ends after end_date.
                if datetime.datetime.now() > end_date:
                    break
        finally:
            # Signal the receiver even when the generator is closed early.
            stop_thread = True

        logging.debug("disconnect")
        if receiver is not None:
            receiver.join()
parser.add_argument('--append', dest="append", action='store_true')
parser.set_defaults(import_qa=False,
                    process=False,
                    print_files=False,
                    model=False,
                    append=False)
args = parser.parse_args()

# Context manager so the config file handle is closed right after parsing.
with open("../config.yml") as config_file:
    config = yaml.safe_load(config_file)

model_dir = config['models_dir']
output_dir = config['output_dir']
# Both directories are created up front; existing ones are left alone.
os.makedirs(model_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)
data_dirs = config['data_dirs']
mongo_connection = config['mongo_connection']
mongo_repository = MongoRepository(mongo_connection)

process_documents(data_dirs,
                  output_dir,
                  do_import=args.import_qa,
                  do_process=args.process,
                  do_print_files=args.print_files,
                  append=args.append)

# The model is only (re)built when the model flag is set.
if args.model:
    logging.info("Started creation of model...")
    model_facade = ModelFacade(mongo_repository, model_dir)
    model_facade.create_model()
    logging.info("Finished creation of model...")
import yaml
from language import NlpWrapper
from repository import MongoRepository
from feature_extract import FeatureProcessor

if __name__ == '__main__':
    # Run the feature processor over the preprocessed questions and store the
    # result in the processed-questions collection.

    # Context manager so the config file handle is closed right after parsing.
    with open("config.yml") as config_file:
        config = yaml.safe_load(config_file)
    # Not used below; still read so a missing key fails exactly as before.
    data_dir = config['data_dir']
    mongo_connection = config['mongo_connection']
    mongo_repository = MongoRepository(mongo_connection)

    nlp = NlpWrapper()
    feature_processor = FeatureProcessor(nlp)
    mongo_repository.process_questions(
        source_collection=mongo_repository.preprocessed_questions,
        target_collection=mongo_repository.processed_questions,
        processor=feature_processor)
import yaml
from repository import MongoRepository

if __name__ == '__main__':
    # Import the questions found under the configured data directory.

    # Context manager so the config file handle is closed right after parsing.
    with open("config.yml") as config_file:
        config = yaml.safe_load(config_file)
    data_dir = config['data_dir']
    mongo_connection = config['mongo_connection']
    mongo_repository = MongoRepository(mongo_connection)
    mongo_repository.import_questions(data_dir)
try: _SQL_DBNAME = os.environ['SQL_DBNAME'] except Exception: _SQL_DBNAME = config['SQL_DBNAME'] try: _MONGO_HOST = os.environ['MONGO_HOST'] except Exception: _MONGO_HOST = config['MONGO_HOST'] _COMPETITION_REPO = CompetitionRepository(_SQL_HOST, _SQL_DBNAME) _DATASTREAM_REPO = DatastreamRepository(_SQL_HOST, _SQL_DBNAME) _USER_REPO = UserRepository(_SQL_HOST, _SQL_DBNAME) _SUBSCRIPTION_REPO = SubscriptionRepository(_SQL_HOST, _SQL_DBNAME) _MONGO_REPO = MongoRepository(_MONGO_HOST) # Standard evaluation measures, should be written in MongoDB if they don't exist there already standard_measures = [{ 'id': 1, 'name': 'MAPE', 'type': 'regression' }, { 'id': 2, 'name': 'MSE', 'type': 'regression' }, { 'id': 3, 'name': 'MAE', 'type': 'regression' }, {