Example #1
 def __init__(self, mongo_connection, models_dir):
     self.mongo_repository = MongoRepository(mongo_connection)
     self.mongo_repository.load_all_documents()
     nlp = NlpWrapper()
     self.feature_processor = FeatureProcessor(nlp)
     self.model_facade = ModelFacade(self.mongo_repository, models_dir)
     self.model_facade.load_models()
     self.model_facade.tf2wv.load_weighted_vector()
Example #2
    def __init__(self, server, competition, competition_config):
        """
        Construct the DatastreamServicer Class. Once the communication is triggered by the user, it starts publishing
        messages to be sent to that user and at the same time activates module for receiving the predictions.

        :param server: Kafka server IP address
        :param competition: Competition object
        :param competition_config: Competition configuration object
        """
        self.server = server
        self.producer = ProducerToMongoSink(server)
        self.predictions_producer = ProducerToMongoSink(server)
        conf_producer = {'bootstrap.servers': server}
        self.kafka_producer = Producer(conf_producer)
        self.consumers_dict = {}

        self.repo = MongoRepository(_MONGO_HOST)
        self.competition = competition

        # Defining three topics: input (competition name), output/data (competition name + 'data')
        # and spark/predictions (competition name + 'predictions')
        self.input_topic = competition.name.lower().replace(" ", "")
        self.output_topic = competition.name.lower().replace(" ", "") + 'data'
        self.spark_topic = competition.name.lower().replace(" ", "") + 'predictions'

        try:
            # create data_object dictionary with following fields: competition_id, dataset
            data_object = {}
            data_object['competition_id'] = str(self.competition.competition_id)
            data_object['dataset'] = []
            # Insert document in mongo repository with db name: data, collection name: data
            self.repo.insert_document('data', 'data', data_object)
        except Exception:
            # Ignore insertion failures (e.g. the data document may already exist)
            pass

        # Import the right gRPC module
        # file_path: ../local/data/uploads/competition_generated_code/competition_name -> file_pb2.py
        pb2_file_path = os.path.join(_UPLOAD_REPO, _COMPETITION_GENERATED_CODE, self.competition.name, 'file_pb2.py')
        # grpc file path: ../local/data/uploads/competition_generated_code/competition_name -> file_pb2_grpc.py
        pb2_grpc_file_path = os.path.join(_UPLOAD_REPO, _COMPETITION_GENERATED_CODE, self.competition.name,
                                          'file_pb2_grpc.py')
        # import parent modules
        self.file_pb2 = imp.load_source('file_pb2', pb2_file_path)
        self.file_pb2_grpc = imp.load_source('file_pb2_grpc', pb2_grpc_file_path)
        # import classes
        self.DataStreamer = imp.load_source('file_pb2_grpc.DataStreamerServicer', pb2_grpc_file_path)
        self.Message = imp.load_source('file_pb2.Message', pb2_file_path)

        self.targets = []

        for key in competition_config.keys():
            y = str(key).replace(' ', '')  # Key
            self.targets.append(y)

        self.__bases__ = (self.DataStreamer,)  # ??
Example #3
 def __init__(self, kafka_server, prediction_topic, golden_topic,
              measures_topic, competition, configuration):
     self.consumer = Consumer({
         'group.id': 'spark_measures',
         'bootstrap.servers': kafka_server,
         'session.timeout.ms': competition.initial_training_time * 10000,
         'auto.offset.reset': 'earliest',
         'allow.auto.create.topics': True
     })
     self.consumer.subscribe([prediction_topic, golden_topic, measures_topic])
     self.mongo_repository = MongoRepository(_MONGO_HOST)
     self.db_evaluations = self.mongo_repository.client['evaluation_measures']
     self.competition = competition
     self.config = configuration
     self.prediction_topic = prediction_topic
     self.golden_topic = golden_topic
     self.measures_topic = measures_topic
     self.db_data = self.mongo_repository.client['data']
Example #4
 def __init__(self, kafka_server, topic, competition):
     conf = {
         'bootstrap.servers': kafka_server,
         'group.id': 'data',
         'session.timeout.ms': competition.initial_training_time * 10000,
         'auto.offset.reset': 'earliest'
     }
     self.consumer = Consumer(conf)
     self.consumer.subscribe([topic])
     self.mongo_repository = MongoRepository(_MONGO_HOST)
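For context, a consumer configured like the one above is normally drained with a simple poll loop. The sketch below is illustrative only: it reuses the conf dictionary and topic from the example and assumes a hypothetical handle_record callback for the application logic.

import json
from confluent_kafka import Consumer

consumer = Consumer(conf)                  # conf as built in the example above
consumer.subscribe([topic])
try:
    while True:
        msg = consumer.poll(timeout=1.0)   # wait up to one second for a record
        if msg is None:
            continue
        if msg.error():
            print(msg.error())             # log and skip broker/partition errors
            continue
        record = json.loads(msg.value())   # payloads are JSON-encoded in these examples
        handle_record(record)              # hypothetical application callback
finally:
    consumer.close()                       # commit offsets and leave the group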
Example #5
import yaml
from repository import MongoRepository

if __name__ == '__main__':
    config = yaml.safe_load(open("config.yml"))
    data_dir = config['data_dir']
    mongo_connection = config['mongo_connection']
    mongo_repository = MongoRepository(mongo_connection)

    with open('data/questions.txt', 'w') as f:
        f.writelines(
            mongo_repository.iterate_questions(
                collection=mongo_repository.questions, separator=True))

    with open('data/preprocessed_questions.txt', 'w') as f:
        f.writelines(
            mongo_repository.iterate_questions(
                collection=mongo_repository.preprocessed_questions,
                separator=True))

    with open('data/processed_questions.txt', 'w') as f:
        f.writelines(
            mongo_repository.iterate_questions(
                collection=mongo_repository.processed_questions,
                separator=True))
Example #6
def _create_evaluation_spark(kafka_server, competition, competition_config):
    """
    Creates new Spark session to handle given competition. Spark program consumes the messages from Kafka
    to do online evaluation and then store the metrics, predictions and original instances in MongoDB.

    :param kafka_server: IP address and port of the Kafka server to read from and write to
    :param competition: Competition object
    :param competition_config: Competition config
    :return:
    """
    # Create Spark Session for online evaluation job
    spark_context = SparkSession \
        .builder \
        .appName("Kafka_structured_streaming") \
        .master(spark_master) \
        .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0') \
        .config('spark.driver.host', SPARK_DRIVER_HOST) \
        .config('spark.driver.port', SPARK_DRIVER_PORT) \
        .config('spark.blockManager.port', SPARK_BLOCKMANAGER_PORT) \
        .config('spark.executor.memory', '2g') \
        .config('spark.network.timeout', 800) \
        .config('spark.cleaner.referenceTracking.cleanCheckpoints', "true") \
        .config('spark.shuffle.compress', 'true') \
        .config('spark.checkpoint.compress', 'true') \
        .config('spark.sql.shuffle.partitions', 60) \
        .getOrCreate()

    mongo = MongoRepository(_MONGO_HOST)

    db = mongo.client['evaluation_measures']
    collection = db['standard_measures']
    measures = collection.find({})
    regression_measures = []
    classification_measures = []
    for m in measures:
        if m['type'] == 'regression':
            regression_measures.append(m['name'])
        if m['type'] == 'classification':
            classification_measures.append(m['name'])

    targets = []

    for key in competition_config.keys():
        y = str(key).replace(' ', '')  # Key
        targets.append(y)

    # Fields for published message
    train_schema = StructType() \
        .add("Deadline", StringType(), False) \
        .add("Released", StringType(), False) \
        .add("competition_id", IntegerType(), False) \
        .add("rowID", IntegerType(), False)
    # Fields for prediction
    prediction_schema = StructType() \
        .add("rowID", IntegerType(), False) \
        .add("submitted_on", StringType(), False) \
        .add("prediction_competition_id", IntegerType(), False) \
        .add("user_id", IntegerType(), False)

    for target in targets:
        # Decide whether it is regression or classification
        for measure in competition_config[target.replace(" ", "")]:
            if measure in regression_measures:
                regression = True
                if target not in train_schema.fieldNames():
                    train_schema.add(target, StringType(), False)
                if "prediction_" + target not in prediction_schema.fieldNames():
                    prediction_schema.add("prediction_" + target, FloatType(), False)
            elif measure in classification_measures:
                regression = False
                if target not in train_schema.fieldNames():
                    train_schema.add(target, StringType(), False)
                if "prediction_" + target not in prediction_schema.fieldNames():
                    prediction_schema.add("prediction_" + target, StringType(), False)

    # Time window duration for watermarking
    window_duration = str(competition.predictions_time_interval) + " seconds"
    prediction_window_duration = str(12 * competition.predictions_time_interval) + " seconds"

    # Creating lists of column names which will be used later during calculations and transformations
    target_columns = []  # Target column names
    prediction_target_columns = []  # Target column names in prediction messages, prefixed with "prediction_"
    measure_columns = []  # Measure column names, for every target
    sum_columns = []  # Column names after aggregation, automatically named "sum(*)"
    columns_to_sum = ["latency", "num_submissions", "penalized", "total_num"]  # Column names to aggregate

    for target in targets:
        target_col = target
        prediction_target_col = "prediction_" + target.replace(" ", "")
        for measure in competition_config[target.replace(" ", "")]:
            # measure column example: "MAPE_Valeurs"
            measure_col = str(measure) + "_" + target.replace(" ", "")
            # sum column example: "sum(MAPE_Valeurs)"
            sum_col = "sum(" + measure_col + ")"
            measure_columns.append(measure_col)
            sum_columns.append(sum_col)
            if measure_col not in columns_to_sum:
                # e.g. columns_to_sum = ["latency", "num_submissions", "penalized", "total_num", "MAPE_Valeurs"]
                columns_to_sum.append(measure_col)

        target_columns.append(target_col)
        prediction_target_columns.append(prediction_target_col)

    checkpoint_locations = [
        "/tmp/" + competition.name.lower().replace(" ", "") + "prediction_checkpoint",
        "/tmp/" + competition.name.lower().replace(" ", "") + "training_checkpoint",
        "/tmp/" + competition.name.lower().replace(" ", "") + "measure_checkpoint"
    ]

    sparkEvaluator.evaluate(
        spark_context=spark_context,
        kafka_broker=kafka_server,
        competition=competition,
        competition_config=competition_config,
        window_duration=window_duration,
        prediction_window_duration=prediction_window_duration,
        train_schema=train_schema,
        prediction_schema=prediction_schema,
        columns_to_sum=columns_to_sum,
        checkpoints=checkpoint_locations,
        targets=targets)

    spark_context.stop()
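The sparkEvaluator.evaluate call itself is not shown here. As a rough illustration of how the schemas and window durations built above are typically applied, a Structured Streaming job would read the competition's data topic and parse the JSON payload with from_json. The snippet below is a sketch under that assumption, not the actual evaluator code; the cast on "Released" is only one way a watermark column could be attached.

from pyspark.sql.functions import col, from_json

# Illustrative only: read the competition data topic and parse it with train_schema
stream = spark_context.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_server) \
    .option("subscribe", competition.name.lower().replace(" ", "") + "data") \
    .load()

records = stream \
    .select(from_json(col("value").cast("string"), train_schema).alias("record")) \
    .select("record.*") \
    .withColumn("Released", col("Released").cast("timestamp")) \
    .withWatermark("Released", window_duration)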
Example #7
class QueryExecutor:
    def __init__(self, mongo_connection, models_dir):
        self.mongo_repository = MongoRepository(mongo_connection)
        self.mongo_repository.load_all_documents()
        nlp = NlpWrapper()
        self.feature_processor = FeatureProcessor(nlp)
        self.model_facade = ModelFacade(self.mongo_repository, models_dir)
        self.model_facade.load_models()
        self.model_facade.tf2wv.load_weighted_vector()

    def process_input(self, text):
        answers = self.retrieve_answers(text, threshold=0.78)
        scores = answers["scores"]
        token_map = answers["token_map"]
        tokens_not_found = answers["tokens_not_found"]
        self.print_documents_for(scores)
        self.print_similar_words(token_map)

    def retrieve_answers(self, text, threshold=0.78, topn=20):
        logging.info("Retrieving answers for {}".format(text))
        text = self.feature_processor(text)
        tokens = text.lower().split()
        logging.info("Tokens after preprocessing : {}".format(tokens))

        trigrams, scores = self.model_facade.similar_doc(tokens)
        if len(scores) > 0:
            token_map = self.model_facade.doc2vecFacade.retrieve_similar_words(
                trigrams, threshold=threshold, topn=topn)
            tokens_not_found = [word for word in trigrams if word not in token_map]
            return {
                "scores": scores,
                "token_map": token_map,
                "tokens_not_found": tokens_not_found
            }
        else:
            return {
                "scores": [],
                "token_map": {},
                "tokens_not_found": trigrams
            }

    def retrieve_similar_words(self, tokens, threshold=0.78, topn=30):
        token_map = self.model_facade.doc2vecFacade.retrieve_similar_words(
            tokens, threshold=threshold, topn=topn)
        return token_map

    def retrieve_documents(self, all_scores, page_id):
        all_documents = []
        scores = all_scores[PER_PAGE * page_id:PER_PAGE * (page_id + 1)]
        for id, score, tfidf_score, wv_score in scores:
            mongo_document = self.mongo_repository.get_preprocessed_question(
                id)
            lines_answer = mongo_document.split('\n')
            all_documents.append({
                "question": lines_answer[0],
                "answer": "\n".join(lines_answer[1:]),
                "score": score,
                "tfidf_score": tfidf_score,
                "wv_score": wv_score
            })
        return all_documents

    def print_documents_for(self, scores):
        for id, score in scores:
            mongo_document = self.mongo_repository.get_preprocessed_question(
                id)
            print("======" + str(id) + "============================")
            print(mongo_document)

    def print_similar_words(self, tokenmap):
        print(" ============= SIMILAR WORDS ======W===============")
        for key, values in tokenmap.items():
            print(key, ", ".join([v[0] for v in values]))
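As a usage illustration for the class above (not part of the original code), QueryExecutor can be wired up from the same config.yml keys used in the other examples; the query string here is just a placeholder.

import yaml

config = yaml.safe_load(open("config.yml"))
executor = QueryExecutor(config['mongo_connection'], config['models_dir'])

# Print matching documents and similar words for a free-text query
executor.process_input("example question text")

# Or work with the raw result dictionary
answers = executor.retrieve_answers("example question text", threshold=0.78, topn=20)
print(answers["scores"][:5], answers["tokens_not_found"])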
Example #8
class DataStreamerServicer:
    """
    Datastream Servicer handles the communication with users. Sends the datastream records and handles the predictions
    that are sent by users.

    It creates the topics. Starts Kafka producers and loads the data structures for communication with users, generated
    from .proto file.

    """
    def __init__(self, server, competition, competition_config):
        """
        Construct the DatastreamServicer Class. Once the communication is triggered by the user, it starts publishing
        messages to be sent to that user and at the same time activates module for receiving the predictions.

        :param server: Kafka server IP address
        :param competition: Competition object
        :param competition_config: Competition configuration object
        """
        self.server = server
        self.producer = ProducerToMongoSink(server)
        self.predictions_producer = ProducerToMongoSink(server)
        conf_producer = {'bootstrap.servers': server}
        self.kafka_producer = Producer(conf_producer)
        self.consumers_dict = {}

        self.repo = MongoRepository(_MONGO_HOST)
        self.competition = competition

        # Defining three topics: input (competition name), output/data (competition name + 'data')
        # and spark/predictions (competition name + 'predictions')
        self.input_topic = competition.name.lower().replace(" ", "")
        self.output_topic = competition.name.lower().replace(" ", "") + 'data'
        self.spark_topic = competition.name.lower().replace(" ", "") + 'predictions'

        try:
            # create data_object dictionary with following fields: competition_id, dataset
            data_object = {}
            data_object['competition_id'] = str(self.competition.competition_id)
            data_object['dataset'] = []
            # Insert document in mongo repository with db name: data, collection name: data
            self.repo.insert_document('data', 'data', data_object)
        except Exception:
            # Ignore insertion failures (e.g. the data document may already exist)
            pass

        # Import the right gRPC module
        # file_path: ../local/data/uploads/competition_generated_code/competition_name -> file_pb2.py
        pb2_file_path = os.path.join(_UPLOAD_REPO, _COMPETITION_GENERATED_CODE, self.competition.name, 'file_pb2.py')
        # grpc file path: ../local/data/uploads/competition_generated_code/competition_name -> file_pb2_grpc.py
        pb2_grpc_file_path = os.path.join(_UPLOAD_REPO, _COMPETITION_GENERATED_CODE, self.competition.name,
                                          'file_pb2_grpc.py')
        # import parent modules
        self.file_pb2 = imp.load_source('file_pb2', pb2_file_path)
        self.file_pb2_grpc = imp.load_source('file_pb2_grpc', pb2_grpc_file_path)
        # import classes
        self.DataStreamer = imp.load_source('file_pb2_grpc.DataStreamerServicer', pb2_grpc_file_path)
        self.Message = imp.load_source('file_pb2.Message', pb2_file_path)

        self.targets = []

        for key in competition_config.keys():
            y = str(key).replace(' ', '')  # Key
            self.targets.append(y)

        self.__bases__ = (self.DataStreamer,)  # ??

    def sendData(self, request_iterator, context):
        """
        After the user has initialized the communication with the server. It checks user's credentials and
        starts sending the data records.
        :param request_iterator: Sent by the user through gRPC/Protobuf protocol
        :param context: data
        :return:
        """
        _SUBSCRIPTION_REPO = SubscriptionRepository(_SQL_HOST, _SQL_DBNAME)
        _USER_REPO = UserRepository(_SQL_HOST, _SQL_DBNAME)
        _COMPETITION_REPO = CompetitionRepository(_SQL_HOST, _SQL_DBNAME)
        metadata = context.invocation_metadata()
        metadata = dict(metadata)
        token = metadata['authorization']

        user_id = metadata['user_id']
        competition_code = metadata['competition_id']

        user = _USER_REPO.get_user_by_email(user_id)
        _USER_REPO.cleanup()
        if user is None:
            context.set_code(grpc.StatusCode.PERMISSION_DENIED)
            context.set_details('You are not registered, please register on the website')
            return self.file_pb2.Message()

        competition = _COMPETITION_REPO.get_competition_by_code(competition_code)
        _COMPETITION_REPO.cleanup()
        if competition is None:
            context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
            context.set_details('Unknown competition, please refer to the website')
            return self.file_pb2.Message()

        # TODO : for Subscription Data
        subscription = _SUBSCRIPTION_REPO.get_subscription(competition.competition_id, user.user_id)
        _SUBSCRIPTION_REPO.cleanup()
        if subscription is None:
            # TODO : Should close connection
            context.set_code(grpc.StatusCode.PERMISSION_DENIED)
            context.set_details('You are not allowed to participate, please subscribe to the competition on website')
            return self.file_pb2.Message()

        # TODO : check secret token
        decoded_token = decode_subscription_token(token)
        if decoded_token is None:
            print('Wrong token')
            # TODO : should close connection
            context.set_code(grpc.StatusCode.UNAUTHENTICATED)
            context.set_details('Please check your authentication credentials, Wrong Token!')
            return self.file_pb2.Message()

        decoded_token = decoded_token[1]

        token_competition_id = decoded_token['competition_id']
        token_user_id = decoded_token['user_id']

        if int(token_competition_id) != int(competition.competition_id) or token_user_id != user_id:
            # TODO : should close channel
            print('Using wrong token for this competition')
            context.set_code(grpc.StatusCode.UNAUTHENTICATED)
            context.set_details('Please check your authentication token, the secret key does not match')
            return self.file_pb2.Message()

        end_date = self.competition.end_date + 5 * datetime.timedelta(seconds=self.competition.predictions_time_interval)

        if user_id in self.consumers_dict:
            consumer = self.consumers_dict[user_id]
        else:
            consumer = Consumer({'group.id': user_id, 'bootstrap.servers': self.server,
                                 'session.timeout.ms': competition.initial_training_time * 10000,
                                 'auto.offset.reset': 'latest'})  # 172.22.0.2:9092
            consumer.subscribe([self.input_topic])
            self.consumers_dict[user_id] = consumer

        try:
            stop_thread = False
            t = threading.Thread(target=receive_predictions,
                                 kwargs={'predictions': request_iterator,
                                         'competition_id': self.competition.competition_id, 'user_id': user.user_id,
                                         'end_date': end_date, 'kafka_producer': self.kafka_producer,
                                         'spark_topic': self.spark_topic, 'targets': self.targets,
                                         'stop': lambda: stop_thread})
            # use default name
            t.start()
        except Exception as e:
            print(str(e))

        while context.is_active():
            message = consumer.poll(timeout=0)
            if message is None:
                continue
            else:
                try:
                    values = orjson.loads(message.value())
                    json_string = json.dumps(values, default=json_util.default)
                    message = self.file_pb2.Message()
                    final_message = json_format.Parse(json_string, message, ignore_unknown_fields=True)
                    time.sleep(0.01)
                    if context.is_active():
                        yield message
                    else:
                        break
                except Exception:
                    # Ignore messages that fail to parse into the protobuf Message
                    pass

            if datetime.datetime.now() > end_date:
                break
        logging.debug("disconnect")
        stop_thread = True
        t.join()
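To make the servicer reachable, it still has to be registered with a gRPC server. The sketch below shows one plausible wiring, not the project's actual startup code: it assumes the generated file_pb2_grpc module exposes the usual add_DataStreamerServicer_to_server helper, that kafka_server, competition and competition_config are available, and the port is arbitrary.

from concurrent import futures
import grpc

servicer = DataStreamerServicer(kafka_server, competition, competition_config)
grpc_server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
# Register the servicer with the dynamically loaded generated module (assumed helper name)
servicer.file_pb2_grpc.add_DataStreamerServicer_to_server(servicer, grpc_server)
grpc_server.add_insecure_port('[::]:50051')   # hypothetical port
grpc_server.start()
grpc_server.wait_for_termination()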
Example #9
    parser.add_argument('--append', dest="append", action='store_true')
    parser.set_defaults(import_qa=False,
                        process=False,
                        print_files=False,
                        model=False,
                        append=False)

    args = parser.parse_args()
    config = yaml.safe_load(open("../config.yml"))
    model_dir = config['models_dir']
    output_dir = config['output_dir']
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    data_dirs = config['data_dirs']
    mongo_connection = config['mongo_connection']
    mongo_repository = MongoRepository(mongo_connection)

    process_documents(data_dirs,
                      output_dir,
                      do_import=args.import_qa,
                      do_process=args.process,
                      do_print_files=args.print_files,
                      append=args.append)

    if args.model:
        logging.info("Started creation of model...")
        model_facade = ModelFacade(mongo_repository, model_dir)
        model_facade.create_model()
        logging.info("Finished started creation of model...")
Example #10
import yaml
from language import NlpWrapper
from repository import MongoRepository
from feature_extract import FeatureProcessor

if __name__ == '__main__':

    config = yaml.safe_load(open("config.yml"))
    data_dir = config['data_dir']
    mongo_connection = config['mongo_connection']
    mongo_repository = MongoRepository(mongo_connection)
    nlp = NlpWrapper()
    feature_processor = FeatureProcessor(nlp)
    mongo_repository.process_questions(
        source_collection=mongo_repository.preprocessed_questions,
        target_collection=mongo_repository.processed_questions,
        processor=feature_processor)
Example #11
import yaml
from repository import MongoRepository

if __name__ == '__main__':
    config = yaml.safe_load(open("config.yml"))
    data_dir = config['data_dir']
    mongo_connection = config['mongo_connection']
    mongo_repository = MongoRepository(mongo_connection)
    mongo_repository.import_questions(data_dir)
Example #12
try:
    _SQL_DBNAME = os.environ['SQL_DBNAME']
except Exception:
    _SQL_DBNAME = config['SQL_DBNAME']

try:
    _MONGO_HOST = os.environ['MONGO_HOST']
except Exception:
    _MONGO_HOST = config['MONGO_HOST']

_COMPETITION_REPO = CompetitionRepository(_SQL_HOST, _SQL_DBNAME)
_DATASTREAM_REPO = DatastreamRepository(_SQL_HOST, _SQL_DBNAME)
_USER_REPO = UserRepository(_SQL_HOST, _SQL_DBNAME)
_SUBSCRIPTION_REPO = SubscriptionRepository(_SQL_HOST, _SQL_DBNAME)
_MONGO_REPO = MongoRepository(_MONGO_HOST)

# Standard evaluation measures, should be written in MongoDB if they don't exist there already
standard_measures = [{
    'id': 1,
    'name': 'MAPE',
    'type': 'regression'
}, {
    'id': 2,
    'name': 'MSE',
    'type': 'regression'
}, {
    'id': 3,
    'name': 'MAE',
    'type': 'regression'
}, {