Example #1
    def set_model(self):
        # load the pre-trained, normalized word2vec model from before adjective clustering
        self.org_model = gensim.models.KeyedVectors.load(
            self.pre_cluster_we_file,
            mmap='r').wv  # mmap the large matrix as read-only
        self.org_model.syn0norm = self.org_model.syn0

        # load the pre-trained, normalized word2vec model
        self.model = gensim.models.KeyedVectors.load(
            self.we_file, mmap='r').wv  # mmap the large matrix as read-only
        self.model.syn0norm = self.model.syn0

        # load adjectives with multi-sense representation
        adj_file_names = [
            f for f in os.listdir(adj_clusters_path)
            if os.path.isfile(os.path.join(adj_clusters_path, f))
        ]
        self.multi_sense_adj = dict.fromkeys(
            [os.path.splitext(f)[0].split('_')[0] for f in adj_file_names])
        logger.info("Total multi sense adjectives = [{}]".format(
            len(self.multi_sense_adj)))

        # generate list of all the words with word vectors
        self.vocab = dict(self.model.vocab, **self.multi_sense_adj)
        logger.info("VOCAB SIZE =[{}]".format(len(self.vocab)))
Example #2
    def get(self):

        try:
            sentence = request.args.get('avis', None)

        except Exception:
            abort(http_codes.SERVER_ERROR,
                  "Error while loading the model")

        logger.info("Analyse de {sentence}".format(sentence=sentence))

        opinions, summary = sentiment_analysis(model_tag, model_sa, flags,
                                               source_count, source_word2idx,
                                               sentence, fr_nlp, wiki_model)

        response = {
            'aspects': [{
                'target': opinion[0],
                'category': opinion[1],
                'from': opinion[2],
                'to': opinion[3],
                'sentiment': opinion[4],
                'exemple': opinion[5]
            } for opinion in opinions],
            'summary': [{
                'category': item[0],
                'sentiment': item[1]
            } for item in summary],
        }

        logger.info(response)

        return _success(response)
Example #3
    def online_training(self, epochs=EPHOCS):
        running_loss = 0.0
        y_train = self.data.y_train
        x_train = self.data.x_train
        indices = list(range(y_train.shape[0]))
        for epoch in range(epochs):
            logger.info("Epoch: {}".format(epoch))
            random.shuffle(indices)
            for i in indices:

                x = Variable(torch.Tensor(x_train[[i]]))
                y = Variable(torch.Tensor(y_train[[i]]), requires_grad=False)

                # x_train[[i]] / y_train[[i]] already keep a leading batch
                # dimension of size 1, so each sample reaches the model as a
                # one-element batch (note: unsqueeze returns a new tensor, so
                # calling it without reassigning the result has no effect)

                # Forward pass: Compute predicted y by passing x to the model
                y_pred = self.nn_model(x)

                # Compute and print loss
                loss = self.criterion(y_pred, y)
                # print(epoch, loss.data[0])

                # Zero gradients, perform a backward pass, and update the weights.
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                # print statistics
                # running_loss += loss.data[0]
                # if i % 100 == 99:  #
                #     print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 100))
                #     running_loss = 0.0
        logger.info("Done online training")
Example #4
def main(args):
    # global parser, args, dev_triplets, test_triplets, we_wrapper, data_handler, model
    parser = argparse.ArgumentParser(description='Train word2vec model.')
    parser.add_argument('dev_file', help='dev input file')
    parser.add_argument('test_file', help='test input file')
    parser.add_argument('we_file', help='word embeddings normed model file')
    # parser.add_argument('output_folder', help='path to the output folder')
    parser.add_argument(
        'org_we_file',
        help='path to the original we model file - before adjectives clustering'
    )
    parser.add_argument('-s',
                        '--supervised',
                        default=False,
                        action='store_true',
                        help='train and evaluate also the supervised model')
    args = parser.parse_args(args)
    dev_triplets = read_HeiPLAS_data(args.dev_file)
    test_triplets = read_HeiPLAS_data(args.test_file)
    # load pre-trained, normalized word2vec
    we_wrapper = MultiSenseWE(args.org_we_file, args.we_file)
    we_wrapper.set_model()
    data_handler = DataHandler(we_wrapper)
    data_handler.run(dev_triplets, test_triplets)
    if args.supervised:
        model = SupervisedModel(data_handler)
        model.run()
    model = UnsupervisedModel(data_handler)
    model.run()
    logger.info("Done!!!!!")
Example #5
    def run(self, dev_triplets, test_triplets):
        logger.info("filter training samples")
        self.train, self.x_train, self.y_train = self.filter_data(dev_triplets)
        logger.info("filter test samples")
        self.test, self.x_test, self.y_test = self.filter_data(test_triplets)

        dev_attributes = set([triplet.attr for triplet in dev_triplets if triplet.attr in self.we_wrapper.vocab])
        test_attributes = set([triplet.attr for triplet in test_triplets if triplet.attr in self.we_wrapper.vocab])
        self.attributes = dev_attributes.union(test_attributes)

        self.attr_vecs = {attr: self.we_wrapper.word_vec(attr) for attr in self.attributes}
Example #6
def _get_settings_from_local_file(file_name='settings.json'):
    logger.info('Loading settings from local file...')

    file_path = Path(__file__).parent / file_name
    loaded_settings = {}
    if file_path.exists() and file_path.is_file():
        with file_path.open() as file:
            loaded_settings.update(json.load(file))
    else:
        logger.error(f'Failed loading {file_path.absolute()} file')

    return loaded_settings['parameters']
Example #7
def _failure(exception, http_code=http_codes.SERVER_ERROR):
    try:
        exn = traceback.format_exc()
        logger.info("EXCEPTION: {}".format(exn))
    except Exception:
        logger.info("EXCEPTION: {}".format(exception))
    try:
        data, code = exception.to_tuple()
        return make_reponse(data, code)
    except Exception:
        try:
            data = exception.to_dict()
            return make_reponse(data, exception.http)
        except Exception:
            return make_reponse(None, http_code)
Example #8
def read():
    # Assume the file is located in commons
    # Assume that the file {root-project}/conf/api-conf.yml exists
    #
    # {root-project}
    # - conf/
    #   - api-conf.yml
    #   - ...
    # - commons/
    #   - configuration.py
    #   - ...
    SCRIPT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
    path = os.path.join(SCRIPT_DIR, 'conf/api-conf.yml')
    logger.info("Using conf file: {}".format(path))
    with open(path) as conf_file:
        return yaml.safe_load(conf_file)
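Example #17 below reads conf['host'], conf['port'] and conf['log']['level'] from the dict returned here, so the api-conf.yml presumably carries at least those keys. A hypothetical minimal file and usage follow, for illustration only; the conf = read() binding is assumed, not shown in the source.

# Hypothetical api-conf.yml contents (illustration only, not from the source):
#   host: 0.0.0.0
#   port: 5000
#   log:
#     level: DEBUG
conf = read()  # assumed binding; Example #17 only shows the resulting conf dict
logger.info("API configured for {}:{}".format(conf['host'], conf['port']))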
Example #9
    def get(self):
        """
            Retourne un Qui/Quoi/Ou/Proximité utilisable dans le cadre du portail PJ
        """

        if not entity_models:
            abort(http_codes.BAD_REQUEST,
                  "No entity detection model has been loaded")

        phrase, entity_model_name, redressage_active = None, None, True
        try:
            phrase = request.args.get('phrase', None)
            redressage = request.args.get('redressage', None)
            if redressage and redressage.lower() == "false":
                redressage_active = False

            # next(iter(dictionary)) => returns the 'first' key of a dictionary
            entity_model_name = request.args.get('entity_model',
                                                 next(iter(entity_models)))

        except Exception:
            abort(http_codes.SERVER_ERROR,
                  "Error while loading the model")

        logger.info("Analyse de {sentence} avec le modèle {entity}".format(
            sentence=phrase, entity=entity_model_name))

        entity_model = None
        try:
            # use item access so a missing model name actually raises KeyError
            entity_model = entity_models[entity_model_name]
        except KeyError:
            abort(
                http_codes.BAD_REQUEST,
                "The entity prediction model {} does not exist.".format(
                    entity_model_name))

        entity_prediction = entity_model.predict(phrase, redressage_active)
        prediction = entity_prediction.to_dict()
        response = {
            'qui': prediction["qui"],
            'quoi': prediction["quoi"],
            'ou': prediction["ou"],
            'proximite': prediction["proximite"],
            'analyse': prediction["analyse"]
        }
        logger.info(response)

        return _success(response)
Example #10
    def filter_data(self, triplets):
        logger.info("before filtering missing words, samples: " + str(len(triplets)))
        filtered_data = [samp for samp in triplets
                         if samp.adj in self.we_wrapper.vocab
                         and samp.noun in self.we_wrapper.vocab
                         and samp.attr in self.we_wrapper.vocab]
        logger.info("after filtering missing words, samples: " + str(len(filtered_data)))
        x_matrix = np.array([self.we_wrapper.adj_vec_by_context(samp.adj, samp.noun) for samp in filtered_data])
        y_matrix = np.array([self.we_wrapper.word_vec(samp.attr) for samp in filtered_data])
        logger.info("x shape: " + str(x_matrix.shape))
        logger.info("y shape: " + str(y_matrix.shape))

        return filtered_data, x_matrix, y_matrix
Example #11
def main(args):
    # global start_time, parser, args, free_cores, cores, uptime_data, load_avg, used_cores, sentences, model_name, model_path, normed_model_path, model, end_time, hours, rem, minutes, seconds
    start_time = time.time()
    # Set up command line parameters.
    parser = argparse.ArgumentParser(description='Train word2vec model.')
    parser.add_argument(
        'input_file', help='input file path for the word embeddings training')
    args = parser.parse_args(args)
    free_cores = 1
    if PARALLEL_FLAG:
        cores = multiprocessing.cpu_count()
        uptime_data = os.popen("uptime").read().split()
        load_avg = float(
            uptime_data[-3].strip(',')
        )  # take the 1-minute load average (the third value from the end)
        used_cores = math.ceil(load_avg / cores)
        free_cores = min(cores - used_cores, MAX_CORES_TO_USE)
        logger.info("running with {} threads".format(free_cores))
    sentences = LineSentence(args.input_file)
    model_name = args.input_file
    model_path = get_we_model_full_path(model_name)
    normed_model_path = get_normed_we_full_path(model_name)
    logger.info("Start training word2vec on file: {}".format(args.input_file))
    model = Word2Vec(sentences,
                     size=DIMENSION,
                     alpha=LEARNING_RATE,
                     window=CONTEXT_WINDOW,
                     workers=free_cores,
                     iter=EPOCHS)
    logger.info("done word2ve training")
    logger.info("saving model to: {}".format(model_path))
    model.save(model_path)
    logger.info("saving normalized model to: {}".format(normed_model_path))
    model.init_sims(replace=True)
    model.save(normed_model_path)
    end_time = time.time()
    hours, rem = divmod(end_time - start_time, 3600)
    minutes, seconds = divmod(rem, 60)
    logging.info("total training time{:0>2}:{:0>2}:{:05.2f}".format(
        int(hours), int(minutes), seconds))
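The normalized model saved above is the kind of artifact that Example #1 later memory-maps read-only. A minimal reload sketch follows, assuming the same normed_model_path and the pre-4.0 gensim API used throughout these examples:

# Minimal reload sketch (assumed usage, not part of the original script):
# memory-map the saved, normalized model as read-only.
from gensim.models import Word2Vec

reloaded = Word2Vec.load(normed_model_path, mmap='r')
logger.info("reloaded vocab size: {}".format(len(reloaded.wv.vocab)))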
Example #12
def main(args):
    # global start_time, parser, args, runner, stop
    start_time = timeit.default_timer()
    parser = argparse.ArgumentParser(
        description='Generate adjectives senses by nouns clustering')
    parser.add_argument('sentences_input_file',
                        help='input file path - sentences format')
    parser.add_argument('word_embeddings_file',
                        help='word embeddings model file path')
    # parser.add_argument('pickled_adj_folder',help='word embeddings model file path')
    parser.add_argument(
        'sentences_output_file',
        help='output file path for the sentences generated for WE training '
             'after adjective clustering and labeling')
    parser.add_argument(
        '-ss',
        '--only_sub_set',
        default=False,
        action='store_true',
        help='analyze only subset of adjectives from config file')
    parser.add_argument('-p',
                        '--outliers_clustering_by_patterns',
                        default=False,
                        action='store_true',
                        help='cluster dbscan outliers using patterns')
    args = parser.parse_args(args)
    # logging.basicConfig(filename='adj_sense_extractor.log', level=logging.DEBUG)
    logger.info('start')
    logger.info("loading word embedding model from {}".format(
        args.word_embeddings_file))
    we_model.load_model(args.word_embeddings_file)
    runner = AdjSensesClusteringRunner(args.sentences_input_file,
                                       args.sentences_output_file,
                                       args.only_sub_set,
                                       args.outliers_clustering_by_patterns)
    runner.run()
    stop = timeit.default_timer()
    print "DONE!"
    print "Total running time {}".format(stop - start_time)
Example #13
    file_path = Path(__file__).parent / file_name
    loaded_settings = {}
    if file_path.exists() and file_path.is_file():
        with file_path.open() as file:
            loaded_settings.update(json.load(file))
    else:
        logger.error(f'Failed loading {file_path.absolute()} file')

    return loaded_settings['parameters']


# Load all the settings from the local settings file
settings = _get_settings_from_local_file()

# Log commit_hash and the stage parameters
commit_hash = settings.get('commit_hash')
if commit_hash:
    logger.info(
        f'Running commit: {commit_hash} and loaded params for stage: {os.environ.get("stage")}'
    )
else:
    logger.info('No commit hash on settings')

# Add environment variables coming from serverless

# Service ID
settings['SERVICE'] = os.environ.get('serviceId')

# Stage
settings['STAGE'] = os.environ.get('stage')
Example #14
    def test(self):
        we_wrapper = self.data.we_wrapper
        weights = self.nn_model.linear_1.weight.data.numpy()

        x_test = self.data.x_test
        y_test = self.data.y_test
        attr_vecs = self.data.attr_vecs

        print "attr_vecs size = {}".format(len(attr_vecs))
        print "x test shape: " + str(x_test.shape)
        print "y_test: " + str(y_test.shape)
        print "weights shape: {}".format(weights.shape)

        x_test_matrix = np.dot(weights, np.transpose(x_test))
        print "x_test matrix shape = {}".format(x_test_matrix.shape)

        # check P@1 and P@5 accuracy
        correct = 0.0
        top_5_correct = 0.0
        correct_pred = []
        false_pred = []
        results = []
        for i in xrange(0, x_test_matrix.shape[1]):
            y_pred = x_test_matrix[:, [i]]

            # calculate cosine similarity for normalized vectors
            cosine_sims = {
                attr: np.dot(y_pred.T, attr_vecs[attr])
                for attr in attr_vecs.keys()
            }
            sorted_sims = dict(
                sorted(cosine_sims.iteritems(),
                       key=operator.itemgetter(1),
                       reverse=True)[:K])
            most_sim_attr = max(sorted_sims, key=lambda i: sorted_sims[i])
            if most_sim_attr == self.data.test[i].attr:
                correct += 1
                correct_pred.append(self.data.test[i])
            else:
                false_pred.append((self.data.test[i], most_sim_attr))
            if self.data.test[i].attr in sorted_sims.keys():
                top_5_correct += 1
            results.append((self.data.test[i], most_sim_attr))
        logger.info("supervised results")
        logger.info("correct: {} from total: {}. Accuracy: {}".format(
            correct, y_test.shape[0], correct / y_test.shape[0]))
        logger.info("top 5 correct: {} from total: {}. Accuracy: {}".format(
            top_5_correct, y_test.shape[0], top_5_correct / y_test.shape[0]))

        with open(correct_predictions_file, 'w') as file:
            for item in correct_pred:
                # output = ' '.join([str(item), item[1].upper()])
                print >> file, item

        with open(false_prediction_file, 'w') as file:
            for item in false_pred:
                output = ' '.join([str(item[0]), item[1].upper()])
                print >> file, output

        with open(test_results, 'w') as file:
            for item in results:
                # output =  ' '.join([item[1].upper(), item[0]].adj, item[0].noun)
                print >> file, str(item[0])
Example #15
    def test(self):

        x_test = self.data.x_test
        y_test = self.data.y_test
        attr_vecs = self.data.attr_vecs

        logger.info("attr_vecs size = {}".format(len(attr_vecs)))
        logger.info("x test shape: " + str(x_test.shape))
        logger.info("y_test: " + str(y_test.shape))

        correct = 0.0
        correct_in_K = 0.0
        predictions = []
        unique_attributes = attr_vecs.keys()
        attr_vecs_ordered = np.array([
            self.data.we_wrapper.word_vec(attr) for attr in unique_attributes
        ]).squeeze()
        for test in self.data.test:

            adj_label = self.data.we_wrapper.get_adj_name(test.adj, test.noun)
            adj_vec = self.data.we_wrapper.adj_vec_by_context(
                test.adj, test.noun)
            # adj_vec = we_wrapper.org_model.word_vec(test.adj)
            sim = np.dot(adj_vec, attr_vecs_ordered.T)
            all_attr_idx = sim.argsort()[-244:][::-1]
            attr_all_preds = [unique_attributes[i] for i in all_attr_idx]
            # attr_ids = sim.argsort()[-K:][::-1]
            # adj_preds = [unique_attributes[i] for i in attr_ids]
            adj_preds = attr_all_preds[:K]
            correct_pred_idx = attr_all_preds.index(test.attr)
            predictions.append((AdjNounAttribute(test.adj, test.noun,
                                                 test.attr), adj_preds[0],
                                adj_label, correct_pred_idx))
            if adj_preds[0] == test.attr:
                correct += 1
            if test.attr in adj_preds:
                correct_in_K += 1

        with open(unsupervised_results, 'w') as file:
            for item in predictions:
                string = ' '.join(
                    [str(item[0]), item[1].upper(), item[2],
                     str(item[3])])
                print >> file, string

        logger.info("----unsupervised results-----")
        logger.info("correct = {}, total: {}, accuracy: {}".format(
            correct, len(self.data.test), correct / len(self.data.test)))
        logger.info("correct_in_{} = {}, total: {}, accuracy: {}".format(
            K, correct_in_K, len(self.data.test),
            correct_in_K / len(self.data.test)))
Example #16
def access_log():
    logger.info("{0} {1}".format(request.method, request.path))
Example #17
            return make_reponse(None, http_code)


def make_reponse(p_object=None, status_code=200):
    """
        Fabrique un objet Response à partir d'un p_object et d'un status code
    """
    if p_object is None and status_code == 404:
        p_object = {
            "status": {
                "status_content": [{
                    "code": "404 - Not Found",
                    "message": "Resource not found"
                }]
            }
        }

    json_response = jsonify(p_object)
    json_response.status_code = status_code
    json_response.content_type = 'application/json;charset=utf-8'
    json_response.headers['Cache-Control'] = 'max-age=3600'
    return json_response


if __name__ == "__main__":
    # Run http REST stack
    logger.info("Run api on {}:{}".format(conf['host'], conf['port']))
    app.run(host=conf['host'],
            port=int(conf['port']),
            debug=conf['log']['level'] == "DEBUG")
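Examples #2 and #9 return their payloads through a _success helper that is not shown in this listing. A plausible minimal version built on make_reponse, assuming an http_codes.OK constant defined alongside the SERVER_ERROR and BAD_REQUEST constants used above:

# Assumed helper (not shown in the source): wrap a successful payload.
def _success(p_object):
    return make_reponse(p_object, http_codes.OK)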