Exemple #1
0
    def save_shape(self, output, dict_file=None):
        """
        Save the shape of the transformation model to a line-delimited
        json file (for further use in ruby code).

        :param output: path of the file to write; it is handed to
            utils.create_path() before being opened
        :param dict_file: optional argument forwarded to
            self.filter_words(); when given, only the filtered result is
            written, otherwise every word of the model is kept
        """

        self.check_model()

        # recover words vector from model
        # reformat it for a more practical use: dictionary word:word_weights
        words_vector = reformat_wv(self.model.wv)

        self.logger.info("Save {} shape: {}-weigths list for each word".format(
            self.name, self.vsize))

        utils.create_path(output)

        filtered = dict_file is not None
        if filtered:
            # remove unknown words when requested
            # NOTE(review): the result is iterated directly below, so it is
            # presumably a list of json-serializable entries -- confirm
            words_vector = self.filter_words(words_vector, dict_file)

        # Words are dumped with ensure_ascii=False, so the stream must be
        # able to encode any character: force UTF-8 instead of relying on
        # the locale's default encoding.
        with open(output, 'w', encoding='utf-8') as ostream:
            if filtered:
                for value in words_vector:
                    json.dump(value, ostream)
                    ostream.write('\n')
            else:
                # keep all words present in the model, one json object
                # ({word: weights}) per line
                for key, value in words_vector.items():
                    json.dump({key: list(value)}, ostream, ensure_ascii=False)
                    ostream.write('\n')

        self.logger.info(
            "Saved {} transformation's model shape under '{}'".format(
                self.name, output))
    def store_stats(self, output):
        """
        Dump self.stats as indented json into the 'output' file.
        """

        message = "Save statistics on predictions to '{}'".format(output)
        self.logger.info(message)

        utils.create_path(output)
        with open(output, 'w') as ostream:
            # serialize first, then write the whole document in one call
            serialized = json.dumps(self.stats, indent=2)
            ostream.write(serialized)
Exemple #3
0
    def save(self, output):
        """
        Persist the transformation model under 'output' by delegating to
        the model's own save() method (so that it can be loaded again).
        """

        # refuse to go further when the model is not usable
        self.check_model()

        # prepare the destination, then let the model write itself
        utils.create_path(output)
        model = self.model
        model.save(output)
Exemple #4
0
def load_and_save_as_json(input_file, output_file):
    """
    Load a dictionary with Dictionary.load() and save its reversed
    mapping as json.

    The loaded object maps each key to a value (presumably word id to
    word -- confirm against Dictionary); the written json object maps
    each value back to its key.

    :param input_file: path given to Dictionary.load()
    :param output_file: path of the json file to write
    """
    cdict = Dictionary.load(input_file)

    # Keys are visited in sorted order, so if two keys share the same
    # value the highest key wins.
    rdict = {cdict[wid]: wid for wid in sorted(cdict.keys())}

    utils.create_path(output_file)
    # ensure_ascii=False emits raw non-ASCII characters: write the file as
    # UTF-8 explicitly rather than with the locale's default encoding.
    with open(output_file, 'w', encoding='utf-8') as ostream:
        json.dump(rdict, ostream, indent=2, sort_keys=True, ensure_ascii=False)
Exemple #5
0
    def __init__(self, output_file):
        """
        Initialize an empty corpus written to the 'output_file' file.

        Raises ConfigError when 'output_file' is not a string.
        """

        super().__init__()

        is_path_string = isinstance(output_file, str)
        if not is_path_string:
            message = "Given parameter {} is not a String".format(output_file)
            raise ConfigError(message)

        self.logger.info('Initialized empty corpus')
        announcement = "Save new corpus in {} file".format(output_file)
        self.logger.info(announcement)

        utils.create_path(output_file)

        # the stream stays open after construction; 'size' starts at zero
        self.ofstream = open(output_file, 'w')
        self.size = 0
Exemple #6
0
    def save(self, output, n=50):
        """
        Save the top 'n' words for each topic.

        The words are extracted from the double-quoted parts of each topic
        description returned by self.model.show_topics(); topics without
        any quoted word are skipped. The result is written to 'output' as
        a json list of word lists.

        :param output: path of the json file to write
        :param n: number of words requested for each topic
        """

        self.logger.info("Display the top {} words for each topic".format(n))

        topics = self.model.show_topics(-1, n, log=False)

        # each entry is a pair whose second element is the topic description
        quoted_words = (re.findall(r'"(.*?)"', topic) for _, topic in topics)
        wtopics = [words for words in quoted_words if words]

        utils.create_path(output)
        # ensure_ascii=False emits raw non-ASCII characters: write the file
        # as UTF-8 explicitly rather than with the locale's default encoding.
        with open(output, 'w', encoding='utf-8') as ostream:
            json.dump(wtopics, ostream, ensure_ascii=False, indent=2)

        self.logger.info("{} topics saved in '{}' json file'".format(
            len(wtopics), output))
Exemple #7
0
    def save_shape(self, output):
        """
        Save the shape of the transformation model to json file
        (for further use in ruby code)

        Depending on self.name:
          - 'TFIDF': a single json list, indexed by word id, holding the
            weight found in self.model.idfs (0.0 for ids without weight)
          - 'LSI': one json list per line, line i holding the row
            self.model.projection.u[i] converted to floats
          - anything else: nothing is written, a warning is logged

        :param output: path of the file to write
        """

        self.check_model()
        utils.create_path(output)

        if self.name == 'TFIDF':
            desc = 'list with word_idf weight for each word id'

            idfs = self.model.idfs
            # Size the list from the highest word id rather than from the
            # number of entries, so that ids with gaps do not overflow it.
            highest_id = max((int(wid) for wid in idfs), default=-1)
            model_array = [0.0] * max(len(idfs), highest_id + 1)
            for wid, weight in idfs.items():
                model_array[int(wid)] = float(weight)

            with open(output, 'w') as ostream:
                json.dump(model_array, ostream, indent=2)
        elif self.name == 'LSI':
            desc = "{}D array for each word id".format(self.ntopics)

            # Build every row before opening the file, so that a failure
            # while reading the projection does not truncate 'output'.
            model_array = [
                [float(x) for x in self.model.projection.u[wid]]
                for wid in range(self.model.num_terms)
            ]

            with open(output, 'w') as ostream:
                for weights in model_array:
                    json.dump(weights, ostream)
                    ostream.write('\n')
        else:
            self.logger.warning('Unknown demand. Probably still WIP')
            return

        self.logger.info("Saved {}'s model shape ({}) under '{}'".format(
            self.name, desc, output))
Exemple #8
0
    def store_prediction(self, input_file, output_file):
        """
        Test the classifier on 'untagged' documents.
        Store prediction category and prediction probability in file.

        Every document of 'input_file' holding a 'features' entry is
        classified; when the prediction is a dict with both 'category' and
        'probas', they are stored in the document under 'season' and
        'season_prob' and the document is added to 'output_file'.

        Nothing is done when self.prediction_checkups() fails.

        :param input_file: path of the corpus to read
        :param output_file: path of the corpus to write
        :raises CaughtException: when anything fails while classifying or
            storing the documents
        """

        if not self.prediction_checkups():
            return

        utils.check_file_readable(input_file)
        utils.create_path(output_file)

        sc = StreamCorpus(input_file)

        # Bound before the 'try' so that the 'finally' clause can tell
        # whether the output corpus was actually opened.
        pc = None

        try:
            pc = PushCorpus(output_file)

            for doc in sc:
                if 'features' not in doc:
                    continue

                prediction = self.classify_doc(doc['features'])

                if isinstance(prediction, dict) and \
                    'category' in prediction and \
                    'probas' in prediction:

                    doc['season'] = prediction['category']
                    doc['season_prob'] = prediction['probas']
                    pc.add(doc)
        except Exception as e:
            # keep the original exception as explicit cause
            raise CaughtException(
                "Exception encountered when storing classified documents: {}".
                format(e)) from e
        else:
            self.logger.info("Stored {} documents to file".format(pc.size))
        finally:
            if pc is not None:
                pc.close_stream()
Exemple #9
0
def save(output, corpus, progress_cnt=1000):
    """Serialize 'corpus' into the 'output' file with MmCorpus.serialize"""

    # make sure the destination can be written before serializing
    utils.create_path(output)

    options = {'progress_cnt': progress_cnt}
    MmCorpus.serialize(output, corpus, **options)
Exemple #10
0
    def store_stats(self, roc_file, plot_file):
        """
        Compute the sklearn metrics on given predicted documents,
        with respect to each category tag.
        Plot the ROC curve (FP/TP) on each category tag.

        For every category of self.categories, all documents of
        self.data_files holding 'category', 'season' and 'season_prob' are
        turned into binary true/predicted labels (1 when the category is
        listed, 0 otherwise) plus the probability found in
        doc['season_prob'][category]. Precision, recall, accuracy, AUC,
        confusion matrix and classification report are logged.

        :param roc_file: path of the text file receiving, for each
            category, a line with its name followed by one
            'threshold fpr tpr' line per ROC point
        :param plot_file: path of the image receiving the ROC curves
        """

        # length of longest category name
        nmax = max(len(cat) for cat in self.categories)

        # stats header inside generated plots
        header = "{} {:>6} {:>6} {:>6} {:>6}".format(' ' * nmax, 'P', 'R',
                                                     'Acc', 'AUC')

        utils.create_path(roc_file)
        utils.create_path(plot_file)

        roc = {}
        # pylab.subplots() returns (figure, axes); only the axes are drawn on
        _, axes = pylab.subplots()

        for index, category in enumerate(self.categories):
            true_labels = []
            pred_labels = []
            pred_probas = []

            for input_file in self.data_files:
                sc = StreamCorpus(input_file)

                for doc in sc:
                    if 'category' in doc and 'season' in doc \
                        and 'season_prob' in doc:

                        # convert real categories to list
                        real_categories = doc['category']
                        if isinstance(real_categories, str):
                            real_categories = [real_categories]

                        # convert predicted categories to list
                        pred_categories = doc['season']
                        if isinstance(pred_categories, str):
                            pred_categories = [pred_categories]

                        # check if 'category'
                        # present in the list of real categories
                        real_cat = 1 if category in real_categories else 0

                        # check if 'category'
                        # present in the list of predicted categories
                        pred_cat = 1 if category in pred_categories else 0

                        pred_proba = doc['season_prob'][category]

                        true_labels.append(real_cat)
                        pred_labels.append(pred_cat)
                        pred_probas.append(pred_proba)

            prec = sklearn.metrics.precision_score(true_labels, pred_labels)
            recall = sklearn.metrics.recall_score(true_labels, pred_labels)
            acc = sklearn.metrics.accuracy_score(true_labels, pred_labels)

            cm = sklearn.metrics.confusion_matrix(true_labels, pred_labels)
            rep = sklearn.metrics.classification_report(
                true_labels, pred_labels)
            auc = sklearn.metrics.roc_auc_score(true_labels, pred_probas)

            prec = percentage(prec)
            recall = percentage(recall)
            acc = percentage(acc)
            auc = percentage(auc)

            self.logger.info("Category:  {}".format(category))
            self.logger.info("Precision: {}%".format(prec))
            self.logger.info("Recall:    {}%".format(recall))
            self.logger.info("Accuracy:  {}%".format(acc))
            self.logger.info("AUC:       {}%".format(auc))
            self.logger.info("Confusion-matrix:\n{}".format(cm))
            self.logger.info("Report:\n{}".format(rep))

            # pos_label must be passed by keyword: recent scikit-learn
            # releases reject it as a third positional argument
            fpr, tpr, thr = sklearn.metrics.roc_curve(true_labels, pred_probas,
                                                      pos_label=1)

            # convert numpy array into array
            fpr = [float(x) for x in fpr]
            tpr = [float(x) for x in tpr]

            # store roc values
            roc[category] = [[threshold, fpr[tindex], tpr[tindex]]
                             for tindex, threshold in enumerate(thr)]

            # plot current ROC curve
            axes.plot(fpr, tpr, label=category)

            # annotate current precision, recall, accuracy and AUC scores
            # (one row per category, stacked below the header)
            cdisplay = category.rjust(nmax)
            axes.annotate(header, (0.4, 0.67), size=9, family='monospace')
            axes.annotate("{}  {}  {}  {}  {}".format(cdisplay, prec, recall,
                                                      acc, auc),
                          (0.4, 0.6 - 0.07 * index),
                          size=9,
                          family='monospace')

        axes.legend(labels=self.categories)

        pylab.title('Classifier performance (ROC curve)')
        pylab.xlabel('False positive rate')
        pylab.ylabel('True positive rate')

        pylab.savefig(plot_file)
        pylab.close()
        self.logger.info("Plot saved under {} file".format(plot_file))

        # save ROC values to file
        with open(roc_file, 'w') as of:
            for category in roc:
                of.write(category + '\n')
                for entry in roc[category]:
                    of.write(' '.join(str(x) for x in entry) + '\n')

        self.logger.info("ROC values saved under {} file".format(roc_file))