def save_shape(self, output, dict_file=None):
    """
    Save the shape of the transformation model to json file
    (for further use in Ruby code)
    """
    self.check_model()
    # recover words vector from model and reformat it for a more
    # practical use: dictionary word -> word_weights
    words_vector = reformat_wv(self.model.wv)
    self.logger.info("Save {} shape: {}-weights list for each word".format(
        self.name, self.vsize))
    utils.create_path(output)
    if dict_file is not None:
        # remove unknown words when requested
        words_vector = self.filter_words(words_vector, dict_file)
        with open(output, 'w') as ostream:
            for value in words_vector:
                json.dump(value, ostream)
                ostream.write('\n')
    else:
        # keep all words present in the model
        with open(output, 'w') as ostream:
            for key, value in words_vector.items():
                json.dump({key: list(value)}, ostream, ensure_ascii=False)
                ostream.write('\n')
    self.logger.info(
        "Saved {} transformation's model shape under '{}'".format(
            self.name, output))
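# Note: reformat_wv is referenced above but not defined in this section. A
# minimal sketch of what it is expected to return, assuming a gensim 4.x
# KeyedVectors instance, is given below; the body is an illustrative
# assumption, not the project's actual helper.
def reformat_wv(wv):
    """Turn a KeyedVectors object into a plain {word: weight-list} dict."""
    # wv.key_to_index maps each word to its row in the embedding matrix;
    # wv[word] returns the corresponding numpy vector
    return {word: [float(x) for x in wv[word]] for word in wv.key_to_index}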
def store_stats(self, output):
    """Store statistics on predictions"""
    self.logger.info(
        "Save statistics on predictions to '{}'".format(output))
    utils.create_path(output)
    with open(output, 'w') as ostream:
        ostream.write(json.dumps(self.stats, indent=2))
def save(self, output):
    """
    Save the transformation model to binary file
    (so it can be loaded back later)
    """
    self.check_model()
    utils.create_path(output)
    self.model.save(output)
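# utils.create_path is called before every file write in this section, but its
# definition is not shown here. Given how it is used, a plausible minimal
# implementation (an assumption, not the project's code) would be:
import os

def create_path(filepath):
    """Make sure the parent directory of 'filepath' exists before writing."""
    dirname = os.path.dirname(filepath)
    if dirname:
        os.makedirs(dirname, exist_ok=True)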
def load_and_save_as_json(input_file, output_file):
    """Load a gensim Dictionary and save its word -> id mapping as JSON"""
    cdict = Dictionary.load(input_file)
    rdict = {}
    for wid in sorted(cdict.keys()):
        rdict[cdict[wid]] = wid
    utils.create_path(output_file)
    with open(output_file, 'w') as ostream:
        json.dump(rdict, ostream, indent=2, sort_keys=True,
                  ensure_ascii=False)
def __init__(self, output_file):
    """Initialize with the output filename"""
    super().__init__()
    if not isinstance(output_file, str):
        raise ConfigError(
            "Given parameter {} is not a string".format(output_file))
    self.logger.info('Initialized empty corpus')
    self.logger.info("Save new corpus in {} file".format(output_file))
    utils.create_path(output_file)
    self.ofstream = open(output_file, 'w')
    self.size = 0
def save(self, output, n=50):
    """Save the top 'n' words for each topic"""
    self.logger.info("Save the top {} words for each topic".format(n))
    topics = self.model.show_topics(-1, n, log=False)
    wtopics = []
    for _, topic in topics:
        words = re.findall(r'"(.*?)"', topic)
        if words:
            wtopics.append(words)
    utils.create_path(output)
    with open(output, 'w') as ostream:
        json.dump(wtopics, ostream, ensure_ascii=False, indent=2)
    self.logger.info("{} topics saved in '{}' json file".format(
        len(wtopics), output))
def save_shape(self, output):
    """
    Save the shape of the transformation model to json file
    (for further use in Ruby code)
    """
    self.check_model()
    utils.create_path(output)
    desc = ''
    saved = False
    if self.name == 'TFIDF':
        desc = 'list with word_idf weight for each word id'
        model_array = [0.0] * len(self.model.idfs)
        for wid, weight in self.model.idfs.items():
            model_array[int(wid)] = float(weight)
        with open(output, 'w') as ostream:
            json.dump(model_array, ostream, indent=2)
        saved = True
    elif self.name == 'LSI':
        desc = "{}D array for each word id".format(self.ntopics)
        # one independent row per word id (avoids aliased rows)
        model_array = [[0.0] * self.ntopics
                       for _ in range(self.model.num_terms)]
        for wid in range(self.model.num_terms):
            model_array[int(wid)] = \
                [float(x) for x in self.model.projection.u[int(wid)]]
        with open(output, 'w') as ostream:
            for weights in model_array:
                json.dump(weights, ostream)
                ostream.write('\n')
        saved = True
    else:
        self.logger.warning(
            "Unknown model '{}': shape export not implemented yet".format(
                self.name))
    if saved:
        self.logger.info("Saved {}'s model shape ({}) under '{}'".format(
            self.name, desc, output))
def store_prediction(self, input_file, output_file):
    """
    Test the classifier on 'untagged' documents.
    Store prediction category and prediction probability in file.
    """
    if not self.prediction_checkups():
        return
    utils.check_file_readable(input_file)
    utils.create_path(output_file)
    sc = StreamCorpus(input_file)
    # open the output corpus before the try block so that the
    # 'finally' clause never closes an unbound stream
    pc = PushCorpus(output_file)
    try:
        for doc in sc:
            if 'features' in doc:
                prediction = self.classify_doc(doc['features'])
                if isinstance(prediction, dict) and \
                        'category' in prediction and \
                        'probas' in prediction:
                    doc['season'] = prediction['category']
                    doc['season_prob'] = prediction['probas']
                    pc.add(doc)
    except Exception as e:
        raise CaughtException(
            "Exception encountered when storing classified documents: "
            "{}".format(e))
    else:
        self.logger.info("Stored {} documents to file".format(pc.size))
    finally:
        pc.close_stream()
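# For reference, classify_doc (not shown in this section) is expected to return
# a dict shaped like the hypothetical example below; the category names and
# probabilities are purely illustrative.
prediction_example = {
    'category': 'winter',                        # predicted season tag
    'probas': {'winter': 0.81, 'summer': 0.19},  # probability per category
}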
def save(output, corpus, progress_cnt=1000):
    """Serialize a gensim corpus to a Matrix Market (.mm) file"""
    utils.create_path(output)
    MmCorpus.serialize(output, corpus, progress_cnt=progress_cnt)
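# Usage sketch: a corpus serialized this way can later be streamed back with
# gensim's MmCorpus constructor; the path below is illustrative only.
from gensim.corpora import MmCorpus

corpus = MmCorpus('output/corpus.mm')  # lazily iterates the .mm file
for bow in corpus:
    # each document is a list of (word_id, weight) tuples
    pass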
def store_stats(self, roc_file, plot_file):
    """
    Compute the sklearn metrics on given predicted documents,
    with respect to each category tag.
    Plot the ROC curve (FP/TP) on each category tag.
    """
    # length of longest category name
    nmax = max(len(cat) for cat in self.categories)
    # stats header inside generated plots
    header = "{} {:>6} {:>6} {:>6} {:>6}".format(
        ' ' * nmax, 'P', 'R', 'Acc', 'AUC')
    utils.create_path(roc_file)
    utils.create_path(plot_file)
    roc = {}
    _, ax = pylab.subplots()
    # annotate the header of the per-category scores table once
    ax.annotate(header, (0.4, 0.67), size=9, family='monospace')
    for index, category in enumerate(self.categories):
        true_labels = []
        pred_labels = []
        pred_probas = []
        for input_file in self.data_files:
            sc = StreamCorpus(input_file)
            for doc in sc:
                if 'category' in doc and 'season' in doc \
                        and 'season_prob' in doc:
                    # convert real categories to list
                    real_categories = doc['category']
                    if isinstance(real_categories, str):
                        real_categories = [real_categories]
                    # convert predicted categories to list
                    pred_categories = doc['season']
                    if isinstance(pred_categories, str):
                        pred_categories = [pred_categories]
                    # check if 'category' is present
                    # in the list of real categories
                    real_cat = 1 if category in real_categories else 0
                    # check if 'category' is present
                    # in the list of predicted categories
                    pred_cat = 1 if category in pred_categories else 0
                    pred_proba = doc['season_prob'][category]
                    true_labels.append(real_cat)
                    pred_labels.append(pred_cat)
                    pred_probas.append(pred_proba)
        prec = sklearn.metrics.precision_score(true_labels, pred_labels)
        recall = sklearn.metrics.recall_score(true_labels, pred_labels)
        acc = sklearn.metrics.accuracy_score(true_labels, pred_labels)
        cm = sklearn.metrics.confusion_matrix(true_labels, pred_labels)
        rep = sklearn.metrics.classification_report(
            true_labels, pred_labels)
        auc = sklearn.metrics.roc_auc_score(true_labels, pred_probas)
        prec = percentage(prec)
        recall = percentage(recall)
        acc = percentage(acc)
        auc = percentage(auc)
        self.logger.info("Category: {}".format(category))
        self.logger.info("Precision: {}%".format(prec))
        self.logger.info("Recall: {}%".format(recall))
        self.logger.info("Accuracy: {}%".format(acc))
        self.logger.info("AUC: {}%".format(auc))
        self.logger.info("Confusion-matrix:\n{}".format(cm))
        self.logger.info("Report:\n{}".format(rep))
        fpr, tpr, thr = sklearn.metrics.roc_curve(
            true_labels, pred_probas, pos_label=1)
        # convert numpy arrays into plain lists
        fpr = [float(x) for x in fpr]
        tpr = [float(x) for x in tpr]
        # store roc values
        roc[category] = [[threshold, fpr[tindex], tpr[tindex]]
                         for tindex, threshold in enumerate(thr)]
        # plot current ROC curve
        ax.plot(fpr, tpr, label=category)
        # annotate current precision, recall, accuracy and AUC scores,
        # aligned with the header columns
        cdisplay = category.rjust(nmax)
        ax.annotate(
            "{} {:>6} {:>6} {:>6} {:>6}".format(
                cdisplay, prec, recall, acc, auc),
            (0.4, 0.6 - 0.07 * index), size=9, family='monospace')
    ax.legend(labels=self.categories)
    pylab.title('Classifier performance (ROC curve)')
    pylab.xlabel('False positive rate')
    pylab.ylabel('True positive rate')
    pylab.savefig(plot_file)
    pylab.close()
    self.logger.info("Plot saved under {} file".format(plot_file))
    # save ROC values to file
    with open(roc_file, 'w') as of:
        for category in roc:
            of.write(category + '\n')
            for entry in roc[category]:
                of.write(' '.join(str(x) for x in entry) + '\n')
    self.logger.info("ROC values saved under {} file".format(roc_file))
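# percentage() is used above to turn sklearn's 0..1 scores into readable
# values but is not defined in this section. A minimal sketch of what it
# presumably does (an assumption, not the project's code):
def percentage(value, ndigits=2):
    """Convert a score in [0, 1] to a percentage rounded to 'ndigits'."""
    return round(float(value) * 100, ndigits)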