def summarize(inDir, params):
    """Generate an extractive, cross-lingual summary of the documents in *inDir*.

    Pipeline: load the corpus (with translation and optional simplification),
    build sentence-similarity graphs for the original and translated sides,
    compute saliency scores with a damped power iteration (LexRank-style),
    order sentences greedily under a redundancy penalty, then pack sentences
    into the summary until the size budget is exhausted.

    Parameters
    ----------
    inDir : str
        Directory the Corpus is loaded from.
    params : dict
        Required keys:
        - 'simplify': None disables simplification; 'early' additionally
          replaces sentences with their simplified form at load time.
        - 'alpha': float weight interpolating the translation-side similarity
          matrix against the original-side one.
        - 'max_iter': int cap on power-iteration steps.
        - 'size': (budget, countTokens) pair; countTokens selects a token
          budget, otherwise a character budget.

    Returns
    -------
    Summary
        The selected sentences, within the requested size budget.
    """
    logger.info("Loading documents from %s", inDir)
    c = Corpus(inDir).load(
        params,
        translate=True,
        simplify=(params['simplify'] is not None),
        replaceWithSimplified=(params['simplify'] == 'early'))

    logger.info("Setting up summarizer")
    # Sentence-similarity graphs for the original (en) and translated (cn)
    # sides; zero the diagonal so a sentence cannot vote for itself.
    M_en = cosine_similarity(c.getSentenceVectors())
    np.fill_diagonal(M_en, 0)
    M_cn = cosine_similarity(c.getTranslationSentenceVectors())
    np.fill_diagonal(M_cn, 0)

    # Interpolate the two graphs and row-normalize into a stochastic matrix.
    alpha = params['alpha']
    M_encn = _row_normalize((alpha * M_cn) + ((1 - alpha) * M_en))

    logger.info("Iteratively computing sentence saliency scores")
    infoScore, iterations = _power_iteration(M_encn, params['max_iter'])
    # NOTE: lazy %-args (was eager "%" formatting); also safe for
    # max_iter == 0, where the old code hit a NameError on the loop index.
    logger.info("Optimization completed in %d iterations", iterations)

    logger.info("Computing final sentence scores including redundancy penalty")
    sentence_order = _greedy_order(infoScore, M_cn)

    logger.info("Generating final summary")
    return _pack_summary(c.getSentences(), sentence_order, params['size'])


def _power_iteration(M, max_iter, mu=0.85):
    """Damped power iteration over row-stochastic matrix *M*.

    Starts from a random normalized score vector and iterates
    ``scores = normalize(mu * M.T @ scores + (1 - mu) / n)`` until
    convergence (elementwise np.isclose) or *max_iter* steps.

    Returns (scores, iterations_run); iterations_run is 0 when
    max_iter == 0.
    """
    n = M.shape[0]
    scores = normalize(np.random.random((n,)))
    iterations = 0
    for iterations in range(1, max_iter + 1):
        prev = scores
        scores = normalize((mu * np.dot(M.T, scores)) + ((1 - mu) / n))
        if np.all(np.isclose(prev, scores)):  # converged
            break
    return scores, iterations


def _greedy_order(infoScore, M_cn):
    """Order sentence indices by saliency with a redundancy penalty.

    Repeatedly picks the highest-scoring sentence, then discounts every
    remaining sentence by its (cn-side) similarity to the pick, weighted
    by the pick's own saliency.
    """
    scores = infoScore.copy()
    order = []
    for _ in range(len(scores)):
        best = np.argmax(scores)
        order.append(best)
        # Penalize sentences similar to the one just chosen.
        scores -= M_cn[:, best] * infoScore[best]
        scores[best] = float('-inf')  # never re-select the same sentence
    return order


def _pack_summary(sentences, sentence_order, size_param):
    """Fill a Summary with sentences, in order, under the size budget.

    *size_param* is (budget, countTokens); countTokens switches between a
    token budget and a character budget. Sentences that would overflow the
    budget are skipped, but later (smaller) ones may still fit.
    """
    sizeBudget, countTokens = size_param
    sizeName = "tokens" if countTokens else "chars"

    def sentenceSize(sent):
        return sent.tokenCount() if countTokens else sent.charCount()

    def summarySize(summary):
        return summary.tokenCount() if countTokens else summary.charCount()

    logger.info("Summary budget: %d %s", sizeBudget, sizeName)
    summary = Summary()
    for sentence_id in sentence_order:
        if summarySize(summary) >= sizeBudget:
            break
        sentence_size = sentenceSize(sentences[sentence_id])
        if summarySize(summary) + sentence_size <= sizeBudget:
            logger.info("Sentence added with size: %d, ", sentence_size)
            summary.addSentence(sentences[sentence_id])
    logger.info("Optimization done, summary size: %d chars, %d tokens",
                summary.charCount(), summary.tokenCount())
    return summary