Beispiel #1
0
    def test(self, e):
        """Build an SVD-based extractive summary of ``e`` and score it.

        The sentences in ``e.sentences`` are turned into a word-count matrix,
        the matrix is decomposed with a reduced SVD, weak singular values are
        zeroed, and each sentence is ranked by its weighted squared length in
        the remaining singular-vector space. The highest-ranked sentences
        (at most three) are kept in document order and passed, together with
        ``e.ground_truths``, to ``Evaluator.ROUGE1`` and ``Evaluator.ROUGE2``.

        Args:
            e: Object exposing ``sentences`` (a sequence of strings) and
                ``ground_truths`` (forwarded unchanged to the evaluator).

        Returns:
            A 6-tuple: the three values returned by ``ROUGE1`` followed by
            the three values returned by ``ROUGE2`` (presumably precision,
            recall and F1 for each -- confirm against ``Evaluator``).
        """
        top_k = 3  # number of sentences kept in the summary
        sigma_ratio = 0.5  # keep singular values >= this fraction of the largest

        # Per-sentence word counts: {sentence index: {word: occurrences}}.
        counts_by_sentence = {}
        for idx, sentence in enumerate(e.sentences):
            counts = {}
            # Raw string: '\W' in a normal literal is an invalid escape sequence.
            for word in re.split(r'\W+', sentence):
                if not word:  # re.split yields "" at leading/trailing separators
                    continue
                counts[word] = counts.get(word, 0) + 1
            counts_by_sentence[idx] = counts

        # from_dict uses the outer keys as columns, so the array is
        # words x sentences; words missing from a sentence become 0.
        arr = pd.DataFrame.from_dict(counts_by_sentence).fillna(0).values

        # Reduced SVD; only s and vh are needed. Column j of vh belongs to
        # sentence j.
        _, s, vh = np.linalg.svd(arr, full_matrices=False)

        # Zero every singular value below the relative cut-off.
        sigma_cutoff = s.max() * sigma_ratio
        s[s < sigma_cutoff] = 0

        # Squared saliency of each sentence: sum_k (s_k ** 2) * (vh[k, j] ** 2).
        # The square root is skipped because it does not change the ranking.
        saliency_vec = np.dot(np.square(s), np.square(vh))

        # Indices of the top_k most salient sentences, restored to the order
        # in which they appear in the document.
        top_sentences = np.sort(saliency_vec.argsort()[-top_k:])

        pred = [e.sentences[i] for i in top_sentences]

        evaluate = Evaluator()
        p_1, r_1, f1_1 = evaluate.ROUGE1(pred=pred, test=e.ground_truths)
        p_2, r_2, f1_2 = evaluate.ROUGE2(pred=pred, test=e.ground_truths)

        return p_1, r_1, f1_1, p_2, r_2, f1_2