Example #1
    def test_shouldReturnTokenisedSentencesFromSentences(self):
        sentence_map = {
            0: "This is a dumb sentence.",
            1: "This is a dumber sentence.",
            2: "This is a dumbest sentence.",
            3: "This is a"
        }
        text_processor = mock()
        when(text_processor).tokenize("This is a dumb sentence.").thenReturn(
            ["dumb", "sentence"])
        when(text_processor).tokenize("This is a dumber sentence.").thenReturn(
            ["dumber", "sentence"])
        when(text_processor).tokenize(
            "This is a dumbest sentence.").thenReturn(["dumbest", "sentence"])
        when(text_processor).tokenize("This is a").thenReturn([])
        expected_tokenised_sentence_map = {
            0: ["dumb", "sentence"],
            1: ["dumber", "sentence"],
            2: ["dumbest", "sentence"]
        }

        document = Document(doc_id="123",
                            text="",
                            text_processor=text_processor)
        actual_tokenised_sentence_map = document.build_tokenised_sentence_map(
            sentence_map)
        self.assertEqual(actual_tokenised_sentence_map,
                         expected_tokenised_sentence_map)
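
Sentence 3 tokenises to an empty list and is missing from the expected map, so empty token lists are evidently dropped. A minimal sketch of a build_tokenised_sentence_map consistent with that behaviour (an assumption; the project's actual implementation is not shown):

    # Assumed shape of the Document method exercised above.
    def build_tokenised_sentence_map(self, sentence_map):
        tokenised_map = {}
        for index, sentence in sentence_map.items():
            tokens = self.text_processor.tokenize(sentence)
            if tokens:  # sentences that tokenise to [] are dropped
                tokenised_map[index] = tokens
        return tokenised_map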
Example #2
    def test_shouldCreateProcessedDocument(self):
        test_text = "This is a dumb sentence. This is a dumber sentence. This is a dumbest sentence."
        sentence_map = {
            0: "This is a dumb sentence.",
            1: "This is a dumber sentence.",
            2: "This is a dumbest sentence."
        }

        tokenised_sentence_map = {
            0: ["dumb", "sentence"],
            1: ["dumber", "sentence"],
            2: ["dumbest", "sentence"]
        }
        document_tokens = [
            "dumb", "sentence", "sentence", "dumber", "dumbest", "sentence"
        ]
        expected_processed_document = ProcessedDocument(
            sentence_map=sentence_map,
            tokenised_sentence_map=tokenised_sentence_map,
            tokens=document_tokens)
        document = Document(doc_id="123", text=test_text)
        actual_processed_document = document.processed_document()

        self.assertEqual(actual_processed_document,
                         expected_processed_document)
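
The assertion compares two ProcessedDocument instances by value, so the class presumably defines __eq__. A minimal sketch of such a value object (an assumption, not the project's actual definition):

class ProcessedDocument(object):
    def __init__(self, sentence_map, tokenised_sentence_map, tokens):
        self.sentence_map = sentence_map
        self.tokenised_sentence_map = tokenised_sentence_map
        self.tokens = tokens

    def __eq__(self, other):
        # Value equality over all three fields, as the test requires.
        return isinstance(other, ProcessedDocument) and \
            self.__dict__ == other.__dict__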
Example #3
    def test_should_check_if_document_is_summarisable(self):
        mock_summarizer = mock(Summarizer())
        test_text = "This is a dumb sentence. This is a dumber sentence. This is a dumbest sentence."
        sentence_map = {
            0: "This is a dumb sentence.",
            1: "This is a dumber sentence.",
            2: "This is a dumbest sentence."
        }

        tokenised_sentence_map = {
            0: ["dumb", "sentence"],
            1: ["dumber", "sentence"],
            2: ["dumbest", "sentence"]
        }
        document_tokens = [
            "dumb", "sentence", "sentence", "dumber", "dumbest", "sentence"
        ]
        processed_document = ProcessedDocument(
            sentence_map=sentence_map,
            tokenised_sentence_map=tokenised_sentence_map,
            tokens=document_tokens)
        document = Document(doc_id="123",
                            text=test_text,
                            summarizer=mock_summarizer)
        when(mock_summarizer).is_summarizable(processed_document).thenReturn(
            True)
        document.is_summarizable()
        verify(mock_summarizer).is_summarizable(processed_document)
Example #4
    def setUp(self):
        test_content_path = os_path.join(os_path.dirname(__file__),
                                         "../test_data/test_content.txt")
        with open(test_content_path) as test_content_file:
            text = test_content_file.read()
        document = Document(doc_id=100, text=text)
        self.processed_document = document.processed_document()
Example #5
    def test_shouldExtractSummary(self):
        summarizer = mock()
        document_text = "This is one summary sentence. This is what you are trying to summarize. " \
                        + "In this test it does not matter what is the kind of text you supply, this is another summary sentence. " \
                        + "That is because it uses a mock and is trying to test the behavior of api. The API sanity check for " \
                        + "summary mechanism itself is showcased in summary test"
        document = Document(doc_id="123",
                            text=document_text,
                            summarizer=summarizer)
        sentence_map = {
            0: 'This is one summary sentence.',
            1: 'This is what you are trying to summarize.',
            2: 'In this test it does not matter what is the kind of text you supply, this is another summary sentence.',
            3: 'That is because it uses a mock and is trying to test the behavior of api.',
            4: 'The API sanity check for summary mechanism itself is showcased in summary test'
        }
        tokenised_sentence_map = {
            0: ['summary', 'sentence'],
            1: ['summarize'],
            2: [
                'test', 'matter', 'kind', 'text', 'supply', 'summary',
                'sentence'
            ],
            3: ['mock', 'test', 'behavior', 'api'],
            4: [
                'API', 'sanity', 'check', 'summary', 'mechanism', 'showcased',
                'summary', 'test'
            ]
        }
        tokens = [
            'summary', 'sentence', 'summarize', 'test', 'matter', 'kind',
            'text', 'supply', 'summary', 'sentence', 'mock', 'test',
            'behavior', 'api', 'API', 'sanity', 'check', 'summary',
            'mechanism', 'showcased', 'summary', 'test'
        ]
        processed_document = ProcessedDocument(
            sentence_map=sentence_map,
            tokenised_sentence_map=tokenised_sentence_map,
            tokens=tokens)
        summary_sentences = [
            "This is one summary sentence", "this is another summary sentence"
        ]
        expected_summary = DocumentSummary("123", summary_sentences)
        when(summarizer).summarize_using_weighing_measures(
            processed_document).thenReturn(summary_sentences)

        document_summary = document.summary()

        verify(summarizer).summarize_using_weighing_measures(
            processed_document)
        self.assertEqual(expected_summary, document_summary)
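
Given the stubbing and verification above, Document.summary() presumably builds the processed document, delegates to the summarizer, and wraps the result. A minimal sketch under that assumption:

    # Assumed shape of Document.summary(); attribute names are guesses.
    def summary(self):
        processed_document = self.processed_document()
        summary_sentences = self.summarizer.summarize_using_weighing_measures(
            processed_document)
        return DocumentSummary(self.doc_id, summary_sentences)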
Example #6
    def test_should_return_false_when_numericAndSymbolsFormEntireText(self):
        text_file_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__),
                         "../test_data/number_random_text.txt"))
        with open(text_file_path, "rb") as text_file:
            document_content = text_file.read()

        document = Document(text_file_path, document_content)
        processed_document = document.processed_document()
        summarizer = Summarizer()
        self.assertFalse(summarizer.is_summarizable(processed_document))
Example #7
    def test_should_return_false_if_document_is_not_summarizable(self):
        text_file_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__),
                         "../test_data/non_summarisable_document.txt"))
        with open(text_file_path, "rb") as text_file:
            document_content = text_file.read()

        document = Document(text_file_path, document_content)
        processed_document = document.processed_document()
        summarizer = Summarizer()
        self.assertFalse(summarizer.is_summarizable(processed_document))
Example #8
    def test_should_check_whether_sentences_are_cohesive_using_standard_deviation_sentence_length(
            self):
        text_processor = TextProcessor()
        for name, document_content in self.text_corpus.iteritems():
            document = Document(name,
                                document_content,
                                text_processor=text_processor)
            processed_document = document.processed_document()
            weight_map = processed_document.sentence_length_map()
            cohesion = self.get_sentences_cohesion_distance(weight_map)
            print "########################################################################"
            print "SentenceLength - Cohesion for docId:%s, length: %d is %s " % (
                str(name), processed_document.number_of_sentences(),
                str(cohesion))
            print "########################################################################"
Example #9
    def test_should_check_whether_the_document_is_summarisable(self):
        text_processor = TextProcessor()
        correct_classification = 0
        document_classified_count = {
            "summarizable": {
                "correct": 0,
                "all": 0
            },
            "non-summarizable": {
                "correct": 0,
                "all": 0
            }
        }
        for name, document_content in self.text_corpus.iteritems():
            document = Document(name,
                                document_content,
                                text_processor=text_processor)
            summarizability = document.is_summarizable()
            document_type = self.document_type(name)
            document_classified_count[document_type]["all"] += 1
            if self.summarizability_map[document_type] == summarizability:
                correct_classification += 1
                document_classified_count[document_type]["correct"] += 1
            print "########################################################################"
            print "Document id ", name, "Summarizability:", summarizability
        print "############################# Summary ##################################"
        print "Number of Documents: ", len(self.text_corpus)
        print "Number of Correctly Identified Documents: ", correct_classification
        print "Accuracy", correct_classification / float(len(self.text_corpus))
        print "Precision - Summarizable: ", document_classified_count[
            "summarizable"]["correct"] / float(
                document_classified_count["summarizable"]["all"])
        print "Precision - Non-Summarizable: ", document_classified_count[
            "non-summarizable"]["correct"] / float(
                document_classified_count["non-summarizable"]["all"])
        print "########################################################################"
Example #10
    def put(self):
        request_body = self.request.body
        logger.debug("Request received for summarisation; Request body: %s" %
                     request_body[0:20])
        parameters = json.loads(request_body)

        callback_url = parameters["callback"]
        document_id = parameters["documentId"]
        extracted_text = parameters["extractedText"]
        compression_ratio = int(parameters["compressionRatio"])
        summarizer = Summarizer(compression_ratio=compression_ratio)
        document = Document(doc_id=document_id,
                            text=extracted_text,
                            summarizer=summarizer)

        try:
            logger.info("Generating Summary for document %s" % document_id)
            document_summary = document.summary()
        except Exception as e:
            logger.exception("Error while generating summary for document %s" %
                             document_id)
            requests.post(callback_url,
                          data=json.dumps({
                              "status": "failure",
                              "message": str(e),
                              "documentId": document_id
                          }),
                          headers={'Content-Type': 'application/json'})
        else:
            logger.info(
                "Summarisation completed for document %s. Updating to callback %s and sample summary %s"
                %
                (document_id, callback_url, document_summary.to_json()[0:20]))
            requests.post(callback_url,
                          data=(document_summary.to_json()),
                          headers={'Content-Type': 'application/json'})
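
For reference, a request body this handler accepts could look like the following; the field names come from the handler, the values are illustrative:

import json

payload = json.dumps({
    "callback": "http://example.com/summary-callback",  # illustrative URL
    "documentId": "123",
    "extractedText": "This is one summary sentence. ...",
    "compressionRatio": 30
})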
Example #11
    def test_shouldBuildSentenceMap(self):
        test_text = "This is one summary sentences. This is what you are trying to summarize." \
                    + "In this test it does not matter what is the kind of text you supply, this is another summary sentence." \
                    + "That is because it uses a mock and is trying to test the behavior of api.The API sanity check for" \
                    + "summary mechanism itself is showcased in summary test"
        text_processor = mock()
        when(text_processor).nltk_sentences(test_text).thenReturn([
            "This is one summary sentences. This is what you are trying to summarize.",
            "In this test it does not matter what is the kind of text you supply, this is another summary sentence.",
            "That is because it uses a mock and is trying to test the behavior of api.The API sanity check for summary mechanism itself is showcased in summary test"
        ])
        document = Document(doc_id="123",
                            text=test_text,
                            text_processor=text_processor)
        expected_sentence_map = {
            0: "This is one summary sentences. This is what you are trying to summarize.",
            1: "In this test it does not matter what is the kind of text you supply, this is another summary sentence.",
            2: "That is because it uses a mock and is trying to test the behavior of api.The API sanity check for summary mechanism itself is showcased in summary test"
        }
        actual_sentence_map = document.build_sentence_map()
        self.assertEqual(actual_sentence_map, expected_sentence_map)
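
The expected map is simply the enumeration of the sentences the text processor returns, so build_sentence_map can be as small as this sketch (an assumption, not the project's code):

    def build_sentence_map(self):
        # Index the sentences produced by the text processor from 0 upwards.
        sentences = self.text_processor.nltk_sentences(self.text)
        return dict(enumerate(sentences))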
Example #12
    def check_summarizability(self):
        request_body = self.request.body
        logger.debug(
            "Request received for checking summarizability; Request body: %s" %
            request_body[0:20])
        parameters = json.loads(request_body)

        document_id = parameters["documentId"]
        extracted_text = parameters["extractedText"]
        compression_ratio = int(parameters["compressionRatio"])
        summarizer = Summarizer(compression_ratio=compression_ratio)
        document = Document(doc_id=document_id,
                            text=extracted_text,
                            summarizer=summarizer)
        logger.info("Checking summarizability for %s" % document_id)
        summarizability = document.is_summarizable()
        logger.info("Summarizability for %s is %s" %
                    (document_id, str(summarizability)))
        response = {
            "documentId": document_id,
            "summarizability": summarizability,
        }
        self.write(json.dumps(response))
        self.set_header("Content-Type", "application/json")
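
The JSON written back to the client has this shape (values illustrative):

{
    "documentId": "123",
    "summarizability": true
}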