Example #1
    def test_shouldReturnTokenisedSentencesFromSentences(self):
        sentence_map = {
            0: "This is a dumb sentence.",
            1: "This is a dumber sentence.",
            2: "This is a dumbest sentence.",
            3: "This is a"
        }
        text_processor = mock()
        when(text_processor).tokenize("This is a dumb sentence.").thenReturn(
            ["dumb", "sentence"])
        when(text_processor).tokenize("This is a dumber sentence.").thenReturn(
            ["dumber", "sentence"])
        when(text_processor).tokenize(
            "This is a dumbest sentence.").thenReturn(["dumbest", "sentence"])
        when(text_processor).tokenize("This is a").thenReturn([])
        expected_tokenised_sentence_map = {
            0: ["dumb", "sentence"],
            1: ["dumber", "sentence"],
            2: ["dumbest", "sentence"]
        }

        document = Document(doc_id="123",
                            text="",
                            text_processor=text_processor)
        actual_tokenised_sentence_map = document.build_tokenised_sentence_map(
            sentence_map)
        self.assertEqual(actual_tokenised_sentence_map,
                         expected_tokenised_sentence_map)
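
Sentence 3 tokenises to an empty list and is missing from the expected map, so empty token lists are evidently dropped. A minimal sketch of a build_tokenised_sentence_map consistent with that behaviour (an assumption; the project's actual implementation is not shown):

    # Assumed shape of the Document method exercised above.
    def build_tokenised_sentence_map(self, sentence_map):
        tokenised_map = {}
        for index, sentence in sentence_map.items():
            tokens = self.text_processor.tokenize(sentence)
            if tokens:  # sentences that tokenise to [] are dropped
                tokenised_map[index] = tokens
        return tokenised_map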
Example #2
    def test_shouldCreateProcessedDocument(self):
        test_text = "This is a dumb sentence. This is a dumber sentence. This is a dumbest sentence."
        sentence_map = {
            0: "This is a dumb sentence.",
            1: "This is a dumber sentence.",
            2: "This is a dumbest sentence."
        }

        tokenised_sentence_map = {
            0: ["dumb", "sentence"],
            1: ["dumber", "sentence"],
            2: ["dumbest", "sentence"]
        }
        document_tokens = [
            "dumb", "sentence", "sentence", "dumber", "dumbest", "sentence"
        ]
        expected_processed_document = ProcessedDocument(
            sentence_map=sentence_map,
            tokenised_sentence_map=tokenised_sentence_map,
            tokens=document_tokens)
        document = Document(doc_id="123", text=test_text)
        actual_processed_document = document.processed_document()

        self.assertEqual(actual_processed_document,
                         expected_processed_document)
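
The assertion compares two ProcessedDocument instances by value, so the class presumably defines __eq__. A minimal sketch of such a value object (an assumption, not the project's actual definition):

class ProcessedDocument(object):
    def __init__(self, sentence_map, tokenised_sentence_map, tokens):
        self.sentence_map = sentence_map
        self.tokenised_sentence_map = tokenised_sentence_map
        self.tokens = tokens

    def __eq__(self, other):
        # Value equality over all three fields, as the test requires.
        return isinstance(other, ProcessedDocument) and \
            self.__dict__ == other.__dict__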
Example #3
    def test_should_check_if_document_is_summarisable(self):
        mock_summarizer = mock(Summarizer())
        test_text = "This is a dumb sentence. This is a dumber sentence. This is a dumbest sentence."
        sentence_map = {
            0: "This is a dumb sentence.",
            1: "This is a dumber sentence.",
            2: "This is a dumbest sentence."
        }

        tokenised_sentence_map = {
            0: ["dumb", "sentence"],
            1: ["dumber", "sentence"],
            2: ["dumbest", "sentence"]
        }
        document_tokens = [
            "dumb", "sentence", "sentence", "dumber", "dumbest", "sentence"
        ]
        processed_document = ProcessedDocument(
            sentence_map=sentence_map,
            tokenised_sentence_map=tokenised_sentence_map,
            tokens=document_tokens)
        document = Document(doc_id="123",
                            text=test_text,
                            summarizer=mock_summarizer)
        when(mock_summarizer).is_summarizable(processed_document).thenReturn(
            True)
        document.is_summarizable()
        verify(mock_summarizer).is_summarizable(processed_document)
Example #4
    def setUp(self):
        test_content_path = os_path.join(os_path.dirname(__file__),
                                         "../test_data/test_content.txt")
        with open(test_content_path) as test_content_file:
            text = test_content_file.read()
        document = Document(doc_id=100, text=text)
        self.processed_document = document.processed_document()
Example #5
    def test_shouldExtractSummary(self):
        summarizer = mock()
        document_text = "This is one summary sentence. This is what you are trying to summarize. " \
                        + "In this test it does not matter what is the kind of text you supply, this is another summary sentence. " \
                        + "That is because it uses a mock and is trying to test the behavior of api. The API sanity check for " \
                        + "summary mechanism itself is showcased in summary test"
        document = Document(doc_id="123",
                            text=document_text,
                            summarizer=summarizer)
        sentence_map = {
            0: 'This is one summary sentence.',
            1: 'This is what you are trying to summarize.',
            2: 'In this test it does not matter what is the kind of text you supply, this is another summary sentence.',
            3: 'That is because it uses a mock and is trying to test the behavior of api.',
            4: 'The API sanity check for summary mechanism itself is showcased in summary test'
        }
        tokenised_sentence_map = {
            0: ['summary', 'sentence'],
            1: ['summarize'],
            2: [
                'test', 'matter', 'kind', 'text', 'supply', 'summary',
                'sentence'
            ],
            3: ['mock', 'test', 'behavior', 'api'],
            4: [
                'API', 'sanity', 'check', 'summary', 'mechanism', 'showcased',
                'summary', 'test'
            ]
        }
        tokens = [
            'summary', 'sentence', 'summarize', 'test', 'matter', 'kind',
            'text', 'supply', 'summary', 'sentence', 'mock', 'test',
            'behavior', 'api', 'API', 'sanity', 'check', 'summary',
            'mechanism', 'showcased', 'summary', 'test'
        ]
        processed_document = ProcessedDocument(
            sentence_map=sentence_map,
            tokenised_sentence_map=tokenised_sentence_map,
            tokens=tokens)
        summary_sentences = [
            "This is one summary sentence", "this is another summary sentence"
        ]
        expected_summary = DocumentSummary("123", summary_sentences)
        when(summarizer).summarize_using_weighing_measures(
            processed_document).thenReturn(summary_sentences)

        document_summary = document.summary()

        verify(summarizer).summarize_using_weighing_measures(
            processed_document)
        self.assertEqual(expected_summary, document_summary)
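
Given the stubbing and verification above, Document.summary() presumably builds the processed document, delegates to the summarizer, and wraps the result. A minimal sketch under that assumption:

    # Assumed shape of Document.summary(); attribute names are guesses.
    def summary(self):
        processed_document = self.processed_document()
        summary_sentences = self.summarizer.summarize_using_weighing_measures(
            processed_document)
        return DocumentSummary(self.doc_id, summary_sentences)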
Example #6
    def test_should_return_false_when_numericAndSymbolsFormEntireText(self):
        text_file_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__),
                         "../test_data/number_random_text.txt"))
        with open(text_file_path, "rb") as text_file:
            document_content = text_file.read()

        document = Document(text_file_path, document_content)
        processed_document = document.processed_document()
        summarizer = Summarizer()
        self.assertFalse(summarizer.is_summarizable(processed_document))
Example #7
    def test_should_return_false_if_document_is_not_summarizable(self):
        text_file_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__),
                         "../test_data/non_summarisable_document.txt"))
        with open(text_file_path, "rb") as text_file:
            document_content = text_file.read()

        document = Document(text_file_path, document_content)
        processed_document = document.processed_document()
        summarizer = Summarizer()
        self.assertFalse(summarizer.is_summarizable(processed_document))
Example #8
    def test_should_check_whether_sentences_are_cohesive_using_standard_deviation_sentence_length(
            self):
        text_processor = TextProcessor()
        for name, document_content in self.text_corpus.iteritems():
            document = Document(name,
                                document_content,
                                text_processor=text_processor)
            processed_document = document.processed_document()
            weight_map = processed_document.sentence_length_map()
            cohesion = self.get_sentences_cohesion_distance(weight_map)
            print "########################################################################"
            print "SentenceLength - Cohesion for docId:%s, length: %d is %s " % (
                str(name), processed_document.number_of_sentences(),
                str(cohesion))
            print "########################################################################"
Example #9
    def test_should_check_whether_the_document_is_summarisable(self):
        text_processor = TextProcessor()
        correct_classification = 0
        document_classified_count = {
            "summarizable": {
                "correct": 0,
                "all": 0
            },
            "non-summarizable": {
                "correct": 0,
                "all": 0
            }
        }
        for name, document_content in self.text_corpus.iteritems():
            document = Document(name,
                                document_content,
                                text_processor=text_processor)
            summarizability = document.is_summarizable()
            document_type = self.document_type(name)
            document_classified_count[document_type]["all"] += 1
            if self.summarizability_map[document_type] == summarizability:
                correct_classification += 1
                document_classified_count[document_type]["correct"] += 1
            print "########################################################################"
            print "Document id ", name, "Summarizability:", summarizability
        print "############################# Summary ##################################"
        print "Number of Documents: ", len(self.text_corpus)
        print "Number of Correctly Identified Documents: ", correct_classification
        print "Accuracy", correct_classification / float(len(self.text_corpus))
        print "Precision - Summarizable: ", document_classified_count[
            "summarizable"]["correct"] / float(
                document_classified_count["summarizable"]["all"])
        print "Precision - Non-Summarizable: ", document_classified_count[
            "non-summarizable"]["correct"] / float(
                document_classified_count["non-summarizable"]["all"])
        print "########################################################################"
Example #10
    def put(self):
        request_body = self.request.body
        logger.debug("Request received for summarisation; Request body: %s" %
                     request_body[0:20])
        parameters = json.loads(request_body)

        callback_url = parameters["callback"]
        document_id = parameters["documentId"]
        extracted_text = parameters["extractedText"]
        compression_ratio = int(parameters["compressionRatio"])
        summarizer = Summarizer(compression_ratio=compression_ratio)
        document = Document(doc_id=document_id,
                            text=extracted_text,
                            summarizer=summarizer)

        try:
            logger.info("Generating Summary for document %s" % document_id)
            document_summary = document.summary()
        except Exception as e:
            logger.exception("Error while generating summary for document %s" %
                             document_id)
            requests.post(callback_url,
                          data=json.dumps({
                              "status": "failure",
                              "message": str(e),
                              "documentId": document_id
                          }),
                          headers={'Content-Type': 'application/json'})
        else:
            logger.info(
                "Summarisation completed for document %s. Updating to callback %s and sample summary %s"
                %
                (document_id, callback_url, document_summary.to_json()[0:20]))
            requests.post(callback_url,
                          data=(document_summary.to_json()),
                          headers={'Content-Type': 'application/json'})
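
For reference, a request body this handler accepts could look like the following; the field names come from the handler, the values are illustrative:

import json

payload = json.dumps({
    "callback": "http://example.com/summary-callback",  # illustrative URL
    "documentId": "123",
    "extractedText": "This is one summary sentence. ...",
    "compressionRatio": 30
})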
Example #11
    def test_shouldBuildSentenceMap(self):
        test_text = "This is one summary sentences. This is what you are trying to summarize." \
                    + "In this test it does not matter what is the kind of text you supply, this is another summary sentence." \
                    + "That is because it uses a mock and is trying to test the behavior of api.The API sanity check for" \
                    + "summary mechanism itself is showcased in summary test"
        text_processor = mock()
        when(text_processor).nltk_sentences(test_text).thenReturn([
            "This is one summary sentences. This is what you are trying to summarize.",
            "In this test it does not matter what is the kind of text you supply, this is another summary sentence.",
            "That is because it uses a mock and is trying to test the behavior of api.The API sanity check for summary mechanism itself is showcased in summary test"
        ])
        document = Document(doc_id="123",
                            text=test_text,
                            text_processor=text_processor)
        expected_sentence_map = {
            0: "This is one summary sentences. This is what you are trying to summarize.",
            1: "In this test it does not matter what is the kind of text you supply, this is another summary sentence.",
            2: "That is because it uses a mock and is trying to test the behavior of api.The API sanity check for summary mechanism itself is showcased in summary test"
        }
        actual_sentence_map = document.build_sentence_map()
        self.assertEqual(actual_sentence_map, expected_sentence_map)
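
The expected map is simply the enumeration of the sentences the text processor returns, so build_sentence_map can be as small as this sketch (an assumption, not the project's code):

    def build_sentence_map(self):
        # Index the sentences produced by the text processor from 0 upwards.
        sentences = self.text_processor.nltk_sentences(self.text)
        return dict(enumerate(sentences))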
Example #12
    def check_summarizability(self):
        request_body = self.request.body
        logger.debug(
            "Request received for checking summarizability; Request body: %s" %
            request_body[0:20])
        parameters = json.loads(request_body)

        document_id = parameters["documentId"]
        extracted_text = parameters["extractedText"]
        compression_ratio = int(parameters["compressionRatio"])
        summarizer = Summarizer(compression_ratio=compression_ratio)
        document = Document(doc_id=document_id,
                            text=extracted_text,
                            summarizer=summarizer)
        logger.info("Checking summarizability for %s" % document_id)
        summarizability = document.is_summarizable()
        logger.info("Summarizability for %s is %s" %
                    (document_id, str(summarizability)))
        response = {
            "documentId": document_id,
            "summarizability": summarizability,
        }
        self.write(json.dumps(response))
        self.set_header("Content-Type", "application/json")
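
The JSON written back to the client has this shape (values illustrative):

{
    "documentId": "123",
    "summarizability": true
}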