Example #1
0
def test_paragraph_splitting():
    """Check that ``ParagraphSplitter.find_paragraphs`` yields exactly ten
    clean, period-terminated paragraphs from the fixture file, and that the
    split neither gains nor loses any non-whitespace characters.
    """
    splitter = ParagraphSplitter()

    fixture_dir = os.path.dirname(os.path.realpath(__file__))
    fixture_path = os.path.join(fixture_dir, 'test_paragraph_splitting.txt')
    with open(fixture_path) as fixture_file:
        text = fixture_file.read()

    paragraphs = splitter.find_paragraphs(text)

    assert len(paragraphs) == 10

    for number, paragraph in enumerate(paragraphs, start=1):
        # Each paragraph is labeled with its own 1-based number in the fixture.
        assert 'paragraph {}'.format(number) in paragraph
        assert paragraph[-1] == "."
        # Newlines within each paragraph should have been removed.
        assert '\n' not in paragraph

    def non_ws_len(s):
        # Number of non-whitespace characters in *s*.
        return len(re.sub(r'\s', '', s))

    # The total number of non-whitespace characters must be preserved.
    assert non_ws_len(text) == sum(non_ws_len(p) for p in paragraphs)
    def parse_document(self, doc_dict):
        """Syntax-parse every paragraph of a document.

        :param doc_dict: dictionary with at least ``doc_id`` and
            ``raw_text`` keys.
        :returns: a 2-tuple ``(trees, starts_paragraph_list)`` — the flat
            list of parse trees for the whole document, and a parallel list
            of booleans marking which trees begin a new paragraph.
        :raises RuntimeError: if neither the ZPar server proxy nor the
            shared-library handle is available.
        """
        doc_id = doc_dict["doc_id"]
        logging.info('syntax parsing, doc_id = {}'.format(doc_id))

        # TODO should there be some extra preprocessing to deal with fancy
        # quotes, etc.? The tokenizer doesn't appear to handle it well
        paragraphs = ParagraphSplitter.find_paragraphs(doc_dict["raw_text"],
                                                       doc_id=doc_id)

        para_starts = []
        all_trees = []
        skipped_a_paragraph = False
        for para in paragraphs:
            # Prefer the ZPar server; otherwise fall back to the shared
            # library, which must then be loaded.
            if self._zpar_proxy:
                parsed = self._parse_document_via_server(para, doc_id)
            elif self._zpar_ref is None:
                raise RuntimeError('The ZPar server is unavailable.')
            else:
                parsed = self._parse_document_via_lib(para, doc_id)

            if not parsed:
                # TODO add some sort of error flag to the dictionary for this
                # document?
                skipped_a_paragraph = True
                continue

            # The first tree of a paragraph is flagged True, the rest False.
            para_starts.append(True)
            para_starts.extend([False] * (len(parsed) - 1))
            all_trees.extend(parsed)

        logging.debug('starts_paragraph_list = {}, doc_id = {}'
                      .format(para_starts, doc_id))

        # Either every paragraph contributed exactly one True flag, or the
        # syntax parser had to skip a paragraph entirely.
        assert (skipped_a_paragraph
                or sum(para_starts) == len(paragraphs))
        assert len(all_trees) == len(para_starts)

        return all_trees, para_starts
    def parse_document(self, doc_dict):
        """Parse each paragraph of one document into syntax trees.

        :param doc_dict: dictionary with at least ``doc_id`` and
            ``raw_text`` keys.
        :returns: a 2-tuple ``(trees, starts_paragraph_list)`` — the
            document's parse trees and a same-length list of booleans,
            True exactly where a tree starts a new paragraph.
        :raises RuntimeError: if no ZPar server proxy exists and the
            shared-library handle is also missing.
        """
        doc_id = doc_dict["doc_id"]
        logging.info('syntax parsing, doc_id = {}'.format(doc_id))

        # TODO should there be some extra preprocessing to deal with fancy
        # quotes, etc.? The tokenizer doesn't appear to handle it well
        paragraphs = ParagraphSplitter.find_paragraphs(doc_dict["raw_text"],
                                                       doc_id=doc_id)

        tree_list = []
        start_flags = []
        paragraph_without_parse = False
        for chunk in paragraphs:
            if self._zpar_proxy:
                # try to use the server first
                chunk_trees = self._parse_document_via_server(chunk, doc_id)
            else:
                # no server: the shared library must be loaded
                if self._zpar_ref is None:
                    raise RuntimeError('The ZPar server is unavailable.')
                chunk_trees = self._parse_document_via_lib(chunk, doc_id)

            if len(chunk_trees) == 0:
                # TODO add some sort of error flag to the dictionary for this
                # document?
                paragraph_without_parse = True
            else:
                # One True per paragraph, then False for its remaining trees.
                start_flags += [True] + [False] * (len(chunk_trees) - 1)
                tree_list += chunk_trees

        logging.debug('starts_paragraph_list = {}, doc_id = {}'
                      .format(start_flags, doc_id))

        # Sanity checks: the True flags match the paragraph count (unless a
        # paragraph was skipped), and the two lists stay parallel.
        assert (sum(start_flags) == len(paragraphs)
                or paragraph_without_parse)
        assert len(tree_list) == len(start_flags)

        return tree_list, start_flags