Example #1
def open(self, filepath):
    super(StanfordParsedSentenceReader, self).open(filepath)
    base_path, _ = os.path.splitext(filepath)
    parse_file_name = base_path + '.parse'
    if FLAGS.reader_gold_parses:
        # Prefer a hand-corrected gold parse (e.g., foo.parse.gold).
        non_gold_file_name = parse_file_name
        parse_file_name += ".gold"
    try:
        self._parse_file = CharacterTrackingStreamWrapper(
            io.open(parse_file_name, 'rb'), FLAGS.reader_codec)
    except IOError:
        if FLAGS.reader_gold_parses and FLAGS.gold_parses_fallback:
            logging.info("Falling back to non-gold parse for %s", filepath)
            self._parse_file = CharacterTrackingStreamWrapper(
                io.open(non_gold_file_name, 'rb'), FLAGS.reader_codec)
        else:
            raise
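
These examples read their configuration from a module-level FLAGS object. Below is a minimal sketch of the flag definitions they appear to assume, written with the python-gflags library; the flag names come from the code above, but the defaults, help strings, and the choice of gflags itself are guesses:

import gflags

FLAGS = gflags.FLAGS

# Assumed definitions; the real project defines these elsewhere.
gflags.DEFINE_string('reader_codec', 'utf-8',
                     'Codec with which to decode documents and parse files.')
gflags.DEFINE_bool('reader_gold_parses', False,
                   'Read gold-standard .parse.gold files instead of .parse.')
gflags.DEFINE_bool('gold_parses_fallback', False,
                   'Fall back to the .parse file when no .gold file exists.')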
Example #2
def open(self, filepath):
    self.close()  # Release any stream from a previously opened file.
    self._file_stream = CharacterTrackingStreamWrapper(
        io.open(filepath, 'rb'), FLAGS.reader_codec)
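
The reader in Example #4 below subclasses DocumentReader, calls super().open(), and reads from self._file_stream, so the method above is presumably that base class's open(). Here is a minimal sketch of the assumed base class, inferred from those call sites rather than taken from the project:

import io

class DocumentReader(object):
    '''Sketch of the assumed base class: owns the main text stream from
    which subclasses read documents. CharacterTrackingStreamWrapper and
    FLAGS are as in the surrounding examples.'''
    def __init__(self, filepath=None):
        self._file_stream = None
        if filepath:
            self.open(filepath)

    def open(self, filepath):
        self.close()  # Release any stream from a previously opened file.
        self._file_stream = CharacterTrackingStreamWrapper(
            io.open(filepath, 'rb'), FLAGS.reader_codec)

    def close(self):
        if self._file_stream:
            self._file_stream.close()
            self._file_stream = None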
Example #3
class CharacterTrackingStreamWrapperTest(unittest.TestCase):
    data_str = (
        "Guinea has been in turmoil for more than a month,\nas labor"
        " unions and other civic groups have demanded that the ailing leader,"
        " Lansana Cont\xc3\xa9, step aside.")

    @classmethod
    def setUpClass(cls):
        cls.data_decoded = cls.data_str.decode('utf-8')
        cls.data_decoded_split = cls.data_decoded.split('\n')
        cls.data_decoded_split[0] += '\n'

    def setUp(self):
        self.document = CharacterTrackingStreamWrapper(BytesIO(self.data_str),
                                                       'utf-8')

    def test_basic_read(self):
        read_data = self.document.read()
        self.assertEqual(read_data, self.data_decoded)
        self.assertEqual(len(read_data), self.document.character_position)

    def test_readline(self):
        read_data = self.document.readline()
        self.assertEqual(read_data, self.data_decoded_split[0])
        self.assertEqual(len(read_data), self.document.character_position)

    def test_readlines(self):
        self.assertEqual(self.document.readlines(), self.data_decoded_split)
        self.assertEqual(len(self.data_decoded),
                         self.document.character_position)

    def test_iteration(self):
        self.assertEqual([line for line in self.document],
                         self.data_decoded_split)
        self.assertEqual(len(self.data_decoded),
                         self.document.character_position)

    def test_seek_to_ends(self):
        self.document.seek(0, SEEK_END)
        self.assertEqual(len(self.data_decoded),
                         self.document.character_position)

        # Test seeking to start
        self.document.seek(0)
        self.assertEqual(0, self.document.character_position)

        # Test seeking to end from the middle
        self.document.read(41)
        self.document.seek(0, SEEK_END)
        self.assertEqual(len(self.data_decoded),
                         self.document.character_position)

    def test_seek_to_saved(self):
        self.document.read(41)
        self.assertEqual(41, self.document.character_position)

        # Try seeking repeatedly in case seeking back & forth messes things up.
        for _ in range(2):
            saved = self.document.tell()
            self.assertEqual(self.document.readline(), u'a month,\n')
            self.assertEqual(50, self.document.character_position)
            self.document.seek(saved)
            self.assertEqual(41, self.document.character_position)
            self.assertEqual(saved, self.document.tell())

        # Now do the same for a range that includes a Unicode character.
        self.document.readline()  # advance to the end of the first line again
        # Read up to the start of 'aside', which begins at character 147 of
        # the decoded text.
        read_data = self.document.read(97)
        self.assertEqual(self.data_decoded_split[1][:97], read_data)
        self.assertEqual(147, self.document.character_position)
        for _ in range(2):
            saved = self.document.tell()
            self.assertEqual(self.document.readline(), u'aside.')
            self.assertEqual(len(self.data_decoded),
                             self.document.character_position)
            self.document.seek(saved)
            self.assertEqual(147, self.document.character_position)
            self.assertEqual(saved, self.document.tell())
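
These tests pin down the interface CharacterTrackingStreamWrapper must provide: decoded reads, a character_position counter, and tell()/seek() that restore both the stream position and the character count. The sketch below would satisfy them; it is an illustration built on io.TextIOWrapper, not the project's actual implementation, and in particular the choice to trade (cookie, char_count) tuples between tell() and seek() is only an assumption:

import io
from io import SEEK_END

class CharacterTrackingStreamWrapper(object):
    '''Sketch only: decodes a binary stream and counts characters read.'''
    def __init__(self, stream, codec):
        # newline='' disables newline translation so counts stay exact.
        self._reader = io.TextIOWrapper(stream, encoding=codec, newline='')
        self.name = getattr(stream, 'name', None)
        self.character_position = 0

    def read(self, size=-1):
        data = self._reader.read(size)
        self.character_position += len(data)
        return data

    def readline(self):
        line = self._reader.readline()
        self.character_position += len(line)
        return line

    def readlines(self):
        return [line for line in self]

    def __iter__(self):
        return self

    def next(self):  # Python 2 iterator protocol
        line = self.readline()
        if not line:
            raise StopIteration
        return line
    __next__ = next  # Python 3 spelling

    def tell(self):
        # Pair the text stream's opaque position cookie with the character
        # count so that seek() can restore both.
        return (self._reader.tell(), self.character_position)

    def seek(self, pos, whence=0):
        if whence == SEEK_END:
            while self.read(8192):  # count remaining characters to the end
                pass
        elif isinstance(pos, tuple):  # a position saved by tell()
            cookie, chars = pos
            self._reader.seek(cookie)
            self.character_position = chars
        else:  # this sketch only supports rewinding to the start
            assert pos == 0 and whence == 0
            self._reader.seek(0)
            self.character_position = 0

    def close(self):
        self._reader.close()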
Example #4
class StanfordParsedSentenceReader(DocumentReader):
    '''
    Reads a single text document, along with pre-parsed Stanford parser output
    for that file. Returns one SentencesDocument of StanfordParsedSentences per
    file.
    '''
    def __init__(self, filepath=None, sentence_class=StanfordParsedSentence):
        if not issubclass(sentence_class, StanfordParsedSentence):
            raise TypeError("StanfordParsedSentenceReader can only parse to"
                            " subclasses of StanfordParsedSentence")
        self._parse_file = None
        self.sentence_class = sentence_class
        super(StanfordParsedSentenceReader, self).__init__(filepath)

    def open(self, filepath):
        super(StanfordParsedSentenceReader, self).open(filepath)
        base_path, _ = os.path.splitext(filepath)
        parse_file_name = base_path + '.parse'
        if FLAGS.reader_gold_parses:
            non_gold_file_name = parse_file_name
            parse_file_name += ".gold"
        try:
            self._parse_file = CharacterTrackingStreamWrapper(
                io.open(parse_file_name, 'rb'), FLAGS.reader_codec)
        except IOError:
            if FLAGS.reader_gold_parses and FLAGS.gold_parses_fallback:
                logging.info("Falling back to non-gold parse for %s", filepath)
                self._parse_file = CharacterTrackingStreamWrapper(
                    io.open(non_gold_file_name, 'rb'), FLAGS.reader_codec)
            else:
                raise

    def close(self):
        super(StanfordParsedSentenceReader, self).close()
        if self._parse_file:
            self._parse_file.close()

    def get_next(self):
        sentences = []
        while True:
            next_sentence = self.get_next_sentence()
            if next_sentence is None:  # end of file
                break
            sentences.append(next_sentence)

        if sentences:  # There were some sentences in the file
            return SentencesDocument(self._file_stream.name, sentences)
        else:
            return None

    def get_next_sentence(self):
        if not self._parse_file:
            return None
        # Read the next sentence's tokens and lemmas from the parse file,
        # each followed by a blank line.
        tokenized = self._parse_file.readline()
        if not tokenized:  # empty string means we've hit the end of the file
            return None
        tokenized = tokenized.strip()
        tmp = self._parse_file.readline()
        assert not tmp.strip(), (
            'Invalid parse file: expected blank line after tokens: %s' %
            tokenized).encode('ascii', 'replace')

        lemmas = self._parse_file.readline()
        lemmas = lemmas.strip()
        assert lemmas, (
            'Invalid parse file: expected lemmas line after tokens: %s' %
            tokenized).encode('ascii', 'replace')
        tmp = self._parse_file.readline()
        assert not tmp.strip(), (
            'Invalid parse file: expected blank line after lemmas: %s' %
            lemmas).encode('ascii', 'replace')

        # If the sentence was unparsed, don't return a new sentence object for
        # it, but do advance the stream past the unparsed words.
        # NOTE: This relies on the printWordsForUnparsed flag we introduced to
        # the Stanford parser.
        if lemmas == '(())':
            self.__skip_tokens(tokenized, 'Ignoring unparsed sentence')
            return self.get_next_sentence()

        # Process the constituency parse, if present.
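        # peek_and_revert_unless presumably reads the next character and,
        # because the predicate here is always False, immediately reverts
        # the read; element [0] of its result is the peeked character.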
        if peek_and_revert_unless(self._parse_file, lambda x: False)[0] == '(':
            constituency_parse, double_newline_found = read_stream_until(
                self._parse_file, '\n\n')
            assert double_newline_found, (
                'Invalid parse file: expected blank line after constituency parse: %s'
                % constituency_parse).encode('ascii', 'replace')
        else:
            constituency_parse = None

        parse_lines = []
        tmp = self._parse_file.readline().strip()
        if not tmp:
            self.__skip_tokens(
                tokenized, 'Skipping sentence with empty dependency parse')
            return self.get_next_sentence()
        while tmp:
            parse_lines.append(tmp)
            tmp = self._parse_file.readline().strip()

        # Leaves file in the state where the final blank line after the edges
        # has been read. This also means that if there's a blank line at the end
        # of a file, it won't make us think there's another entry coming.

        # Now create the sentence from the read data + the text file.
        sentence = self.sentence_class(tokenized, lemmas, constituency_parse,
                                       parse_lines, self._file_stream)
        expected_length = (self._file_stream.character_position
                           - sentence.document_char_offset)
        assert len(sentence.original_text) == expected_length, (
            'Sentence length != offset difference: %s'
            % sentence.original_text).encode('ascii', 'replace')
        return sentence

    def __skip_tokens(self, tokenized, message):
        logging.info('%s: %s', message, tokenized)
        for token in tokenized.split():
            unescaped = StanfordParsedSentence.unescape_token_text(token)
            _, found_token = read_stream_until(self._parse_file, unescaped,
                                               False, False)
            assert found_token, ('Skipped token not found: %s' %
                                 unescaped).encode('ascii', 'replace')
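
Taken together, get_next_sentence expects each sentence in the .parse file as: one line of tokens, a blank line, one line of lemmas, a blank line, an optional constituency parse terminated by a blank line, and finally one dependency edge per line followed by a blank line. A hypothetical usage sketch follows; the corpus path is made up, and the sentences attribute of SentencesDocument is an assumption based on how the document is constructed above:

import logging

reader = StanfordParsedSentenceReader('corpus/article01.txt')  # hypothetical path
document = reader.get_next()  # one SentencesDocument per file
if document is not None:
    for sentence in document.sentences:  # attribute name is an assumption
        logging.info('Read sentence: %s', sentence.original_text)
reader.close()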