Esempio n. 1
0
 def derivation_with_index(self, filename, index=None):
     """Parse the derivation(s) stored in ``filename`` with ``self.parse_file``.

     When ``index`` is truthy, only the lines that ``nth_occurrence`` selects
     for ``N=index`` are joined and parsed; both the ``when`` and ``until``
     predicates match a line starting with ``(``.  When ``index`` is falsy
     (``None`` or ``0``) the whole file content is parsed instead.

     Returns whatever ``self.parse_file`` returns.
     """
     def opens_bracket(line):
         # True-ish for a line whose first character is '('.
         return re.match(r"^\(", line)

     with open(filename, 'r') as handle:
         if index:
             # Iterate the file object itself: ``xreadlines()`` is deprecated
             # and is not available on Python 3 file objects.
             selected = nth_occurrence(handle,
                                       N=index,
                                       when=opens_bracket,
                                       until=opens_bracket)
             return self.parse_file(''.join(selected))
         else:
             return self.parse_file(handle.read())
Esempio n. 2
0
 def derivation_with_index(self, filename, index=None):
     """Read ``filename`` and hand its derivation text to ``self.parse_file``.

     With a truthy ``index`` the text is restricted to the lines returned by
     ``nth_occurrence`` for ``N=index``, where a line starting with ``(``
     serves as both the ``when`` and the ``until`` condition.  With a falsy
     ``index`` (``None`` or ``0``) the complete file is parsed.

     Returns the result of ``self.parse_file``.
     """
     with open(filename, 'r') as handle:
         if not index:
             return self.parse_file(handle.read())

         # The file object is its own line iterator; the deprecated
         # ``xreadlines()`` method does not exist on Python 3 file objects.
         chunk = nth_occurrence(handle,
                                N=index,
                                when=lambda line: re.match(r"^\(", line),
                                until=lambda line: re.match(r"^\(", line))
         return self.parse_file(''.join(chunk))
Esempio n. 3
0
 def derivation_with_index(self, filename, index=None):
     """Return an iterator over right-stripped lines of ``filename``.

     The file is opened and kept on ``self.file``; it is NOT closed here
     because the returned iterator reads from it lazily.

     When ``index`` is truthy, the lines are narrowed with ``nth_occurrence``
     (``N=1``) to the span that starts at the line matching
     ``ID=wsj_<sec_no><doc_no>.<index> `` (section and document numbers are
     zero-padded to two digits) and is delimited by a line starting with
     ``ID``.  When ``index`` is falsy (``None`` or ``0``) every line of the
     file is yielded.
     """
     self.file = open(filename, 'r')

     # Lazy equivalent of imap(rstrip, file.xreadlines()); iterating the file
     # object directly avoids the deprecated ``xreadlines()`` method.
     base = (line.rstrip() for line in self.file)
     if index:
         # put a space after the pattern to ensure we match the whole token.
         # The dot is escaped so that it only matches the literal '.' that
         # separates the document number from the derivation index.
         header = re.compile(r"^ID=wsj_%02d%02d\.%d " % (self.sec_no, self.doc_no, index))
         lines = nth_occurrence(base,
                                N=1,
                                when=header.match,
                                until=lambda line: re.match(r"^ID", line))
         return iter(lines)
     else:
         return base
Esempio n. 4
0
    def derivation_with_index(self, filename, index=None):
        """Open ``filename`` and return a lazy iterator of its stripped lines.

        The open handle is stored on ``self.file`` and left open, since the
        iterator returned from this method consumes it on demand.

        If ``index`` is truthy, ``nth_occurrence`` (with ``N=1``) restricts
        the lines to the span beginning at the header line
        ``ID=wsj_<sec_no><doc_no>.<index> `` (both numbers zero-padded to two
        digits) and bounded by a line starting with ``ID``.  If ``index`` is
        falsy (``None`` or ``0``), all lines of the file are returned.
        """
        self.file = open(filename, "r")

        # Generator over the file object: same laziness as
        # imap(..., xreadlines()) without the deprecated ``xreadlines()``.
        base = (raw.rstrip() for raw in self.file)
        if not index:
            return base

        # put a space after the pattern to ensure we match the whole token;
        # the '.' is escaped so it cannot match an arbitrary character.
        wanted = re.compile(r"^ID=wsj_%02d%02d\.%d " % (self.sec_no, self.doc_no, index))
        lines = nth_occurrence(
            base,
            N=1,
            when=wanted.match,
            until=lambda line: re.match(r"^ID", line),
        )
        return iter(lines)
Esempio n. 5
0
 def derivation_with_index(self, filename, i=None):
     """Parse the ``s`` content collected from ``filename``.

     A fresh ``SGMLBag`` is stored on ``self.contents`` and fed either the
     whole file (``i`` falsy, i.e. ``None`` or ``0``) or only the lines that
     ``nth_occurrence`` selects for ``N=i`` between a line starting with
     ``<S`` and a line starting with ``</S``.

     Returns the result of ``parse_tree`` (using ``AugmentedPennParser``)
     applied to the newline-joined ``self.contents['s']``, or to the empty
     string when that entry is ``None``.
     """
     self.contents = SGMLBag()
     with open(filename, 'r') as handle:
         if i:
             # Iterate the file object directly; ``xreadlines()`` is
             # deprecated and missing from Python 3 file objects.
             text = ''.join(nth_occurrence(handle,
                                           N=i,
                                           when=lambda line: re.match(r'^<S', line),
                                           until=lambda line: re.match(r'^</S', line)))
         else:
             text = handle.read()

         self.contents.feed(text)

     # HACK HACK HACK:
     # Sometimes <S>...</S> encloses more than one root (3:7 has some);
     # in which case, counting <S> will undercount the number of sentences
     if self.contents['s'] is None:
         return parse_tree('', AugmentedPennParser)

     return parse_tree('\n'.join(self.contents['s']), AugmentedPennParser)
Esempio n. 6
0
    def derivation_with_index(self, filename, i=None):
        """Parse the headline section of ``filename``.

        ``nth_occurrence`` (``N=1``) extracts the lines from the one starting
        with ``<HEADLINE`` up to the one starting with ``</HEADLINE``.  The
        opening line is dropped; then either element ``i`` of the remaining
        lines (``i`` truthy) or all of them joined by newlines (``i`` falsy,
        which includes ``0``) is fed to a fresh ``SGMLBag`` kept on
        ``self.contents``.

        Returns ``None`` when no headline lines were found, otherwise the
        result of ``parse_tree`` (using ``AugmentedPennParser``) on the
        newline-joined ``self.contents['s']``.

        Raises ``CPTBParseException`` if the first extracted line does not
        start with ``<HEADLINE``.
        """
        self.contents = SGMLBag()

        opens_headline = lambda line: re.match(r'^<HEADLINE', line)
        closes_headline = lambda line: re.match(r'^</HEADLINE', line)

        with open(filename, 'r') as source:
            found = nth_occurrence(source, N=1,
                                   when=opens_headline,
                                   until=closes_headline)
            if not found:
                return None

            first = found[0]
            if not first.startswith('<HEADLINE'):
                raise CPTBParseException('Expected to find a <HEADLINE> line.')

            # Everything after the opening <HEADLINE> line.
            body = found[1:]
            text = ''.join(body[i]) if i else '\n'.join(body)

            self.contents.feed(text)

        return parse_tree('\n'.join(self.contents['s']), AugmentedPennParser)
Esempio n. 7
0
    def derivation_with_index(self, filename, i=None):
        """Feed ``filename`` into a new ``SGMLBag`` and parse its ``s`` entry.

        ``self.contents`` is replaced by a fresh ``SGMLBag``.  If ``i`` is
        truthy, only the lines chosen by ``nth_occurrence`` for ``N=i``
        (from a line starting with ``<S`` to a line starting with ``</S``)
        are fed; if ``i`` is falsy (``None`` or ``0``) the whole file is fed.

        Returns ``parse_tree`` (with ``AugmentedPennParser``) of the
        newline-joined ``self.contents['s']``, or of ``''`` when that entry
        is ``None``.
        """
        self.contents = SGMLBag()
        with open(filename, 'r') as handle:
            if i:
                # The file object is already a line iterator; the deprecated
                # ``xreadlines()`` does not exist on Python 3 file objects.
                text = ''.join(
                    nth_occurrence(handle,
                                   N=i,
                                   when=lambda line: re.match(r'^<S', line),
                                   until=lambda line: re.match(r'^</S', line)))
            else:
                text = handle.read()

            self.contents.feed(text)

        # HACK HACK HACK:
        # Sometimes <S>...</S> encloses more than one root (3:7 has some);
        # in which case, counting <S> will undercount the number of sentences
        if self.contents['s'] is None:
            return parse_tree('', AugmentedPennParser)

        return parse_tree('\n'.join(self.contents['s']), AugmentedPennParser)
Esempio n. 8
0
    def derivation_with_index(self, filename, i=None):
        """Extract and parse the ``<HEADLINE>`` block of ``filename``.

        The block is located with ``nth_occurrence`` (``N=1``), starting at a
        line beginning with ``<HEADLINE`` and ending at one beginning with
        ``</HEADLINE``.  After discarding the opening line, a truthy ``i``
        selects the single line at position ``i`` of what remains, while a
        falsy ``i`` (``None`` or ``0``) takes all remaining lines joined by
        newlines.  The chosen text is fed to a new ``SGMLBag`` stored on
        ``self.contents``.

        Returns ``None`` if ``nth_occurrence`` produced nothing; otherwise the
        ``parse_tree`` result (``AugmentedPennParser``) for the newline-joined
        ``self.contents['s']``.

        Raises ``CPTBParseException`` when the first line found does not
        start with ``<HEADLINE``.
        """
        self.contents = SGMLBag()

        def headline_start(line):
            return re.match(r'^<HEADLINE', line)

        def headline_end(line):
            return re.match(r'^</HEADLINE', line)

        with open(filename, 'r') as stream:
            section = nth_occurrence(
                stream,
                N=1,
                when=headline_start,
                until=headline_end)
            if not section:
                return None

            if not section[0].startswith('<HEADLINE'):
                raise CPTBParseException('Expected to find a <HEADLINE> line.')

            remainder = section[1:]  # without the opening <HEADLINE> line
            if i:
                chosen = ''.join(remainder[i])
            else:
                chosen = '\n'.join(remainder)

            self.contents.feed(chosen)

        return parse_tree('\n'.join(self.contents['s']), AugmentedPennParser)