Beispiel #1
0
    def parse_plays(self, filename):
        """Parse a play in the plain-text layout of the files found on
        http://sydney.edu.au/engineering/it/~matty/Shakespeare/

        Every line up to and including the first one containing "ACT I" is
        skipped.  After that the file is walked line by line:

        * a blank (empty or whitespace-only) line is only remembered;
        * a non-blank line that directly follows a blank one and is accepted
          by ``self._is_next_character`` is split at its first tab: the part
          before the tab becomes the current character, the part after it is
          handed to ``self._parse_play_line`` with an empty carry-over list;
        * any other non-blank line accepted by ``self._is_text`` (once a
          character is known) is stripped and handed to
          ``self._parse_play_line`` together with the value returned by the
          previous call.

        :param filename: path of the text file to parse.
        :returns: None; results are produced through ``_parse_play_line``.
        """
        temp_char = ""
        remainder = []
        empty_line = False
        start = False

        # Count the lines up front so the progress bar knows its length.
        # Do it inside a ``with`` block so this extra handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(filename, "r") as f:
            line_count = sum(1 for _ in f)
        bar = ProgressBar(length=line_count, name="Parsing " + filename)

        with open(filename, "r") as f:
            for line in f:
                bar.update()

                if not start:
                    # Still in the preamble: wait for the first act heading.
                    if "ACT I" in line:
                        start = True
                    continue

                if not line or line.isspace():
                    empty_line = True
                    continue

                if empty_line and self._is_next_character(line):
                    # NOTE(review): if the line has no tab, find() returns -1
                    # and the slices below drop the last character / keep the
                    # whole line; presumably _is_next_character guarantees a
                    # tab is present -- confirm.
                    tab = line.find('\t')
                    temp_char = line[:tab]
                    remainder = self._parse_play_line(line[tab + 1:], temp_char, [])
                elif temp_char and self._is_text(line):
                    remainder = self._parse_play_line(line.strip(), temp_char, remainder)
                empty_line = False

            bar.done()
Beispiel #2
0
    def source(self, url, **kwargs):
        """Download ``url`` and pass each line of its text to ``_parse_line``.

        The document is parsed with BeautifulSoup, reduced to plain text with
        ``get_text()`` and split into lines; a progress bar is advanced once
        per line.

        :param url: address handed to ``request.urlopen``.
        :param kwargs: accepted but not used by this method.
        :returns: None.
        """
        # Close the response as soon as the markup has been consumed instead
        # of leaking the connection.  BeautifulSoup reads a file-like object
        # inside its constructor, so nothing touches the response afterwards.
        # NOTE(review): assumes ``request`` is ``urllib.request`` (whose
        # responses are context managers) -- confirm against the imports.
        with request.urlopen(url) as response:
            soup = BeautifulSoup(response)

        lines = soup.get_text().splitlines()
        bar = ProgressBar(length=len(lines), name="Parsing "+url)

        for line in lines:
            bar.update()
            self._parse_line(line)

        bar.done()
Beispiel #3
0
    def parse(self, inputfile, source, **kwargs):
        """Load ``inputfile`` into the parser and store its sentences.

        ``self.parser.source`` is called with ``inputfile`` and ``kwargs``.
        If it raises ``TypeError`` the method gives up and returns ``False``.
        Otherwise every sentence yielded by ``self.parser.get_next()`` gets
        the backend's start and end markers added around its text and is
        written to the backend under ``source`` and the sentence's ``char``.

        :returns: ``False`` when the parser raised ``TypeError``; ``None``
            after a completed run.
        """
        try:
            self.parser.source(inputfile, **kwargs)
        except TypeError:
            return False

        label = "Processing " + inputfile
        progress = ProgressBar(name=label, length=len(self.parser))

        for sentence in self.parser.get_next():
            tokens = sentence.text
            # Wrap the sentence in the backend's boundary markers in place.
            tokens.insert(0, self.backend.SENTENCE_START)
            tokens.append(self.backend.SENTENCE_END)

            self.backend.put(tokens, source, sentence.char)
            progress.update()

        progress.done()