else:
            pos_tag_str = line_parts[1].split(",")
            pos_tag_types = map(lambda x: x.split(".")[1], pos_tag_str)
            return set(map(lambda x: PosTag[x], pos_tag_types))

    def progress(self):
        """Report how far parsing has advanced, delegating to the
        shared line-count-based progress helper."""
        completed = self._line_count_progress()
        return completed
 
    def _create_tokens(self, word, pos_tags, punctuation):
        """Build the token list for *word*: a WordToken carrying
        *pos_tags*, optionally followed by a PunctuationToken when
        *punctuation* is the string 'PERIOD' or 'COMMA'."""
        token = WordToken(word)
        token.set_pos_tags(pos_tags)
        tokens = [token]

        # Only these two punctuation markers produce an extra token;
        # anything else yields just the word token.
        if punctuation == 'PERIOD':
            tokens.append(PunctuationToken(punctuation, Punctuation.PERIOD))
        elif punctuation == 'COMMA':
            tokens.append(PunctuationToken(punctuation, Punctuation.COMMA))

        return tokens



################
# Example call #
################

if __name__ == '__main__':
    # Script entry point: hand this module's LineParser class to the
    # shared command-line helper, which parses arguments and runs it.
    parse_command_line_arguments(LineParser)
            last_end = token.begin + token.duration
            last_token = token

        return audio

    def _extract_talk_id(self, line):
        line = line[2:]
        line_parts = line.split("talkid")
        relevant = line_parts[1]

        talkid = "0"
        for i in range(0, len(relevant)):
            if relevant[i].isdigit():
                talkid += relevant[i]
            else:
                break

        return int(talkid)

    def progress(self):
        """Current parsing progress, computed from the count of lines
        consumed so far (delegates to the shared helper)."""
        current = self._line_count_progress()
        return current


################
# Example call #
################

if __name__ == "__main__":
    parse_command_line_arguments(CtmParser)
Exemple #3
0
        text = Text()

        with open(self.filename, "r") as file_:
            for line_unenc in file_:
                self._progress += 1
                line = unicode(line_unenc.encode('utf8'))
                if line.startswith(TEXT_SEPARATOR):
                    if (len(text.sentences) > 0):
                        yield text
                        text = Text()
                        continue
                sentences = self.nlp_pipeline.sentence_segmentation(line)
                for sentence in sentences:
                    s = Sentence()
                    s.set_sentence_text(sentence)
                    s.set_tokens(self.nlp_pipeline.parse_text(sentence))
                    text.add_sentence(s)
        if (len(text.sentences) > 0):
            yield text

    def progress(self):
        """Return parsing progress as measured by lines processed;
        forwards to the line-count progress helper."""
        fraction_done = self._line_count_progress()
        return fraction_done


################
# Example call #
################

if __name__ == '__main__':
    # Script entry point: hand this module's PlaintextParser class to
    # the shared command-line helper, which parses arguments and runs it.
    parse_command_line_arguments(PlaintextParser)
Exemple #4
0
            for sentence in doc.findall("seg"):
                sentence_text = unicode(sentence.text)

                sentence = Sentence()
                sentence.set_sentence_text(sentence_text)
                sentence.set_tokens(
                    self.nlp_pipeline.parse_text(sentence_text))
                talk.add_sentence(sentence)

            yield talk

    def progress(self):
        """Parsing progress indicator; simply forwards the value of the
        shared line-count helper."""
        done = self._line_count_progress()
        return done

    def _count_docs(self):
        mteval = xml.etree.ElementTree.parse(self.filename).getroot()
        srcset = mteval.find("srcset")
        i = 0
        for doc in srcset.findall('doc'):
            i += 1
        return i


################
# Example call #
################

if __name__ == '__main__':
    # Script entry point: hand this module's XMLParser class to the
    # shared command-line helper, which parses arguments and runs it.
    parse_command_line_arguments(XMLParser)
Exemple #5
0
            last_end = token.begin + token.duration
            last_token = token

        return audio

    def _extract_talk_id(self, line):
        line = line[2:]
        line_parts = line.split("talkid")
        relevant = line_parts[1]

        talkid = "0"
        for i in range(0, len(relevant)):
            if relevant[i].isdigit():
                talkid += relevant[i]
            else:
                break

        return int(talkid)

    def progress(self):
        """How far parsing has progressed, in terms of lines read;
        delegates to the shared line-count helper."""
        lines_done = self._line_count_progress()
        return lines_done


################
# Example call #
################

if __name__ == '__main__':
    # Script entry point: hand this module's CtmParser class to the
    # shared command-line helper, which parses arguments and runs it.
    parse_command_line_arguments(CtmParser)
        text = Text()

        with open(self.filename, "r") as file_:
            for line_unenc in file_:
                self._progress += 1
                line = unicode(line_unenc.encode('utf8'))
                if line.startswith(TEXT_SEPARATOR):
                    if (len(text.sentences) > 0):
                        yield text
                        text = Text()
                        continue
                sentences = self.nlp_pipeline.sentence_segmentation(line)
                for sentence in sentences:
                    s = Sentence()
                    s.set_sentence_text(sentence)
                    s.set_tokens(self.nlp_pipeline.parse_text(sentence))
                    text.add_sentence(s)
        if (len(text.sentences) > 0):
            yield text

    def progress(self):
        """Progress of this parser, based on the number of input lines
        consumed (forwards to the shared helper)."""
        value = self._line_count_progress()
        return value


################
# Example call #
################

if __name__ == '__main__':
    # Script entry point: hand this module's PlaintextParser class to
    # the shared command-line helper, which parses arguments and runs it.
    parse_command_line_arguments(PlaintextParser)