Ejemplo n.º 1
0
    def getPOS(self):
        """Return the part-of-speech tag sequences for every line of raw text.

        The result is memoised on ``self.pos_tags`` and persisted as a pickle
        under ``cache/``, keyed by ``self.file_name`` and ``self.essay_set``.
        When neither the in-memory value nor a readable cache file exists,
        every line yielded by ``self.getRawText()`` is tokenised, tagged with
        the HunPos tagger, and the resulting tags are written to the cache.

        Returns:
            A list with one entry per line of raw text; each entry is the
            list of tags (words dropped) for that line's tokens.

        Raises:
            IOError/OSError: if the cache file cannot be written (for example
                when the ``cache/`` directory does not exist).
        """
        if self.pos_tags:
            return self.pos_tags

        fname = 'cache/pos.%s.set%d.pickle' % (self.file_name, self.essay_set)
        try:
            # NOTE(security): unpickling can execute arbitrary code. This is
            # only safe because the cache file is produced by this method;
            # never point it at files from an untrusted source.
            with open(fname, 'rb') as f:
                cached = pickle.load(f)
        except Exception:
            # Missing, truncated or otherwise unreadable cache: fall through
            # and rebuild it. ``Exception`` (not a bare ``except``) so that
            # KeyboardInterrupt / SystemExit still propagate.
            pass
        else:
            self.pos_tags = cached
            return self.pos_tags

        pos_lines = []
        tot_ln = self.size()
        hunpos = nltk.tag.HunposTagger("en_wsj.model")

        for prog, line in enumerate(self.getRawText(), 1):
            tokens = LanguageUtils.punkt_tokenize(line)
            # The tagger returns (word, tag) pairs; only the tags are kept.
            pos_lines.append([tag for _, tag in hunpos.tag(tokens)])
            if prog % 100 == 0:
                # Parenthesised single argument prints the same text on
                # Python 2 and Python 3.
                print("POS Tagging %d of %d" % (prog, tot_ln))

        self.pos_tags = pos_lines

        # Binary mode must match the 'rb' used when reading the cache back;
        # the context manager guarantees the data is flushed and closed.
        with open(fname, 'wb') as f:
            pickle.dump(self.pos_tags, f)

        return self.pos_tags