Example no. 1
0
 def read_sentence(self, filename):
     """Yield Sentence objects parsed from *filename*.

     Tokens are accumulated one per non-blank line; a blank line ends the
     current sentence.  A sentence is emitted only when
     ``len(sentence) != 1`` (presumably a fresh Sentence already holds a
     single root token, so length 1 means "empty" -- TODO confirm).

     :param filename: path to a token-per-line sentence file
     :yields: Sentence instances, one per blank-line-delimited group
     """
     sentence = Sentence()
     # Use a context manager so the file is closed deterministically
     # (the original left the handle open until GC).
     with open(filename) as f:
         for line in f:
             line = line.strip()
             if line:
                 sentence.add_token(line)
             elif len(sentence) != 1:
                 yield sentence
                 sentence = Sentence()
     # Bug fix: also emit the trailing sentence when the file does not
     # end with a blank line; the original silently dropped it.
     if len(sentence) != 1:
         yield sentence
Example no. 2
0
    def __get_sentence(self, filename):
        """Read sentences from a treebank file and yield them as Sentence instances.

        Each line is parsed with ``__parse_line``; a line that parses to a
        falsy token marks a sentence boundary.

        NOTE(review): the SAME Sentence object is yielded every time and then
        cleared in place -- consumers must not hold a reference past the next
        iteration; verify this aliasing is intended.
        NOTE(review): a final sentence that is not followed by a boundary line
        is never yielded -- confirm input files always end with one.
        """

        sentence = Sentence()

        with open(filename) as f:
            for line in f:
                token = self.__parse_line(line)
                if token:
                    sentence.add_token(token)
                else:
                    # Boundary line: hand the accumulated sentence to the
                    # caller, then reuse the same instance for the next one.
                    yield sentence
                    sentence.clear()
Example no. 3
0
class CoreNLPTarget(object):
    """Event-driven parser target for Stanford CoreNLP XML output.

    The ``start``/``data``/``end``/``close`` methods match the lxml/ElementTree
    target-parser interface (presumably fed to ``lxml.etree.XMLParser(target=...)``
    -- confirm against the caller).  As XML events arrive, it accumulates
    Sentence objects (with Token and Dependency children) and Coreference
    objects (with Mention children); ``close`` returns both lists and resets
    the accumulators.
    """

    def __init__(self):
        # Accumulated results, returned by close().
        self.sents = []
        self.corefs = []
        # Sentence / Coreference currently being built (None between elements).
        self.sent = None
        self.coref = None
        # Name of the most recently opened tag; cleared on every end().
        self.tag = ''
        # Per-token text buffers, filled by data() and flushed on </token>.
        self.word = ''
        self.lemma = ''
        self.pos = ''
        self.ner = ''
        # Per-dependency state, flushed on </dep>.
        self.dep_label = ''
        self.gov_idx = -1
        self.dep_idx = -1
        self.extra = False
        # Per-mention state, flushed on </mention>.
        self.sent_idx = -1
        self.start_token_idx = -1
        self.end_token_idx = -1
        self.head_token_idx = -1
        self.rep = False
        self.text = ''
        # Mode flags tracking which section of the document we are inside.
        self.parse_sent = False
        self.parse_dep = False
        self.parse_coref = False
        # True while inside a <dep> whose governor/dependent carries a
        # 'copy' attribute; such copied dependencies are skipped in end().
        self.copied_dep = False

    def start(self, tag, attrib):
        """Handle an element open event: flip mode flags and read attributes."""
        self.tag = tag
        if tag == 'sentences':
            self.parse_sent = True
        elif tag == 'sentence':
            if self.parse_sent:
                # CoreNLP ids are 1-based; internal indices are 0-based.
                self.sent = Sentence(int(attrib['id']) - 1)
        elif tag == 'dependencies':
            # Only parse the one dependency representation the project uses.
            if attrib['type'] == consts.corenlp_dependency_type \
                    and self.parse_sent:
                self.parse_dep = True
                self.copied_dep = False
        elif tag == 'dep':
            if self.parse_dep:
                self.dep_label = attrib['type']
                if 'extra' in attrib:
                    self.extra = True
        elif tag == 'governor':
            if self.parse_dep:
                self.gov_idx = int(attrib['idx']) - 1
                if 'copy' in attrib:
                    self.copied_dep = True
        elif tag == 'dependent':
            if self.parse_dep:
                self.dep_idx = int(attrib['idx']) - 1
                if 'copy' in attrib:
                    self.copied_dep = True
        elif tag == 'coreference':
            # NOTE: CoreNLP nests <coreference> elements; the first one seen
            # switches on coref mode, each one starts a new chain.
            if not self.parse_coref:
                self.parse_coref = True
            self.coref = Coreference(len(self.corefs))
        elif tag == 'mention':
            if self.parse_coref:
                if 'representative' in attrib:
                    self.rep = True

    def data(self, data):
        """Handle a text event: append stripped text to the buffer selected by self.tag."""
        data = data.strip()
        if data != '':
            if self.parse_sent:
                # Inside <sentences>: token sub-fields.
                if self.tag == 'word':
                    self.word += data
                elif self.tag == 'lemma':
                    self.lemma += data
                elif self.tag == 'POS':
                    self.pos += data
                elif self.tag == 'NER':
                    self.ner += data
            elif self.parse_coref:
                # Inside <coreference>: mention sub-fields (1-based -> 0-based).
                if self.tag == 'sentence':
                    self.sent_idx = int(data) - 1
                elif self.tag == 'start':
                    self.start_token_idx = int(data) - 1
                elif self.tag == 'end':
                    self.end_token_idx = int(data) - 1
                elif self.tag == 'head':
                    self.head_token_idx = int(data) - 1
                elif self.tag == 'text':
                    self.text += data

    def end(self, tag):
        """Handle an element close event: flush buffered state into result objects."""
        self.tag = ''
        if tag == 'sentences':
            if self.parse_sent:
                self.parse_sent = False
        elif tag == 'sentence':
            if self.parse_sent:
                if self.sent is not None:
                    self.sents.append(deepcopy(self.sent))
                    self.sent = None
        elif tag == 'token':
            # map corenlp ner tags to coerse grained ner tags
            # NOTE(review): this branch runs unconditionally -- a </token>
            # outside a sentence would hit self.sent = None; confirm the
            # input never produces that.
            token = Token(self.word,
                          self.lemma,
                          self.pos,
                          ner=convert_corenlp_ner_tag(self.ner))
            self.sent.add_token(deepcopy(token))
            self.word = ''
            self.lemma = ''
            self.pos = ''
            self.ner = ''
        elif tag == 'dependencies':
            if self.parse_dep:
                self.parse_dep = False
        elif tag == 'dep':
            if self.parse_dep:
                if not self.copied_dep:
                    # Skip the artificial 'root' relation; keep the rest.
                    if self.dep_label != 'root':
                        dep = Dependency(self.dep_label, self.gov_idx,
                                         self.dep_idx, self.extra)
                        self.sent.add_dep(deepcopy(dep))
                else:
                    # Copied dependencies are dropped entirely.
                    self.copied_dep = False
                self.dep_label = ''
                self.gov_idx = -1
                self.dep_idx = -1
                self.extra = False
        elif tag == 'coreference':
            if self.parse_coref:
                if self.coref is not None:
                    # Inner </coreference>: finish the current chain.
                    self.corefs.append(deepcopy(self.coref))
                    self.coref = None
                else:
                    # Outer </coreference>: leave coref mode.
                    self.parse_coref = False
        elif tag == 'mention':
            # NOTE(review): on Python 3, encode() yields bytes, so
            # Mention.text would be bytes rather than str -- confirm intended.
            mention = Mention(self.sent_idx,
                              self.start_token_idx,
                              self.end_token_idx,
                              head_token_idx=self.head_token_idx,
                              rep=self.rep,
                              text=self.text.encode('ascii', 'ignore'))
            self.coref.add_mention(deepcopy(mention))
            self.sent_idx = -1
            self.start_token_idx = -1
            self.end_token_idx = -1
            self.head_token_idx = -1
            self.rep = False
            self.text = ''

    def close(self):
        """Return (sentences, coreferences) accumulated so far and reset both lists."""
        sents, self.sents = self.sents, []
        corefs, self.corefs = self.corefs, []
        return sents, corefs