def from_doc(cls, doc):
    """Build an instance of ``cls`` from a parsed document.

    Takes ``cls`` as its first argument, so it is presumably decorated as a
    classmethod outside this view.

    One event is created for every (subject, object) combination of each
    eligible verb token; one entity is created for every coreference chain
    in ``doc.corefs``.

    Args:
        doc: a ``document.Document`` providing ``sents``, ``corefs`` and
            ``doc_name``.

    Returns:
        ``cls(doc.doc_name, entities, events)``.

    Raises:
        ParseScriptError: if ``doc`` is not a ``document.Document``.
    """
    if not isinstance(doc, document.Document):
        expected_name = get_class_name(document.Document)
        raise ParseScriptError(
            'from_doc must be called with a {} instance'.format(
                expected_name))

    events = []
    for sent in doc.sents:
        for pred_token in sent.tokens:
            # Only verbs (POS tag starting with "VB") other than "be" are
            # considered as predicates.
            if (not pred_token.pos.startswith('VB')
                    or pred_token.lemma == 'be'):
                continue

            # NOTE: stop verbs are deliberately kept here. Detecting one
            # needs both the negation and the particle, so that filtering
            # is left to the construction of RichScript.
            # TODO: exclude verbs in quotes

            pred_idx = pred_token.token_idx

            # Skip modifying verbs, i.e. those with an 'xcomp' entry in the
            # 'gov' direction of the dependency graph.
            if sent.dep_graph.lookup_label('gov', pred_idx, 'xcomp'):
                continue

            # The predicate is negated when it has any 'neg' entry.
            neg = bool(sent.dep_graph.lookup_label('gov', pred_idx, 'neg'))

            subj_list = sent.get_subj_list(pred_idx)
            obj_list = sent.get_obj_list(pred_idx)
            pobj_list = sent.get_pobj_list(pred_idx)

            # A verb with neither a subject nor an object yields no event.
            if not (subj_list or obj_list):
                continue

            # Pad the missing side with a single None so that the cartesian
            # product below still produces one pairing per present argument.
            if not subj_list:
                subj_list.append(None)
            if not obj_list:
                obj_list.append(None)

            events.extend(
                Event.from_tokens(pred_token, neg, subj, obj, pobj_list)
                for subj, obj in product(subj_list, obj_list))

    if not events:
        warn('doc {} has no events'.format(doc.doc_name))

    if not doc.corefs:
        warn('doc {} has no corefs'.format(doc.doc_name))

    # One entity per coreference chain.
    entities = [Entity.from_coref(chain) for chain in doc.corefs]

    return cls(doc.doc_name, entities, events)
def from_doc(cls, doc):
    """Build a script (an instance of ``cls``) from a parsed document.

    Takes ``cls`` as its first argument, so it is presumably decorated as a
    classmethod outside this view.

    The script is created from ``doc.doc_name``, then filled with one entity
    per coreference chain and one event per (subject, direct object)
    combination of each eligible verb token. A warning is logged when the
    resulting script has no entities or no events.

    Args:
        doc: document passed through ``check_type(doc, corenlp.Document)``,
            providing ``sents``, ``corefs`` and ``doc_name``.

    Returns:
        The populated script.
    """
    check_type(doc, corenlp.Document)

    script = cls(doc.doc_name)

    # Entities: one per coreference chain.
    for coref in doc.corefs:
        script.add_entity(Entity.from_coref(coref))

    # NOTE(review): if `log` is a standard logging.Logger, `warn` is a
    # deprecated alias of `warning` -- confirm before switching.
    if not script.has_entities():
        log.warn('script {} has no entities'.format(doc.doc_name))

    # Events: scan every token of every sentence for predicates.
    for sent in doc.sents:
        for pred_token in sent.tokens:
            # Only verbs (POS tag starting with "VB") other than "be" are
            # considered as predicates.
            if (not pred_token.pos.startswith('VB')
                    or pred_token.lemma == 'be'):
                continue

            pred_idx = pred_token.token_idx

            # Skip modifying verbs, i.e. those with an 'xcomp' entry in the
            # 'head' direction of the dependency graph.
            if sent.dep_graph.lookup_label('head', pred_idx, 'xcomp'):
                continue

            # TODO: exclude verbs in quotes

            # NOTE: stop verbs are deliberately kept here. Detecting one
            # needs both the negation and the particle, so they are removed
            # later, when constructing RichScript.

            # The predicate is negated when it has any 'neg' entry.
            neg = bool(sent.dep_graph.lookup_label('head', pred_idx, 'neg'))

            # Particle: lemma of the first 'compound:prt' token, or '' when
            # there is none. Extra particles are reported and ignored.
            particles = sent.lookup_label('head', pred_idx, 'compound:prt')
            if particles and len(particles) > 1:
                log.warn(
                    'Predicate {} contains {} particles'.format(
                        pred_token.pretty_print(), len(particles)))
            prt = particles[0].lemma if particles else ''

            subj_list = sent.get_subj_list(pred_idx)
            dobj_list = sent.get_dobj_list(pred_idx)
            pobj_list = sent.get_pobj_list(pred_idx)

            # A verb with neither a subject nor a direct object yields no
            # event.
            if not (subj_list or dobj_list):
                continue

            # Pad the missing side with a single None so that the cartesian
            # product below still produces one pairing per present argument.
            if not subj_list:
                subj_list.append(None)
            if not dobj_list:
                dobj_list.append(None)

            for subj, dobj in product(subj_list, dobj_list):
                script.add_event(
                    Event.from_tokens(
                        pred_token, subj, dobj, pobj_list,
                        neg=neg, prt=prt))

    if not script.has_events():
        log.warn('script {} has no events'.format(doc.doc_name))

    return script