def match_t_and_s(elements):
    """Report elements that contain sentences but no text content.

    The direct children of every element are tested with the ``text_content``
    and ``sentence`` predicates. An element is reported when at least one
    child satisfies ``sentence`` while no child satisfies ``text_content``.
    Elements with text content but without sentences are not reported.

    Returns a tuple ``(ok, message)``. ``ok`` is True when no element was
    reported. ``message`` holds one line per reported element (mentioning its
    ``xml:id``), joined by newlines; it is empty when ``ok`` is True.
    """
    problems = []

    for element in elements:
        # Evaluate both predicates on every direct child, in child order.
        flags = [(bool(text_content(node)), bool(sentence(node)))
                 for node in element.children]
        has_text = any(is_text for is_text, _ in flags)
        has_sentence = any(is_sent for _, is_sent in flags)

        # Only sentences without accompanying text count as a mismatch.
        if has_text or not has_sentence:
            continue

        problems.append(
            '<t> and <s> mismatch: {id_}'.format(id_=element.get('xml:id')))

    return not problems, '\n'.join(problems)
def event2es(event_xml, event_order, es, index_name, type_name):
    """Store the first <event> of event_xml in Elasticsearch if it is new.

    The document id is the event's ``xml:id``. When a document with that id
    already exists in the index, nothing is changed.

    Args:
        event_xml: parsed XML containing at least one ``event`` element
            (presumably a BeautifulSoup object; it must offer ``find_all``).
        event_order: value stored in the ``order`` field of the document.
        es: Elasticsearch client offering ``exists``, ``create`` and
            ``indices.analyze`` (presumably elasticsearch-py).
        index_name: name of the index to write to.
        type_name: document type to write.

    Returns:
        None.

    Raises:
        IndexError: if event_xml contains no ``event`` element.
    """
    event = event_xml.find_all('event')[0]
    event_id = event.attrs.get('xml:id')

    # Nothing to do when this event has been indexed before.
    if es.exists(index=index_name, doc_type=type_name, id=event_id):
        return

    play_id = xml_id2play_id(event_id)
    cls = event.attrs.get('class')

    # Collect the text of all sentences that are not part of a note. Some
    # <t> elements are missing or empty (not allowed, but it happens), so
    # only add strings that actually exist.
    parts = []
    for elem in event.descendants:
        if sentence(elem) and not note(elem.parent):
            t_tag = elem.t
            if t_tag and t_tag.string:
                parts.append(t_tag.string)
    text = ' '.join(parts)

    num_words = 0
    text_ascii = text.encode('ascii', 'ignore')
    # prevent an empty string from being sent to the analyzer
    if text_ascii and not text_ascii.isspace():
        tokens = es.indices.analyze(index=index_name, body=text_ascii,
                                    analyzer='standard').get('tokens')
        num_words = len(tokens)

    doc = {
        'event_id': event_id,
        'text_id': play_id,
        'event_class': cls,
        'order': event_order,
        'text': text,
        'num_words': num_words,
    }
    if cls == 'speakerturn':
        doc['actor'] = extract_character_name(event.attrs.get('actor'))

    # Create the document under the same id the existence check above looks
    # for; without an explicit id the check could never find it again.
    # Keyword arguments keep the call independent of the positional order
    # of ``create``.
    es.create(index=index_name, doc_type=type_name, body=doc, id=event_id)
def act2text(act_xml):
    """Extract the sentence texts from an act.

    Prints a progress line ``act: <xml:id>`` for the first element returned
    by ``act_xml.find('div', 'act')`` (``None`` when there is no such
    element).

    Returns a list with the non-empty strings of the ``<t>`` elements of all
    sentences that are not part of a note. The list is empty when act_xml
    does not contain exactly one act according to ``find_all(act)``.
    """
    act_div = act_xml.find('div', 'act')
    # Do not let the progress output crash the extraction when the div is
    # missing.
    act_id = act_div.attrs.get('xml:id') if act_div is not None else None
    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print(u'act: {0}'.format(act_id))

    text = []

    # act_xml should contain exactly one act; if it contains more acts, these
    # acts are sub acts, that will be processed later
    if len(act_xml.find_all(act)) != 1:
        return text

    for elem in act_xml.descendants:
        if sentence(elem) and not note(elem.parent):
            # some t elements appear to be empty (this is not allowed, but
            # it happens). So, check whether there is a string to add
            # before adding it.
            t_tag = elem.t
            if t_tag and t_tag.string:
                text.append(t_tag.string)

    return text