def load_data_and_labels_lemonde(filepathXml):
    """
    Read sentences and their labels from a Le Monde corpus XML file.

    The corpus uses ENAMEX-style inline annotations, for example:

    <sentence id="E14">Les ventes de micro-ordinateurs en <ENAMEX type="Location" sub_type="Country"
        eid="2000000003017382" name="Republic of France">France</ENAMEX> se sont ralenties en 1991. </sentence>

    Args:
        filepathXml: source handed directly to the parser's ``parse()`` call
            (typically the path of the XML file).

    Returns:
        tuple: ``(tokens, labels)`` exactly as produced by the handler's
        ``getSents()`` and ``getAllLabels()`` methods.
        NOTE(review): earlier documentation described both as numpy arrays;
        confirm against ENAMEXContentHandler.
    """
    # Annotations are interleaved with plain text (XML mixed content), so the
    # file is run through an event-driven XML parser and the handler collects
    # the sentences and labels while parsing.
    xml_reader = make_parser()
    enamex_handler = ENAMEXContentHandler()
    xml_reader.setContentHandler(enamex_handler)
    xml_reader.parse(filepathXml)

    return enamex_handler.getSents(), enamex_handler.getAllLabels()
def load_data_and_labels_xml_file(filepathXml):
    """
    Read sentences and their labels from an XML file with inline <rs> tags.

    The expected format is as follows:

    <p>
        bla bla you are a <rs type="insult">CENSURED</rs>,
        and I will <rs type="threat">find and kill</rs> you bla bla
    </p>

    Only the insulting expression itself is labelled and, likewise, only the
    "action" part of a threat is tagged.

    Args:
        filepathXml: source handed directly to the parser's ``parse()`` call
            (typically the path of the XML file).

    Returns:
        tuple: ``(tokens, labels)`` exactly as produced by the handler's
        ``getSents()`` and ``getAllLabels()`` methods.
        NOTE(review): earlier documentation described both as numpy arrays;
        confirm against TEIContentHandler.
    """
    # Annotations are interleaved with plain text (XML mixed content), so the
    # file is run through an event-driven XML parser and the handler collects
    # the sentences and labels while parsing.
    xml_reader = make_parser()
    tei_handler = TEIContentHandler()
    xml_reader.setContentHandler(tei_handler)
    xml_reader.parse(filepathXml)

    return tei_handler.getSents(), tei_handler.getAllLabels()