def to_xml(self, value, param_name):
    wrapper = Element(param_name)
    for _dict in value:
        wrapper.append(self.get_xml_dict(_dict, 'dict'))
    return wrapper
def handle(self, *args, **kwargs):
    params = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')
    with closing(self.server.odb.session()) as session:
        definition_list = Element('definition_list')
        definitions = def_jms_wmq_list(session, params['cluster_id'])

        for definition in definitions:
            definition_elem = Element('definition')
            definition_elem.id = definition.id
            definition_elem.name = definition.name
            definition_elem.host = definition.host
            definition_elem.port = definition.port
            definition_elem.queue_manager = definition.queue_manager
            definition_elem.channel = definition.channel
            definition_elem.cache_open_send_queues = definition.cache_open_send_queues
            definition_elem.cache_open_receive_queues = definition.cache_open_receive_queues
            definition_elem.use_shared_connections = definition.use_shared_connections
            definition_elem.ssl = definition.ssl
            definition_elem.ssl_cipher_spec = definition.ssl_cipher_spec
            definition_elem.ssl_key_repository = definition.ssl_key_repository
            definition_elem.needs_mcd = definition.needs_mcd
            definition_elem.max_chars_printed = definition.max_chars_printed
            definition_list.append(definition_elem)

        return ZATO_OK, etree.tostring(definition_list)
def handle(self, *args, **kwargs):
    with closing(self.server.odb.session()) as session:
        params = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')
        definition_list = Element('definition_list')
        definitions = job_list(session, params['cluster_id'])

        for definition in definitions:
            definition_elem = Element('definition')
            definition_elem.id = definition.id
            definition_elem.name = definition.name
            definition_elem.is_active = definition.is_active
            definition_elem.job_type = definition.job_type
            definition_elem.start_date = definition.start_date
            definition_elem.extra = definition.extra.decode('utf-8')
            definition_elem.service_id = definition.service_id
            definition_elem.service_name = definition.service_name.decode('utf-8')
            definition_elem.weeks = definition.weeks if definition.weeks else ''
            definition_elem.days = definition.days if definition.days else ''
            definition_elem.hours = definition.hours if definition.hours else ''
            definition_elem.minutes = definition.minutes if definition.minutes else ''
            definition_elem.seconds = definition.seconds if definition.seconds else ''
            definition_elem.repeats = definition.repeats if definition.repeats else ''
            definition_elem.cron_definition = (definition.cron_definition.decode('utf-8')
                if definition.cron_definition else '')
            definition_list.append(definition_elem)

        return ZATO_OK, etree.tostring(definition_list)
def to_xml(self, value, param_name):
    wrapper = Element(param_name)
    for item_value in value:
        xml_item = Element('item')
        wrapper.append(xml_item)
        wrapper.item[-1] = item_value
    return wrapper
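# A minimal usage sketch of the list-to-XML pattern above. It assumes Element
# comes from lxml.objectify, as elsewhere in this module; the tag names and
# payload are illustrative only.
from lxml import etree
from lxml.objectify import Element, deannotate

wrapper = Element('names')
for item_value in ['alpha', 'beta']:
    xml_item = Element('item')
    wrapper.append(xml_item)
    wrapper.item[-1] = item_value  # sets the text of the just-appended <item/>

# Drop objectify's py:pytype annotations before serializing.
deannotate(wrapper, cleanup_namespaces=True)
print(etree.tostring(wrapper))  # roughly: <names><item>alpha</item><item>beta</item></names>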
def handle(self, *args, **kwargs):
    params = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')
    with closing(self.server.odb.session()) as session:
        item_list = Element('item_list')
        db_items = out_amqp_list(session, params['cluster_id'])

        for db_item in db_items:
            item = Element('item')
            item.id = db_item.id
            item.name = db_item.name
            item.is_active = db_item.is_active
            item.delivery_mode = db_item.delivery_mode
            item.priority = db_item.priority
            item.content_type = db_item.content_type
            item.content_encoding = db_item.content_encoding
            item.expiration = db_item.expiration
            item.user_id = db_item.user_id
            item.app_id = db_item.app_id
            item.def_name = db_item.def_name
            item.def_id = db_item.def_id
            item_list.append(item)

        return ZATO_OK, etree.tostring(item_list)
def handle(self, *args, **kwargs):
    params = _get_params(kwargs.get('payload'), ['cluster_id', 'connection', 'transport'], 'data.')
    with closing(self.server.odb.session()) as session:
        item_list = Element('item_list')
        db_items = http_soap_list(session, params['cluster_id'], params['connection'], params['transport'])

        for db_item in db_items:
            item = Element('item')
            item.id = db_item.id
            item.name = db_item.name
            item.is_active = db_item.is_active
            item.is_internal = db_item.is_internal
            item.url_path = db_item.url_path
            item.method = db_item.method
            item.soap_action = db_item.soap_action
            item.soap_version = db_item.soap_version
            item.service_id = db_item.service_id
            item.service_name = db_item.service_name
            item.security_id = db_item.security_id
            item.security_name = db_item.security_name
            item.security_def_type = db_item.security_def_type
            item_list.append(item)

        return ZATO_OK, etree.tostring(item_list)
def convert(self, param, param_name, value, has_simple_io_config, is_xml, date_time_format=None):
    try:
        if any(param_name.startswith(prefix) for prefix in self.bool_parameter_prefixes) or isinstance(param, Boolean):
            value = asbool(value or None) # value can be an empty string and asbool chokes on that

        if value and value is not None: # Can be a 0
            if isinstance(param, Boolean):
                value = asbool(value)
            elif isinstance(param, CSV):
                value = value.split(',')
            elif isinstance(param, List):
                if is_xml:
                    # We are parsing XML to create a SIO request
                    if isinstance(value, EtreeElement):
                        return [elem.text for elem in value.getchildren()]
                    # We are producing XML out of an SIO response
                    else:
                        wrapper = Element(param_name)
                        for item_value in value:
                            xml_item = Element('item')
                            wrapper.append(xml_item)
                            wrapper.item[-1] = item_value
                        return wrapper
                # This is a JSON list
                return value
            elif isinstance(param, Integer):
                value = int(value)
            elif isinstance(param, Unicode):
                value = unicode(value)
            elif isinstance(param, UTC):
                value = value.replace('+00:00', '')
            else:
                if value and value != ZATO_NONE and has_simple_io_config:
                    if any(param_name==elem for elem in self.int_parameters) or \
                       any(param_name.endswith(suffix) for suffix in self.int_parameter_suffixes):
                        value = int(value)

        if date_time_format and isinstance(value, datetime):
            value = value.strftime(date_time_format)

        if isinstance(param, CSV) and not value:
            value = []

        return value
    except Exception, e:
        msg = 'Conversion error, param:[{}], param_name:[{}], repr(value):[{}], e:[{}]'.format(
            param, param_name, repr(value), format_exc(e))
        logger.error(msg)
        raise ZatoException(msg=msg)
def handle(self, *args, **kwargs):
    with closing(self.server.odb.session()) as session:
        definition_list = Element("definition_list")
        params = _get_params(kwargs.get("payload"), ["cluster_id"], "data.")
        definitions = tech_acc_list(session, params["cluster_id"])

        for definition in definitions:
            definition_elem = Element("definition")
            definition_elem.id = definition.id
            definition_elem.name = definition.name
            definition_elem.is_active = definition.is_active
            definition_list.append(definition_elem)

        return ZATO_OK, etree.tostring(definition_list)
def get_xml_dict(self, _dict, name):
    xml_dict = Element(name)

    for k, v in _dict.items():
        xml_item = Element('item')
        key = Element('key')
        value = Element('value')

        xml_item.key = key
        xml_item.value = value

        xml_item.key[-1] = k
        xml_item.value[-1] = v

        xml_dict.append(xml_item)

    return xml_dict
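# A hedged sketch of what get_xml_dict produces, again assuming lxml.objectify's
# Element; the dictionary contents are made up for illustration.
from lxml import etree
from lxml.objectify import Element, deannotate

xml_dict = Element('dict')
for k, v in {'colour': 'blue', 'size': 'large'}.items():
    xml_item = Element('item')
    xml_item.key = k      # objectify turns plain assignment into <key>/<value> children
    xml_item.value = v
    xml_dict.append(xml_item)

deannotate(xml_dict, cleanup_namespaces=True)
print(etree.tostring(xml_dict, pretty_print=True))
# Each entry serializes as <item><key>colour</key><value>blue</value></item>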
def handle(self, *args, **kwargs):
    params = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')
    with closing(self.server.odb.session()) as session:
        definition_list = Element('definition_list')
        definitions = basic_auth_list(session, params['cluster_id'])

        for definition in definitions:
            definition_elem = Element('definition')
            definition_elem.id = definition.id
            definition_elem.name = definition.name
            definition_elem.is_active = definition.is_active
            definition_elem.username = definition.username
            definition_elem.domain = definition.domain
            definition_list.append(definition_elem)

        return ZATO_OK, etree.tostring(definition_list)
def handle(self, *args, **kwargs):
    params = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')
    with closing(self.server.odb.session()) as session:
        item_list = Element('item_list')
        db_items = out_s3_list(session, params['cluster_id'])

        for db_item in db_items:
            item = Element('item')
            item.id = db_item.id
            item.name = db_item.name
            item.is_active = db_item.is_active
            item.prefix_ = db_item.prefix
            item.separator = db_item.separator
            item.key_sync_timeout = db_item.key_sync_timeout
            item_list.append(item)

        return ZATO_OK, etree.tostring(item_list)
def handle(self, *args, **kwargs):
    params = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')
    with closing(self.server.odb.session()) as session:
        item_list = Element('item_list')
        db_items = service_list(session, params['cluster_id'])

        for db_item in db_items:
            item = Element('item')
            item.id = db_item.id
            item.name = db_item.name
            item.is_active = db_item.is_active
            item.impl_name = db_item.impl_name
            item.is_internal = db_item.is_internal
            item.usage_count = 'TODO getlist'
            item_list.append(item)

        return ZATO_OK, etree.tostring(item_list)
def handle(self, *args, **kwargs):
    params = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')
    with closing(self.server.odb.session()) as session:
        item_list = Element('item_list')
        db_items = channel_zmq_list(session, params['cluster_id'])

        for db_item in db_items:
            item = Element('item')
            item.id = db_item.id
            item.name = db_item.name
            item.is_active = db_item.is_active
            item.address = db_item.address
            item.socket_type = db_item.socket_type
            item.sub_key = db_item.sub_key
            item.service_name = db_item.service_name
            item_list.append(item)

        return ZATO_OK, etree.tostring(item_list)
def handle(self, *args, **kwargs):
    params = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')
    with closing(self.server.odb.session()) as session:
        item_list = Element('item_list')
        db_items = channel_amqp_list(session, params['cluster_id'])

        for db_item in db_items:
            item = Element('item')
            item.id = db_item.id
            item.name = db_item.name
            item.is_active = db_item.is_active
            item.queue = db_item.queue
            item.consumer_tag_prefix = db_item.consumer_tag_prefix
            item.service_name = db_item.service_name
            item.def_name = db_item.def_name
            item.def_id = db_item.def_id
            item_list.append(item)

        return ZATO_OK, etree.tostring(item_list)
def handle(self, *args, **kwargs):
    with closing(self.server.odb.session()) as session:
        params = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')
        definition_list = Element('definition_list')

        pairs = (('basic_auth', basic_auth_list),
                 ('tech_acc', tech_acc_list),
                 ('wss_username_password', wss_list))

        for def_type, meth in pairs:
            definitions = meth(session, params['cluster_id'])

            for definition in definitions:
                definition_elem = Element('definition')
                definition_elem.id = definition.id
                definition_elem.name = definition.name
                definition_elem.def_type = def_type
                definition_list.append(definition_elem)

        return ZATO_OK, etree.tostring(definition_list)
def handle(self, *args, **kwargs):
    with closing(self.server.odb.session()) as session:
        params = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')
        definition_list = Element('definition_list')
        definitions = wss_list(session, params['cluster_id'])

        for definition in definitions:
            definition_elem = Element('definition')
            definition_elem.id = definition.id
            definition_elem.name = definition.name
            definition_elem.is_active = definition.is_active
            definition_elem.password_type = definition.password_type
            definition_elem.username = definition.username
            definition_elem.reject_empty_nonce_ts = definition.reject_empty_nonce_ts
            definition_elem.reject_stale_username = definition.reject_stale_username
            definition_elem.expiry_limit = definition.expiry_limit
            definition_elem.nonce_freshness = definition.nonce_freshness
            definition_list.append(definition_elem)

        return ZATO_OK, etree.tostring(definition_list)
def handle(self, *args, **kwargs):
    params = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')
    with closing(self.server.odb.session()) as session:
        definition_list = Element('definition_list')
        definitions = def_amqp_list(session, params['cluster_id'])

        for definition in definitions:
            definition_elem = Element('definition')
            definition_elem.id = definition.id
            definition_elem.name = definition.name
            definition_elem.host = definition.host
            definition_elem.port = definition.port
            definition_elem.vhost = definition.vhost
            definition_elem.username = definition.username
            definition_elem.frame_max = definition.frame_max
            definition_elem.heartbeat = definition.heartbeat
            definition_list.append(definition_elem)

        return ZATO_OK, etree.tostring(definition_list)
def handle(self, *args, **kwargs):
    params = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')
    with closing(self.server.odb.session()) as session:
        item_list = Element('item_list')
        db_items = out_ftp_list(session, params['cluster_id'])

        for db_item in db_items:
            item = Element('item')
            item.id = db_item.id
            item.name = db_item.name
            item.is_active = db_item.is_active
            item.host = db_item.host
            item.port = db_item.port
            item.user = db_item.user
            item.acct = db_item.acct
            item.timeout = db_item.timeout
            item.dircache = db_item.dircache
            item_list.append(item)

        return ZATO_OK, etree.tostring(item_list)
def getvalue(self, serialize=True):
    """ Gets the actual payload's value converted to a string representing
    either XML or JSON.
    """
    if self.zato_is_xml:
        if self.zato_output_repeated:
            value = Element('item_list')
        else:
            value = Element('item')
    else:
        if self.zato_output_repeated:
            value = []
        else:
            value = {}

    if self.zato_output_repeated:
        output = self.zato_output
    else:
        output = set(dir(self)) & self.zato_all_attrs
        output = [dict((name, getattr(self, name)) for name in output)]

    if output:
        # All elements must be of the same type so it's OK to do it
        is_sa_namedtuple = isinstance(output[0], KeyedTuple)

        for item in output:
            if self.zato_is_xml:
                out_item = Element('item')
            else:
                out_item = {}
            for is_required, name in chain(self.zato_required, self.zato_optional):
                leave_as_is = isinstance(name, AsIs)
                elem_value = self._getvalue(name, item, is_sa_namedtuple, is_required, leave_as_is)

                if isinstance(name, ForceType):
                    name = name.name

                if isinstance(elem_value, basestring):
                    elem_value = elem_value if isinstance(elem_value, unicode) else elem_value.decode('utf-8')

                if self.zato_is_xml:
                    setattr(out_item, name, elem_value)
                else:
                    out_item[name] = elem_value

            if self.zato_output_repeated:
                value.append(out_item)
            else:
                value = out_item

    if self.zato_is_xml:
        em = ElementMaker(annotate=False, namespace=self.namespace, nsmap={None: self.namespace})
        zato_env = em.zato_env(em.cid(self.zato_cid), em.result(ZATO_OK))
        top = getattr(em, self.response_elem)(zato_env)
        top.append(value)
    else:
        top = {self.response_elem: value}
        search = self.zato_meta.get('search')
        if search:
            top['_meta'] = search

    if serialize:
        if self.zato_is_xml:
            deannotate(top, cleanup_namespaces=True)
            return etree.tostring(top)
        else:
            return dumps(top)
    else:
        return top
def getvalue(self, serialize=True):
    """ Gets the actual payload's value converted to a string representing
    either XML or JSON.
    """
    if self.zato_is_xml:
        if self.zato_output_repeated:
            value = Element('item_list')
        else:
            value = Element('item')
    else:
        if self.zato_output_repeated:
            value = []
        else:
            value = {}

    if self.zato_output_repeated:
        output = self.zato_output
    else:
        output = set(dir(self)) & self.zato_all_attrs
        output = [dict((name, getattr(self, name)) for name in output)]

    if output:
        # All elements must be of the same type so it's OK to do it
        is_sa_namedtuple = isinstance(output[0], KeyedTuple)

        for item in output:
            if self.zato_is_xml:
                out_item = Element('item')
            else:
                out_item = {}
            for is_required, name in chain(self.zato_required, self.zato_optional):
                leave_as_is = isinstance(name, AsIs)
                elem_value = self._getvalue(name, item, is_sa_namedtuple, is_required, leave_as_is)

                if isinstance(name, ForceType):
                    name = name.name

                if isinstance(elem_value, basestring):
                    elem_value = elem_value if isinstance(elem_value, unicode) else elem_value.decode('utf-8')

                if self.zato_is_xml:
                    setattr(out_item, name, elem_value)
                else:
                    out_item[name] = elem_value

            if self.zato_output_repeated:
                value.append(out_item)
            else:
                value = out_item

    if self.zato_is_xml:
        em = ElementMaker(annotate=False, namespace=self.namespace, nsmap={None: self.namespace})
        zato_env = em.zato_env(em.cid(self.zato_cid), em.result(ZATO_OK))
        top = getattr(em, self.response_elem)(zato_env)
        top.append(value)
    else:
        top = {self.response_elem: value}

    if serialize:
        if self.zato_is_xml:
            deannotate(top, cleanup_namespaces=True)
            return etree.tostring(top)
        else:
            return dumps(top)
    else:
        return top
import html
import json
import os
import random
from bisect import bisect, insort
from csv import DictWriter
from hashlib import md5

import nltk
import stanza
from lxml import etree
from lxml.objectify import Element
from tqdm import tqdm

# XMLBase, StanfordCoreNLPDocument and the module-level `log` come from this
# project's own modules and are not reproduced here.


class Corpus(XMLBase):

    def __init__(self, xml_input=None, annotations=None):
        super().__init__("corpus", "document")
        self.corpus = Element("corpus")
        self.url_indices = []
        self.has_terms_locations = False
        self.nlp = stanza.Pipeline(
            "en",
            processors={
                "tokenize": "gum",
                "ner": "default",
                "lemma": "gum",
                "pos": "gum",
                "depparse": "gum"
            },
            verbose=False,
            tokenize_no_ssplit=True)
        self.annotations = annotations.documents if annotations else None
        if xml_input:
            if not os.path.exists(xml_input):
                raise FileNotFoundError(f"{xml_input} not found. Check the path again.")
            elif os.path.isfile(xml_input):
                self.read_from_xml(xml_input)
            else:
                self.read_from_folder(xml_input)

    @staticmethod
    def unicodify(text):
        return text.replace("“", "\"") \
            .replace("”", "\"") \
            .replace("’", "'") \
            .replace("‘", "'") \
            .replace("\n", " ")

    def add_document(self, url, title, categories, published_time, content,
                     author=None, topics=None, links=None, terms=None, document_id=None):
        if url is None or len(url) == 0:
            raise KeyError("'url' is mandatory")
        elif url in self.url_indices:
            log.info(f"Ignoring duplicate URL={url}")
            return

        new_document = Element("document")
        title = Corpus.unicodify(title)
        new_document.document_id = md5(title.encode("utf-8")).hexdigest()[-6:] \
            if document_id is None or len(document_id) == 0 else document_id
        new_document.url = url
        new_document.title = title
        new_document.author = author
        new_document.published_time = published_time

        # handle lists
        new_document.categories = Element("categories")
        if categories:
            new_document.categories.category = categories
        new_document.topics = Element("topics")
        if topics:
            new_document.topics.topic = topics
        new_document.links = Element("links")
        if links:
            new_document.links.link = links
        new_document.content = Element("content")
        if content:
            new_document.content.p = [Corpus.unicodify(p) for p in content if p]

        # handle terms
        new_document.terms = Element("terms")
        terms_list = []
        if terms:
            for term in terms:
                term_elmt = Element("term")
                term_elmt.word = term
                term_elmt.locations = Element("locations")
                locations_list = []
                for location in terms[term]:
                    location_elmt = Element("location")
                    location_elmt.begin, location_elmt.end = location
                    locations_list.append(location_elmt)
                term_elmt.locations.location = locations_list
                terms_list.append(term_elmt)
            new_document.terms.term = terms_list

        self.corpus.append(new_document)
        self.url_indices.append(url)

    def add_document_from_element(self, document_elmt):
        # construct terms
        terms_list = {}
        if document_elmt.terms.countchildren() > 0:
            for term in document_elmt.terms.term:
                if term.locations.countchildren() > 0:
                    terms_list[term.word.text] = [
                        (loc.begin.text, loc.end.text)
                        for loc in term.locations.location
                    ]
        self.add_document(
            document_elmt.url.text,
            document_elmt.title.text,
            [category.text for category in document_elmt.categories.category]
            if document_elmt.categories.countchildren() > 0 else None,
            document_elmt.published_time.text,
            [p.text for p in document_elmt.content.p]
            if document_elmt.content.countchildren() > 0 else None,
            document_elmt.author.text,
            [topic.text for topic in document_elmt.topics.topic]
            if document_elmt.topics.countchildren() > 0 else None,
            [link.text for link in document_elmt.links.link]
            if document_elmt.links.countchildren() > 0 else None,
            terms_list if len(terms_list) > 0 else None,
            document_elmt.document_id,
        )

    def filter_empty(self):
        empty_document_list = []
        for document in self.iter_documents():
            if document.content.countchildren() == 0:
                empty_document_list.append(document)
        for document in empty_document_list:
            self.get_root().remove(document)
        return self

    def read_from_xml(self, input_path):
        composites = ["terms", "topics", "content", "links", "categories"]
        corpus_etree = etree.parse(input_path)
        corpus_root = corpus_etree.getroot()
        for document in corpus_root:
            new_document_attrs = {}
            annotated_terms = {}
            contain_terms_elmt = False
            for document_elmt in document:
                if document_elmt.tag == "category":
                    new_document_attrs["categories"] = \
                        document_elmt.text.split(";") if document_elmt.text else []
                elif document_elmt.tag == "terms":
                    # the document has existing annotations
                    for term_elmt in document_elmt:
                        word = None
                        locations = []
                        for item_elmt in term_elmt:
                            if item_elmt.tag == "word":
                                word = item_elmt.text
                            elif item_elmt.tag == "locations":
                                for loc_elmt in item_elmt:
                                    begin, end = None, None
                                    for point_elmt in loc_elmt:
                                        if point_elmt.tag == "begin":
                                            begin = int(point_elmt.text)
                                        elif point_elmt.tag == "end":
                                            end = int(point_elmt.text)
                                    locations.append((begin, end))
                        annotated_terms[word] = locations
                    contain_terms_elmt = True
                elif document_elmt.tag in composites:
                    new_document_attrs[document_elmt.tag] = [item.text for item in document_elmt]
                else:
                    new_document_attrs[document_elmt.tag] = document_elmt.text

            if self.annotations and new_document_attrs["document_id"] in self.annotations:
                # annotation file
                new_document_attrs["terms"] = self.annotations[new_document_attrs["document_id"]]
                self.add_document(**new_document_attrs)
                self.has_terms_locations = True  # at least 1 with terms
            elif contain_terms_elmt:
                # there is no annotation file but terms element exist
                new_document_attrs["terms"] = annotated_terms
                self.add_document(**new_document_attrs)
                self.has_terms_locations = True
            elif self.annotations is None:
                # there is no annotation file and no terms element
                self.add_document(**new_document_attrs)

    def read_from_folder(self, root_folder):
        in_folders = [
            folder for folder in os.listdir(root_folder)
            if os.path.isdir(os.path.join(root_folder, folder))
        ]
        for in_folder in in_folders:
            xml_files = [
                f for f in os.listdir(os.path.join(root_folder, in_folder))
                if f.endswith(".xml")
            ]
            for xml_file in xml_files:
                self.read_from_xml(os.path.join(root_folder, in_folder, xml_file))

    def get_document_ids(self):
        return [document.document_id for document in self.iter_documents()]

    def get_sample(self, n, excluded_ids=None):
        sample_corpus = Corpus()
        indices = list(range(len(self)))
        random.shuffle(indices)
        acquired_count = 0
        i = 0
        while acquired_count < n and i < len(indices):
            document = self[indices[i]]
            i += 1
            document_id = document.document_id.text
            if excluded_ids and document_id in excluded_ids:
                continue
            sample_corpus.add_document_from_element(document)
            acquired_count += 1
        return sample_corpus

    def get_more_sample(self, n, json1_filename):
        existing_ids = []
        with open(json1_filename, "r") as json1_file:
            lines = json1_file.readlines()
            for line in lines:
                json_news = json.loads(line)
                current_id = md5(json_news["text"].split("|")[0].encode("utf-8")).hexdigest()[-6:]
                existing_ids.append(current_id)
        return self.get_sample(n, existing_ids)

    def get_documents_by_ids(self, ids):
        subset_corpus = Corpus()
        for document in self:
            if document.document_id in ids:
                subset_corpus.add_document_from_element(document)
        return subset_corpus

    def get_documents_by_urls(self, urls):
        subset_corpus = Corpus()
        for document in self:
            if document.url.text in urls:
                subset_corpus.add_document_from_element(document)
        return subset_corpus

    def get_annotated_terms_as_csv(self, csv_path):
        with open(csv_path, "w") as csv_file:
            fieldnames = ["document_id", "terms"]
            csv_writer = DictWriter(csv_file, fieldnames)
            csv_writer.writeheader()
            for doc in self.iter_documents():
                document_id = doc.document_id.text
                all_terms = [term.word.text.lower() for term in doc.terms.term]
                csv_writer.writerow({
                    "document_id": document_id,
                    "terms": "|".join(all_terms)
                })
        return True

    def train_test_split(self, test_size, random_seed=1337):
        dev_c = Corpus()
        test_c = Corpus()
        n = len(self) * test_size
        indices = list(range(len(self)))
        random.seed(random_seed)
        random.shuffle(indices)
        i = 0
        while i < len(indices):
            document = self[indices[i]]
            if i < n:
                dev_c.add_document_from_element(document)
            else:
                test_c.add_document_from_element(document)
            i += 1
        return dev_c, test_c

    def annotate_sentence(self, sentence, buffer_offset, term_locs=None):
        term_state = ["O", "B-TERM", "I-TERM"]
        annotated_text = self.nlp(sentence)
        annotated_sentences = []
        head_dict = {0: "root"}
        for sentence in annotated_text.sentences:
            annotated_sentence = []
            for token in sentence.tokens:
                if len(token.words) > 1:
                    log.info(token)
                else:
                    word = token.words[0]
                    misc = dict(token_misc.split("=") for token_misc in word.misc.split("|"))
                    word_id = int(word.id)
                    head_dict[word_id] = word.text
                    start_char = buffer_offset + int(misc["start_char"])
                    end_char = buffer_offset + int(misc["end_char"])
                    annotations = {
                        "id": word_id,
                        "word": word.text,
                        "pos": word.xpos,
                        "lemma": word.lemma,
                        "deprel": word.deprel,
                        "deprel_head_id": word.head,
                        "character_offset_begin": start_char,
                        "character_offset_end": end_char,
                        "ner": token.ner
                    }
                    if term_locs is not None and len(term_locs) > 0:
                        annotations["term_tag"] = term_state[bisect(term_locs, start_char) % 3]
                    annotated_sentence.append(annotations)

            for i, token in enumerate(annotated_sentence):
                token["deprel_head_text"] = head_dict[token["deprel_head_id"]]
                if "term_tag" in token:
                    # hacky way, should fix write_to_core_nlp_xmls insort usage
                    # if token["term_tag"][0] == "I" and (i == 0 or annotated_sentence[i-1]["term_tag"][0] == "O"):
                    #     if i == len(annotated_sentence) - 1 or annotated_sentence[i+1]["term_tag"][0] != "I":
                    #         token["term_tag"] = "S" + token["term_tag"][1:]
                    #     else:
                    #         token["term_tag"] = "B" + token["term_tag"][1:]
                    # el
                    if i == len(annotated_sentence) - 1 or annotated_sentence[i + 1]["term_tag"][0] != "I":
                        if token["term_tag"][0] == "B":
                            token["term_tag"] = "S" + token["term_tag"][1:]
                        elif token["term_tag"][0] == "I":
                            token["term_tag"] = "E" + token["term_tag"][1:]
            annotated_sentences.append(annotated_sentence)
        return annotated_sentences

    def write_to_core_nlp_xmls(self, output_folder):
        for document in tqdm(self.iter_documents(), total=len(self)):
            document_id = document.document_id.text
            if f"{document_id}.xml" not in os.listdir(output_folder):
                buffer_offset = 0
                title = document.title.text
                term_locs = []
                if self.has_terms_locations:
                    for term in document.terms.term:
                        for location in term.locations.location:
                            insort(term_locs, int(location.begin.text) - 0.5)
                            insort(term_locs, int(location.begin.text) + 0.5)
                            insort(term_locs, int(location.end.text))
                annotated_title = self.annotate_sentence(title, buffer_offset, term_locs)
                buffer_offset += len(title) + 1

                annotated_content = []
                for p in document.content.p:
                    if len(p.text.strip()) > 0:
                        text = p.text.strip()
                        p_sents = nltk.tokenize.sent_tokenize(text)
                        for p_sent in p_sents:
                            annotated_content += self.annotate_sentence(p_sent, buffer_offset, term_locs)
                            buffer_offset += len(p_sent) + 1

                core_nlp_document = StanfordCoreNLPDocument()
                core_nlp_document.from_sentences(annotated_title, annotated_content)
                core_nlp_document.write_xml_to(os.path.join(output_folder, f"{document_id}.xml"))

    def write_to_jsonl(self, jsonl_path):
        # terms_found = False
        with open(jsonl_path, "w") as out_file:
            for document in self.iter_documents():
                # if document.terms.countchildren() > 0:
                #     labels = []
                #     for term in document.terms.term:
                #         for location in term.locations.location:
                #             labels.append([int(location.begin.text), int(location.end.text), "UNK"])
                doc_id = document.document_id.text
                text = {
                    "text": "|".join([document.title.text] + [p.text for p in document.content.p]),
                    "meta": {"doc_id": doc_id}
                }
                json.dump(html.unescape(text), out_file)
                out_file.write("\n")
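# Hypothetical driver showing how the Corpus class above might be used end to
# end. The paths and split size are made up, and the stanza English models are
# assumed to be downloaded already (stanza.download('en')).
if __name__ == "__main__":
    corpus = Corpus("data/news_corpus.xml")  # a single file or a folder of XMLs
    corpus.filter_empty()
    dev, test = corpus.train_test_split(test_size=0.2)
    dev.write_to_jsonl("dev.jsonl")
    test.write_to_core_nlp_xmls("test_xmls")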
def getvalue(self, serialize=True):
    """ Gets the actual payload's value converted to a string representing
    either XML or JSON.
    """
    if self.zato_is_xml:
        if self.zato_is_repeated:
            value = Element('item_list')
        else:
            value = Element('item')
    else:
        if self.zato_is_repeated:
            value = []
        else:
            value = {}

    if self.zato_is_repeated:
        output = self.zato_output
    else:
        output = set(dir(self)) & self.zato_all_attrs
        output = [dict((name, getattr(self, name)) for name in output)]

    if output:
        # All elements must be of the same type so it's OK to do it
        is_sa_namedtuple = isinstance(output[0], NamedTuple)

        for item in output:
            if self.zato_is_xml:
                out_item = Element('item')
            else:
                out_item = {}
            for is_required, name in chain(self.zato_required, self.zato_optional):
                leave_as_is = isinstance(name, AsIs)
                if isinstance(name, ForceType):
                    name = name.name
                elem_value = self._getvalue(name, item, is_sa_namedtuple, is_required, leave_as_is)

                if self.zato_is_xml:
                    setattr(out_item, name, elem_value)
                else:
                    out_item[name] = elem_value

            if self.zato_is_repeated:
                value.append(out_item)
            else:
                value = out_item

    if self.zato_is_xml:
        top = Element(self.response_elem)
        top.append(value)
    else:
        top = {self.response_elem: value}

    if serialize:
        if self.zato_is_xml:
            return etree.tostring(top)
        else:
            return dumps(top)
    else:
        return top