Esempio n. 1
0
    def to_xml(self, value, param_name):
        """ Wraps a sequence of dictionaries in a single XML element.

        An element named after param_name is created and, for each dictionary
        in value, a 'dict' child built by self.get_xml_dict is appended to it,
        in iteration order. Returns the wrapping element.
        """
        root = Element(param_name)

        for current in value:
            dict_elem = self.get_xml_dict(current, 'dict')
            root.append(dict_elem)

        return root
Esempio n. 2
0
 def handle(self, *args, **kwargs):
     """ Returns, serialized to XML, the definitions def_jms_wmq_list finds
     for the cluster whose ID _get_params reads from the 'payload' keyword argument.

     Each row becomes a 'definition' child of a 'definition_list' element.
     The result is a (ZATO_OK, XML string) tuple.
     """
     # Attributes copied verbatim from each row, in the order they are emitted
     attr_names = ('id', 'name', 'host', 'port', 'queue_manager', 'channel',
         'cache_open_send_queues', 'cache_open_receive_queues', 'use_shared_connections',
         'ssl', 'ssl_cipher_spec', 'ssl_key_repository', 'needs_mcd', 'max_chars_printed')

     request = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')

     with closing(self.server.odb.session()) as session:
         root = Element('definition_list')

         for row in def_jms_wmq_list(session, request['cluster_id']):
             elem = Element('definition')
             for attr_name in attr_names:
                 setattr(elem, attr_name, getattr(row, attr_name))
             root.append(elem)

         return ZATO_OK, etree.tostring(root)
Esempio n. 3
0
 def handle(self, *args, **kwargs):
     """ Returns, serialized to XML, the rows job_list finds for the cluster
     whose ID _get_params reads from the 'payload' keyword argument.

     Each row becomes a 'definition' child of a 'definition_list' element.
     The 'extra', 'service_name' and 'cron_definition' values are decoded from UTF-8
     and empty scheduling fields are emitted as empty strings.
     The result is a (ZATO_OK, XML string) tuple.
     """
     # Copied from each row as they are
     plain_attrs = ('id', 'name', 'is_active', 'job_type', 'start_date')

     # Replaced with an empty string if the row has no true value for them
     optional_attrs = ('weeks', 'days', 'hours', 'minutes', 'seconds', 'repeats')

     with closing(self.server.odb.session()) as session:

         request = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')
         root = Element('definition_list')

         for row in job_list(session, request['cluster_id']):
             elem = Element('definition')

             for attr_name in plain_attrs:
                 setattr(elem, attr_name, getattr(row, attr_name))

             elem.extra = row.extra.decode('utf-8')
             elem.service_id = row.service_id
             elem.service_name = row.service_name.decode('utf-8')

             for attr_name in optional_attrs:
                 setattr(elem, attr_name, getattr(row, attr_name) or '')

             cron_definition = row.cron_definition
             elem.cron_definition = cron_definition.decode('utf-8') if cron_definition else ''

             root.append(elem)

         return ZATO_OK, etree.tostring(root)
Esempio n. 4
0
 def to_xml(self, value, param_name):
     """ Builds an element named after param_name that holds one 'item' child
     for each element of value, in iteration order, and returns it.
     """
     root = Element(param_name)
     for current in value:
         root.append(Element('item'))
         # The value is assigned through the last, i.e. just appended, 'item' child
         root.item[-1] = current
     return root
Esempio n. 5
0
 def to_xml(self, value, param_name):
     """ Returns a new element, named after param_name, with one 'item' child
     per element of value. Children are added in the order value yields them.
     """
     container = Element(param_name)
     for element_value in value:
         new_item = Element('item')
         container.append(new_item)
         # Index -1 addresses the 'item' child appended a line above
         container.item[-1] = element_value
     return container
Esempio n. 6
0
    def to_xml(self, value, param_name):
        """ Returns a new element, named after param_name, containing one 'dict'
        child for each dictionary found in value.

        The children are produced by self.get_xml_dict and appended in the order
        value yields the dictionaries.
        """
        container = Element(param_name)

        for source_dict in value:
            container.append(self.get_xml_dict(source_dict, 'dict'))

        return container
Esempio n. 7
0
File: amqp.py Progetto: brtsz/zato
 def handle(self, *args, **kwargs):
     """ Returns, serialized to XML, the rows out_amqp_list finds for the cluster
     whose ID _get_params reads from the 'payload' keyword argument.

     Each row becomes an 'item' child of an 'item_list' element.
     The result is a (ZATO_OK, XML string) tuple.
     """
     # Attributes copied verbatim from each row, in the order they are emitted
     attr_names = ('id', 'name', 'is_active', 'delivery_mode', 'priority', 'content_type',
         'content_encoding', 'expiration', 'user_id', 'app_id', 'def_name', 'def_id')

     request = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')

     with closing(self.server.odb.session()) as session:
         root = Element('item_list')

         for row in out_amqp_list(session, request['cluster_id']):
             elem = Element('item')
             for attr_name in attr_names:
                 setattr(elem, attr_name, getattr(row, attr_name))
             root.append(elem)

         return ZATO_OK, etree.tostring(root)
Esempio n. 8
0
    def handle(self, *args, **kwargs):
        """ Returns, serialized to XML, the rows http_soap_list finds for the
        'cluster_id', 'connection' and 'transport' values _get_params reads
        from the 'payload' keyword argument.

        Each row becomes an 'item' child of an 'item_list' element.
        The result is a (ZATO_OK, XML string) tuple.
        """
        # Attributes copied verbatim from each row, in the order they are emitted
        attr_names = ('id', 'name', 'is_active', 'is_internal', 'url_path', 'method',
            'soap_action', 'soap_version', 'service_id', 'service_name', 'security_id',
            'security_name', 'security_def_type')

        request = _get_params(kwargs.get('payload'), ['cluster_id', 'connection', 'transport'], 'data.')

        with closing(self.server.odb.session()) as session:
            root = Element('item_list')
            rows = http_soap_list(session, request['cluster_id'], request['connection'], request['transport'])

            for row in rows:
                elem = Element('item')
                for attr_name in attr_names:
                    setattr(elem, attr_name, getattr(row, attr_name))
                root.append(elem)

            return ZATO_OK, etree.tostring(root)
Esempio n. 9
0
 def convert(self, param, param_name, value, has_simple_io_config, is_xml, date_time_format=None):
     """ Converts a parameter's value to the type its declaration or its name calls for.

     param - the parameter's declaration, checked against Boolean, CSV, List, Integer,
         Unicode and UTC
     param_name - the parameter's name, matched against self.bool_parameter_prefixes,
         self.int_parameters and self.int_parameter_suffixes
     value - the value to convert
     has_simple_io_config - if false, the name-based conversion to an integer is skipped
     is_xml - for List parameters, tells whether XML is being parsed or produced
         as opposed to the value being a JSON list
     date_time_format - an optional strftime format applied to datetime values

     Returns the converted value. Values that are not true (None, '', 0, False) are
     returned without any type-specific conversion, except that a CSV parameter
     always yields a list. Any exception is logged and re-raised as a ZatoException.
     """
     try:
         if any(param_name.startswith(prefix) for prefix in self.bool_parameter_prefixes) or isinstance(param, Boolean):
             value = asbool(value or None) # value can be an empty string and asbool chokes on that

         # Only true values are converted, anything else, including a 0, is left as it is
         if value:
             if isinstance(param, Boolean):
                 value = asbool(value)

             elif isinstance(param, CSV):
                 value = value.split(',')

             elif isinstance(param, List):
                 if is_xml:
                     # We are parsing XML to create a SIO request
                     if isinstance(value, EtreeElement):
                         return [elem.text for elem in value.getchildren()]

                     # We are producing XML out of an SIO response
                     else:
                         wrapper = Element(param_name)
                         for item_value in value:
                             xml_item = Element('item')
                             wrapper.append(xml_item)
                             wrapper.item[-1] = item_value
                         return wrapper

                 # This is a JSON list
                 return value

             elif isinstance(param, Integer):
                 value = int(value)

             elif isinstance(param, Unicode):
                 value = unicode(value)

             elif isinstance(param, UTC):
                 value = value.replace('+00:00', '')

             else:
                 # No explicit type was given so the parameter's name decides
                 # whether the value is an integer
                 if value and value != ZATO_NONE and has_simple_io_config:
                     if any(param_name==elem for elem in self.int_parameters) or \
                        any(param_name.endswith(suffix) for suffix in self.int_parameter_suffixes):
                         value = int(value)

             if date_time_format and isinstance(value, datetime):
                 value = value.strftime(date_time_format)

         # An empty CSV value is always returned as an empty list
         if isinstance(param, CSV) and not value:
             value = []

         return value
     except Exception:
         # format_exc picks up the exception being handled on its own,
         # its only positional argument is a limit on the traceback's depth
         msg = 'Conversion error, param:[{}], param_name:[{}], repr(value):[{}], e:[{}]'.format(
             param, param_name, repr(value), format_exc())
         logger.error(msg)

         raise ZatoException(msg=msg)
Esempio n. 10
0
    def handle(self, *args, **kwargs):
        """ Returns, serialized to XML, the definitions tech_acc_list finds for the
        cluster whose ID _get_params reads from the 'payload' keyword argument.

        Each row's id, name and is_active become a 'definition' child
        of a 'definition_list' element. The result is a (ZATO_OK, XML string) tuple.
        """
        # Attributes copied verbatim from each row, in the order they are emitted
        attr_names = ("id", "name", "is_active")

        with closing(self.server.odb.session()) as session:
            root = Element("definition_list")
            request = _get_params(kwargs.get("payload"), ["cluster_id"], "data.")

            for row in tech_acc_list(session, request["cluster_id"]):
                elem = Element("definition")
                for attr_name in attr_names:
                    setattr(elem, attr_name, getattr(row, attr_name))
                root.append(elem)

            return ZATO_OK, etree.tostring(root)
Esempio n. 11
0
    def get_xml_dict(self, _dict, name):
        """ Turns a dictionary into an XML element called name.

        Each key/value pair of _dict becomes an 'item' child that has a 'key'
        and a 'value' subelement assigned the pair's key and value, respectively.
        Returns the newly built element.
        """
        root = Element(name)

        for dict_key, dict_value in _dict.items():
            entry = Element('item')

            # Attach empty subelements first ..
            entry.key = Element('key')
            entry.value = Element('value')

            # .. and only then assign the actual data through them.
            entry.key[-1] = dict_key
            entry.value[-1] = dict_value

            root.append(entry)

        return root
Esempio n. 12
0
    def get_xml_dict(self, _dict, name):
        """ Returns an element called name with one 'item' child per entry of _dict.

        Every 'item' is given a 'key' and a 'value' subelement through which
        the entry's key and value are assigned.
        """
        container = Element(name)

        for entry_key, entry_value in _dict.items():
            item_elem = Element('item')

            # Both subelements are attached before any data is assigned
            item_elem.key = Element('key')
            item_elem.value = Element('value')

            item_elem.key[-1] = entry_key
            item_elem.value[-1] = entry_value

            container.append(item_elem)

        return container
Esempio n. 13
0
 def handle(self, *args, **kwargs):
     """ Returns, serialized to XML, the definitions basic_auth_list finds for the
     cluster whose ID _get_params reads from the 'payload' keyword argument.

     Each row becomes a 'definition' child of a 'definition_list' element.
     The result is a (ZATO_OK, XML string) tuple.
     """
     # Attributes copied verbatim from each row, in the order they are emitted
     attr_names = ('id', 'name', 'is_active', 'username', 'domain')

     request = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')

     with closing(self.server.odb.session()) as session:
         root = Element('definition_list')

         for row in basic_auth_list(session, request['cluster_id']):
             elem = Element('definition')
             for attr_name in attr_names:
                 setattr(elem, attr_name, getattr(row, attr_name))
             root.append(elem)

         return ZATO_OK, etree.tostring(root)
Esempio n. 14
0
File: s3.py Progetto: brtsz/zato
    def handle(self, *args, **kwargs):
        """ Returns, serialized to XML, the rows out_s3_list finds for the cluster
        whose ID _get_params reads from the 'payload' keyword argument.

        Each row becomes an 'item' child of an 'item_list' element; note that
        the row's 'prefix' is emitted under the name of 'prefix_'.
        The result is a (ZATO_OK, XML string) tuple.
        """
        # Pairs of (name in the output element, name of the row's attribute),
        # listed in the order they are emitted
        attr_pairs = (('id', 'id'), ('name', 'name'), ('is_active', 'is_active'),
            ('prefix_', 'prefix'), ('separator', 'separator'),
            ('key_sync_timeout', 'key_sync_timeout'))

        request = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')

        with closing(self.server.odb.session()) as session:
            root = Element('item_list')

            for row in out_s3_list(session, request['cluster_id']):
                elem = Element('item')
                for target_name, source_name in attr_pairs:
                    setattr(elem, target_name, getattr(row, source_name))
                root.append(elem)

            return ZATO_OK, etree.tostring(root)
Esempio n. 15
0
 def handle(self, *args, **kwargs):
     """ Returns, serialized to XML, the rows service_list finds for the cluster
     whose ID _get_params reads from the 'payload' keyword argument.

     Each row becomes an 'item' child of an 'item_list' element. The 'usage_count'
     of each item is a fixed placeholder string rather than an actual count.
     The result is a (ZATO_OK, XML string) tuple.
     """
     # Attributes copied verbatim from each row, in the order they are emitted
     attr_names = ('id', 'name', 'is_active', 'impl_name', 'is_internal')

     request = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')

     with closing(self.server.odb.session()) as session:
         root = Element('item_list')

         for row in service_list(session, request['cluster_id']):
             elem = Element('item')
             for attr_name in attr_names:
                 setattr(elem, attr_name, getattr(row, attr_name))

             # Not computed yet, a placeholder is returned instead
             elem.usage_count = 'TODO getlist'

             root.append(elem)

         return ZATO_OK, etree.tostring(root)
Esempio n. 16
0
File: zmq.py Progetto: brtsz/zato
 def handle(self, *args, **kwargs):
     """ Returns, serialized to XML, the rows channel_zmq_list finds for the cluster
     whose ID _get_params reads from the 'payload' keyword argument.

     Each row becomes an 'item' child of an 'item_list' element.
     The result is a (ZATO_OK, XML string) tuple.
     """
     # Attributes copied verbatim from each row, in the order they are emitted
     attr_names = ('id', 'name', 'is_active', 'address', 'socket_type', 'sub_key', 'service_name')

     request = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')

     with closing(self.server.odb.session()) as session:
         root = Element('item_list')

         for row in channel_zmq_list(session, request['cluster_id']):
             elem = Element('item')
             for attr_name in attr_names:
                 setattr(elem, attr_name, getattr(row, attr_name))
             root.append(elem)

         return ZATO_OK, etree.tostring(root)
Esempio n. 17
0
File: amqp.py Progetto: brtsz/zato
    def handle(self, *args, **kwargs):
        """ Returns, serialized to XML, the rows channel_amqp_list finds for the cluster
        whose ID _get_params reads from the 'payload' keyword argument.

        Each row becomes an 'item' child of an 'item_list' element.
        The result is a (ZATO_OK, XML string) tuple.
        """
        # Attributes copied verbatim from each row, in the order they are emitted
        attr_names = ('id', 'name', 'is_active', 'queue', 'consumer_tag_prefix',
            'service_name', 'def_name', 'def_id')

        request = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')

        with closing(self.server.odb.session()) as session:
            root = Element('item_list')

            for row in channel_amqp_list(session, request['cluster_id']):
                elem = Element('item')
                for attr_name in attr_names:
                    setattr(elem, attr_name, getattr(row, attr_name))
                root.append(elem)

            return ZATO_OK, etree.tostring(root)
Esempio n. 18
0
    def handle(self, *args, **kwargs):
        """ Returns, serialized to XML, the definitions that basic_auth_list,
        tech_acc_list and wss_list find, in that order, for the cluster whose ID
        _get_params reads from the 'payload' keyword argument.

        Each row becomes a 'definition' child of a 'definition_list' element
        and carries the row's id and name along with a 'def_type' telling which
        of the functions returned it. The result is a (ZATO_OK, XML string) tuple.
        """
        with closing(self.server.odb.session()) as session:
            request = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')
            root = Element('definition_list')

            # Pairs of (definition type to report, function listing definitions of that type)
            sources = (
                ('basic_auth', basic_auth_list),
                ('tech_acc', tech_acc_list),
                ('wss_username_password', wss_list),
            )

            for def_type, list_func in sources:
                for row in list_func(session, request['cluster_id']):
                    elem = Element('definition')
                    elem.id = row.id
                    elem.name = row.name
                    elem.def_type = def_type
                    root.append(elem)

            return ZATO_OK, etree.tostring(root)
Esempio n. 19
0
File: wss.py Progetto: brtsz/zato
 def handle(self, *args, **kwargs):
     """ Returns, serialized to XML, the definitions wss_list finds for the cluster
     whose ID _get_params reads from the 'payload' keyword argument.

     Each row becomes a 'definition' child of a 'definition_list' element.
     The result is a (ZATO_OK, XML string) tuple.
     """
     # Attributes copied verbatim from each row, in the order they are emitted
     attr_names = ('id', 'name', 'is_active', 'password_type', 'username',
         'reject_empty_nonce_ts', 'reject_stale_username', 'expiry_limit', 'nonce_freshness')

     with closing(self.server.odb.session()) as session:
         request = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')
         root = Element('definition_list')

         for row in wss_list(session, request['cluster_id']):
             elem = Element('definition')
             for attr_name in attr_names:
                 setattr(elem, attr_name, getattr(row, attr_name))
             root.append(elem)

         return ZATO_OK, etree.tostring(root)
Esempio n. 20
0
File: amqp.py Progetto: brtsz/zato
 def handle(self, *args, **kwargs):
     """ Returns, serialized to XML, the definitions def_amqp_list finds for the
     cluster whose ID _get_params reads from the 'payload' keyword argument.

     Each row becomes a 'definition' child of a 'definition_list' element.
     The result is a (ZATO_OK, XML string) tuple.
     """
     # Attributes copied verbatim from each row, in the order they are emitted
     attr_names = ('id', 'name', 'host', 'port', 'vhost', 'username', 'frame_max', 'heartbeat')

     request = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')

     with closing(self.server.odb.session()) as session:
         root = Element('definition_list')

         for row in def_amqp_list(session, request['cluster_id']):
             elem = Element('definition')
             for attr_name in attr_names:
                 setattr(elem, attr_name, getattr(row, attr_name))
             root.append(elem)

         return ZATO_OK, etree.tostring(root)
Esempio n. 21
0
File: ftp.py Progetto: brtsz/zato
    def handle(self, *args, **kwargs):
        """ Returns, serialized to XML, the rows out_ftp_list finds for the cluster
        whose ID _get_params reads from the 'payload' keyword argument.

        Each row becomes an 'item' child of an 'item_list' element.
        The result is a (ZATO_OK, XML string) tuple.
        """
        # Attributes copied verbatim from each row, in the order they are emitted
        attr_names = ('id', 'name', 'is_active', 'host', 'port', 'user', 'acct',
            'timeout', 'dircache')

        request = _get_params(kwargs.get('payload'), ['cluster_id'], 'data.')

        with closing(self.server.odb.session()) as session:
            root = Element('item_list')

            for row in out_ftp_list(session, request['cluster_id']):
                elem = Element('item')
                for attr_name in attr_names:
                    setattr(elem, attr_name, getattr(row, attr_name))
                root.append(elem)

            return ZATO_OK, etree.tostring(root)
Esempio n. 22
0
    def getvalue(self, serialize=True):
        """ Returns the payload's value as XML or JSON, depending on self.zato_is_xml.

        With serialize left at True a string is returned, otherwise the result
        is the top-level element (XML) or dictionary (JSON) the string would
        have been produced from. In JSON, search metadata found in self.zato_meta
        is added under the '_meta' key.
        """
        # An empty container to return in case there is no output at all
        if self.zato_is_xml:
            value = Element('item_list' if self.zato_output_repeated else 'item')
        else:
            value = [] if self.zato_output_repeated else {}

        if self.zato_output_repeated:
            rows = self.zato_output
        else:
            # A single row made out of the attributes assigned directly to self
            attr_names = set(dir(self)) & self.zato_all_attrs
            rows = [dict((attr_name, getattr(self, attr_name)) for attr_name in attr_names)]

        if rows:

            # Checking the first row only suffices because all of them must be of the same type
            is_sa_namedtuple = isinstance(rows[0], KeyedTuple)

            for row in rows:
                out_item = Element('item') if self.zato_is_xml else {}

                for is_required, name in chain(self.zato_required, self.zato_optional):
                    elem_value = self._getvalue(name, row, is_sa_namedtuple, is_required, isinstance(name, AsIs))

                    if isinstance(name, ForceType):
                        name = name.name

                    # Byte strings are turned into unicode objects
                    if isinstance(elem_value, basestring) and not isinstance(elem_value, unicode):
                        elem_value = elem_value.decode('utf-8')

                    if self.zato_is_xml:
                        setattr(out_item, name, elem_value)
                    else:
                        out_item[name] = elem_value

                if self.zato_output_repeated:
                    value.append(out_item)
                else:
                    value = out_item

        # Wrap the value in the response element
        if self.zato_is_xml:
            em = ElementMaker(annotate=False, namespace=self.namespace, nsmap={None: self.namespace})
            zato_env = em.zato_env(em.cid(self.zato_cid), em.result(ZATO_OK))
            top = getattr(em, self.response_elem)(zato_env)
            top.append(value)
        else:
            top = {self.response_elem: value}
            search = self.zato_meta.get('search')
            if search:
                top['_meta'] = search

        if not serialize:
            return top

        if self.zato_is_xml:
            deannotate(top, cleanup_namespaces=True)
            return etree.tostring(top)

        return dumps(top)
Esempio n. 23
0
    def getvalue(self, serialize=True):
        """ Returns the payload's value as XML or JSON, depending on self.zato_is_xml.

        With serialize left at True a string is returned, otherwise the result
        is the top-level element (XML) or dictionary (JSON) the string would
        have been produced from.
        """
        # An empty container to return in case there is no output at all
        if self.zato_is_xml:
            value = Element('item_list' if self.zato_output_repeated else 'item')
        else:
            value = [] if self.zato_output_repeated else {}

        if self.zato_output_repeated:
            rows = self.zato_output
        else:
            # A single row made out of the attributes assigned directly to self
            attr_names = set(dir(self)) & self.zato_all_attrs
            rows = [dict((attr_name, getattr(self, attr_name)) for attr_name in attr_names)]

        if rows:

            # Checking the first row only suffices because all of them must be of the same type
            is_sa_namedtuple = isinstance(rows[0], KeyedTuple)

            for row in rows:
                out_item = Element('item') if self.zato_is_xml else {}

                for is_required, name in chain(self.zato_required, self.zato_optional):
                    elem_value = self._getvalue(name, row, is_sa_namedtuple, is_required, isinstance(name, AsIs))

                    if isinstance(name, ForceType):
                        name = name.name

                    # Byte strings are turned into unicode objects
                    if isinstance(elem_value, basestring) and not isinstance(elem_value, unicode):
                        elem_value = elem_value.decode('utf-8')

                    if self.zato_is_xml:
                        setattr(out_item, name, elem_value)
                    else:
                        out_item[name] = elem_value

                if self.zato_output_repeated:
                    value.append(out_item)
                else:
                    value = out_item

        # Wrap the value in the response element
        if self.zato_is_xml:
            em = ElementMaker(annotate=False, namespace=self.namespace, nsmap={None:self.namespace})
            zato_env = em.zato_env(em.cid(self.zato_cid), em.result(ZATO_OK))
            top = getattr(em, self.response_elem)(zato_env)
            top.append(value)
        else:
            top = {self.response_elem: value}

        if not serialize:
            return top

        if self.zato_is_xml:
            deannotate(top, cleanup_namespaces=True)
            return etree.tostring(top)

        return dumps(top)
Esempio n. 24
0
    def convert(self,
                param,
                param_name,
                value,
                has_simple_io_config,
                is_xml,
                date_time_format=None):
        """ Converts a parameter's value to the type its declaration or its name calls for.

        param - the parameter's declaration, checked against Boolean, CSV, List, Integer,
            Unicode and UTC
        param_name - the parameter's name, matched against self.bool_parameter_prefixes,
            self.int_parameters and self.int_parameter_suffixes
        value - the value to convert
        has_simple_io_config - if false, the name-based conversion to an integer is skipped
        is_xml - for List parameters, tells whether XML is being parsed or produced
            as opposed to the value being a JSON list
        date_time_format - an optional strftime format applied to datetime values

        Returns the converted value. Values that are not true (None, '', 0, False) are
        returned without any type-specific conversion, except that a CSV parameter
        always yields a list. Any exception is logged and re-raised as a ZatoException.
        """
        try:
            if any(
                    param_name.startswith(prefix)
                    for prefix in self.bool_parameter_prefixes) or isinstance(
                        param, Boolean):
                value = asbool(
                    value or None
                )  # value can be an empty string and asbool chokes on that

            # Only true values are converted, anything else, including a 0, is left as it is
            if value:
                if isinstance(param, Boolean):
                    value = asbool(value)

                elif isinstance(param, CSV):
                    value = value.split(',')

                elif isinstance(param, List):
                    if is_xml:
                        # We are parsing XML to create a SIO request
                        if isinstance(value, EtreeElement):
                            return [elem.text for elem in value.getchildren()]

                        # We are producing XML out of an SIO response
                        else:
                            wrapper = Element(param_name)
                            for item_value in value:
                                xml_item = Element('item')
                                wrapper.append(xml_item)
                                wrapper.item[-1] = item_value
                            return wrapper

                    # This is a JSON list
                    return value

                elif isinstance(param, Integer):
                    value = int(value)

                elif isinstance(param, Unicode):
                    value = unicode(value)

                elif isinstance(param, UTC):
                    value = value.replace('+00:00', '')

                else:
                    # No explicit type was given so the parameter's name decides
                    # whether the value is an integer
                    if value and value != ZATO_NONE and has_simple_io_config:
                        if any(param_name==elem for elem in self.int_parameters) or \
                           any(param_name.endswith(suffix) for suffix in self.int_parameter_suffixes):
                            value = int(value)

                if date_time_format and isinstance(value, datetime):
                    value = value.strftime(date_time_format)

            # An empty CSV value is always returned as an empty list
            if isinstance(param, CSV) and not value:
                value = []

            return value
        except Exception:
            # format_exc picks up the exception being handled on its own,
            # its only positional argument is a limit on the traceback's depth
            msg = 'Conversion error, param:[{}], param_name:[{}], repr(value):[{}], e:[{}]'.format(
                param, param_name, repr(value), format_exc())
            logger.error(msg)

            raise ZatoException(msg=msg)
Esempio n. 25
0
class Corpus(XMLBase):
    def __init__(self, xml_input=None, annotations=None):
        """Create a corpus and optionally populate it from ``xml_input``.

        :param xml_input: path of a single XML file, or of a folder handed to
            ``read_from_folder``; falsy means "start empty".
        :param annotations: optional object whose ``documents`` attribute is
            stored as ``self.annotations`` (``None`` when not given).
        :raises FileNotFoundError: if ``xml_input`` is given but does not exist.
        """
        super().__init__("corpus", "document")
        self.corpus = Element("corpus")
        self.url_indices = []
        self.has_terms_locations = False

        stanza_processors = {
            "tokenize": "gum",
            "ner": "default",
            "lemma": "gum",
            "pos": "gum",
            "depparse": "gum"
        }
        # NOTE(review): a pipeline is built for every Corpus instance,
        # including the throw-away ones created by the sampling helpers.
        self.nlp = stanza.Pipeline("en",
                                   processors=stanza_processors,
                                   verbose=False,
                                   tokenize_no_ssplit=True)

        self.annotations = annotations.documents if annotations else None

        if not xml_input:
            return
        if not os.path.exists(xml_input):
            raise FileNotFoundError(
                f"{xml_input} not found. Check the path again.")
        if os.path.isfile(xml_input):
            self.read_from_xml(xml_input)
        else:
            self.read_from_folder(xml_input)

    @staticmethod
    def unicodify(text):
        return text.replace("“", "\"")\
            .replace("”", "\"")\
            .replace("’", "'")\
            .replace("‘", "'") \
            .replace("\n", " ")

    def add_document(self,
                     url,
                     title,
                     categories,
                     published_time,
                     content,
                     author=None,
                     topics=None,
                     links=None,
                     terms=None,
                     document_id=None):
        """Build a ``document`` element and append it to the corpus.

        A URL that is already listed in ``self.url_indices`` is logged and
        ignored.  When ``document_id`` is ``None`` or empty, the id is the last
        six hex digits of the MD5 of the normalised title.

        :param terms: optional mapping ``word -> iterable of (begin, end)``.
        :raises KeyError: if ``url`` is ``None`` or empty.
        """
        if url is None or len(url) == 0:
            raise KeyError("'url' is mandatory")
        if url in self.url_indices:
            log.info(f"Ignoring duplicate URL={url}")
            return

        doc = Element("document")
        clean_title = Corpus.unicodify(title)
        if document_id is None or len(document_id) == 0:
            doc.document_id = md5(clean_title.encode("utf-8")).hexdigest()[-6:]
        else:
            doc.document_id = document_id
        doc.url = url
        doc.title = clean_title
        doc.author = author
        doc.published_time = published_time

        # Every list-valued field gets its container element, filled only
        # when there is something to put in it.
        simple_lists = (
            ("categories", "category", categories),
            ("topics", "topic", topics),
            ("links", "link", links),
        )
        for container_tag, item_tag, items in simple_lists:
            setattr(doc, container_tag, Element(container_tag))
            if items:
                setattr(getattr(doc, container_tag), item_tag, items)

        doc.content = Element("content")
        if content:
            doc.content.p = [
                Corpus.unicodify(paragraph) for paragraph in content
                if paragraph
            ]

        # Terms: one <term> per word, each with its list of <location> spans.
        doc.terms = Element("terms")
        if terms:
            term_nodes = []
            for word in terms:
                term_node = Element("term")
                term_node.word = word
                term_node.locations = Element("locations")
                span_nodes = []
                for span in terms[word]:
                    span_node = Element("location")
                    span_node.begin, span_node.end = span
                    span_nodes.append(span_node)
                term_node.locations.location = span_nodes
                term_nodes.append(term_node)
            doc.terms.term = term_nodes

        self.corpus.append(doc)
        self.url_indices.append(url)

    def add_document_from_element(self, document_elmt):
        """Copy an existing ``document`` element into this corpus.

        The element's fields are read back as text / lists of text and passed
        to ``add_document``; containers without children are passed as ``None``.
        """

        def child_texts(container, child_tag):
            # Only look the children up when the container reports at least one.
            if container.countchildren() > 0:
                return [child.text for child in getattr(container, child_tag)]
            return None

        # word -> [(begin, end), ...], skipping terms that carry no location
        spans_by_word = {}
        if document_elmt.terms.countchildren() > 0:
            for term in document_elmt.terms.term:
                if term.locations.countchildren() > 0:
                    spans_by_word[term.word.text] = [
                        (loc.begin.text, loc.end.text)
                        for loc in term.locations.location
                    ]

        self.add_document(
            url=document_elmt.url.text,
            title=document_elmt.title.text,
            categories=child_texts(document_elmt.categories, "category"),
            published_time=document_elmt.published_time.text,
            content=child_texts(document_elmt.content, "p"),
            author=document_elmt.author.text,
            topics=child_texts(document_elmt.topics, "topic"),
            links=child_texts(document_elmt.links, "link"),
            terms=spans_by_word if spans_by_word else None,
            document_id=document_elmt.document_id,
        )

    def filter_empty(self):
        """Remove every document whose ``content`` has no children; return ``self``."""
        # Collect first, remove afterwards: removal happens outside the iteration.
        without_content = [
            doc for doc in self.iter_documents()
            if doc.content.countchildren() == 0
        ]
        for doc in without_content:
            self.get_root().remove(doc)
        return self

    def read_from_xml(self, input_path):
        """Parse the XML file at ``input_path`` and add its documents.

        Each child of the root is treated as a document; its children become
        keyword arguments of ``add_document``:

        * ``category``  -> ``categories`` (text split on ``;``),
        * ``terms``     -> parsed into ``word -> [(begin, end), ...]``,
        * other list-valued tags -> list of the children's text,
        * anything else -> the element's text.

        Terms come from ``self.annotations`` when it has an entry for the
        document id, otherwise from the file.  When annotations were supplied
        but cover neither case, the document is skipped.
        """
        list_tags = ("terms", "topics", "content", "links", "categories")

        def collect_terms(terms_node, into):
            # Fill ``into`` with word -> [(begin, end), ...] from one <terms> node.
            for term_node in terms_node:
                word = None
                spans = []
                for part in term_node:
                    if part.tag == "word":
                        word = part.text
                    elif part.tag == "locations":
                        # begin/end are reset once per <locations>, not per
                        # <location>: a missing bound keeps the previous value.
                        begin = end = None
                        for location_node in part:
                            for bound in location_node:
                                if bound.tag == "begin":
                                    begin = int(bound.text)
                                elif bound.tag == "end":
                                    end = int(bound.text)
                            spans.append((begin, end))
                into[word] = spans

        for doc_node in etree.parse(input_path).getroot():
            fields = {}
            file_terms = {}
            for child in doc_node:
                tag = child.tag
                if tag == "category":
                    fields["categories"] = child.text.split(
                        ";") if child.text else []
                elif tag == "terms":  # the document has existing annotations
                    collect_terms(child, file_terms)
                elif tag in list_tags:
                    fields[tag] = [item.text for item in child]
                else:
                    fields[tag] = child.text

            with_terms = True
            if self.annotations and fields["document_id"] in self.annotations:
                # annotation file wins over whatever the XML carries
                fields["terms"] = self.annotations[fields["document_id"]]
            elif file_terms:
                # no usable annotation entry, but the XML had at least one term
                fields["terms"] = file_terms
            elif self.annotations is None:
                # no annotation file and no terms: plain document
                with_terms = False
            else:
                continue

            self.add_document(**fields)
            if with_terms:
                self.has_terms_locations = True  # at least 1 with terms

    def read_from_folder(self, root_folder):
        """Read every ``*.xml`` file sitting directly inside a sub-folder of ``root_folder``.

        Files at the top level of ``root_folder`` and deeper nesting levels are
        not visited.
        """
        for entry in os.listdir(root_folder):
            subfolder = os.path.join(root_folder, entry)
            if not os.path.isdir(subfolder):
                continue
            for filename in os.listdir(subfolder):
                if filename.endswith(".xml"):
                    self.read_from_xml(os.path.join(subfolder, filename))

    def get_document_ids(self):
        """Return the ``document_id`` attribute of every document, in iteration order."""
        collected = []
        for doc in self.iter_documents():
            collected.append(doc.document_id)
        return collected

    def get_sample(self, n, excluded_ids=None):
        """Return a new ``Corpus`` holding up to ``n`` randomly chosen documents.

        Documents whose id is in ``excluded_ids`` are passed over.  Fewer than
        ``n`` documents are returned when the corpus runs out of candidates.
        """
        sample_corpus = Corpus()
        shuffled = list(range(len(self)))
        random.shuffle(shuffled)
        taken = 0
        for position in shuffled:
            if taken >= n:
                break
            candidate = self[position]
            candidate_id = candidate.document_id.text
            if excluded_ids and candidate_id in excluded_ids:
                continue
            sample_corpus.add_document_from_element(candidate)
            taken += 1
        return sample_corpus

    def get_more_sample(self, n, json1_filename):
        """Sample ``n`` documents that are not already listed in a JSON-lines file.

        Each line of ``json1_filename`` must be a JSON object with a ``"text"``
        key; the part before the first ``|`` is hashed the same way
        ``add_document`` derives ids from titles, and the resulting ids are
        excluded from the sample.
        """

        def id_from_line(raw_line):
            first_field = json.loads(raw_line)["text"].split("|")[0]
            return md5(first_field.encode("utf-8")).hexdigest()[-6:]

        with open(json1_filename, "r") as json1_file:
            raw_lines = json1_file.readlines()
        already_present = [id_from_line(raw_line) for raw_line in raw_lines]
        return self.get_sample(n, already_present)

    def get_documents_by_ids(self, ids):
        """Return a new ``Corpus`` with the documents whose id is in ``ids``.

        :param ids: container of document-id strings.
        """
        subset_corpus = Corpus()
        for document in self:
            # Compare on the id's text, like get_sample / get_documents_by_urls
            # do, rather than on the element object: membership then no longer
            # depends on how the element type compares or hashes (an all-digit
            # id may be typed as a number and never equal a string).
            if document.document_id.text in ids:
                subset_corpus.add_document_from_element(document)
        return subset_corpus

    def get_documents_by_urls(self, urls):
        """Return a new ``Corpus`` with the documents whose URL text is in ``urls``."""
        subset_corpus = Corpus()
        wanted = (doc for doc in self if doc.url.text in urls)
        for doc in wanted:
            subset_corpus.add_document_from_element(doc)
        return subset_corpus

    def get_annotated_terms_as_csv(self, csv_path):
        """Write one CSV row per document: its id and its lower-cased terms.

        The file has the columns ``document_id`` and ``terms``; terms are
        joined with ``|``.  A document without any term gets an empty
        ``terms`` cell.

        :param csv_path: destination file, overwritten if it exists.
        :return: ``True`` once the file has been written.
        """
        # newline="" leaves line endings to the csv module, as its
        # documentation requires; otherwise platforms that translate "\n"
        # end up with doubled carriage returns.
        with open(csv_path, "w", newline="") as csv_file:
            csv_writer = DictWriter(csv_file, ["document_id", "terms"])
            csv_writer.writeheader()
            for doc in self.iter_documents():
                # add_document always creates <terms> but only fills it when
                # terms were given, so guard before touching ``.term`` (same
                # check add_document_from_element performs).
                if doc.terms.countchildren() > 0:
                    all_terms = [
                        term.word.text.lower() for term in doc.terms.term
                    ]
                else:
                    all_terms = []
                csv_writer.writerow({
                    "document_id": doc.document_id.text,
                    "terms": "|".join(all_terms)
                })
        return True

    def train_test_split(self, test_size, random_seed=1337):
        """Split the corpus into a development and a test corpus.

        Documents are shuffled deterministically; the first
        ``len(self) * test_size`` of them go to the test corpus and the rest
        to the development corpus.

        :param test_size: fraction of the documents (0..1) for the test corpus.
        :param random_seed: seed passed to ``random.seed``; note that this
            reseeds the module-level generator.
        :return: ``(dev_corpus, test_corpus)``.
        """
        dev_c = Corpus()
        test_c = Corpus()
        n_test = len(self) * test_size
        indices = list(range(len(self)))
        random.seed(random_seed)
        random.shuffle(indices)
        for position, index in enumerate(indices):
            document = self[index]
            # The original condition was inverted: it put the ``test_size``
            # share into the dev corpus and everything else into the test one.
            if position < n_test:
                test_c.add_document_from_element(document)
            else:
                dev_c.add_document_from_element(document)
        return dev_c, test_c

    def annotate_sentence(self, sentence, buffer_offset, term_locs=None):
        """Run ``self.nlp`` on ``sentence`` and return per-token annotations.

        :param sentence: text handed to the pipeline.
        :param buffer_offset: added to the ``start_char`` / ``end_char`` values
            read from each word's ``misc`` field, so that offsets are relative
            to the caller's larger text.
        :param term_locs: optional sorted list of term boundaries; when it is
            non-empty every token also receives a ``term_tag``.
        :return: a list with one entry per sentence found in the pipeline
            output; each entry is a list of dicts with the keys ``id``,
            ``word``, ``pos``, ``lemma``, ``deprel``, ``deprel_head_id``,
            ``deprel_head_text``, ``character_offset_begin``,
            ``character_offset_end``, ``ner`` and, optionally, ``term_tag``.
        """
        # Indexed by ``bisect(term_locs, start_char) % 3``.  Presumably
        # term_locs holds three sorted boundaries per term span (the way
        # write_to_core_nlp_xmls builds it), so the remainder cycles
        # outside -> begin -> inside -- TODO confirm for overlapping spans.
        term_state = ["O", "B-TERM", "I-TERM"]
        annotated_text = self.nlp(sentence)
        annotated_sentences = []
        # word id -> word text, used to resolve dependency heads; id 0 is the
        # root.  The dict is shared by all sentences of this call, so ids of
        # later sentences overwrite earlier ones.
        head_dict = {0: "root"}
        # NOTE: the loop variable shadows the ``sentence`` parameter.
        for sentence in annotated_text.sentences:
            annotated_sentence = []
            for token in sentence.tokens:
                if len(token.words) > 1:
                    # Tokens made of several words are only logged and skipped.
                    # NOTE(review): their ids never reach head_dict, so a kept
                    # token whose head is one of them may raise KeyError below.
                    log.info(token)
                else:
                    word = token.words[0]
                    # ``misc`` is parsed as "key=value" pairs separated by "|".
                    misc = dict(
                        token_misc.split("=")
                        for token_misc in word.misc.split("|"))
                    word_id = int(word.id)
                    head_dict[word_id] = word.text
                    start_char = buffer_offset + int(misc["start_char"])
                    end_char = buffer_offset + int(misc["end_char"])
                    annotations = {
                        "id": word_id,
                        "word": word.text,
                        "pos": word.xpos,
                        "lemma": word.lemma,
                        "deprel": word.deprel,
                        "deprel_head_id": word.head,
                        "character_offset_begin": start_char,
                        "character_offset_end": end_char,
                        "ner": token.ner
                    }
                    if term_locs is not None and len(term_locs) > 0:
                        annotations["term_tag"] = term_state[bisect(
                            term_locs, start_char) % 3]
                    annotated_sentence.append(annotations)
            # Second pass: resolve head texts (all ids of the sentence are
            # known now) and refine the term tags using the following token.
            for i, token in enumerate(annotated_sentence):
                token["deprel_head_text"] = head_dict[token["deprel_head_id"]]
                if "term_tag" in token:
                    # hacky way, should fix write_to_core_nlp_xmls insort usage
                    # if token["term_tag"][0] == "I" and (i == 0 or annotated_sentence[i-1]["term_tag"][0] == "O"):
                    #     if i == len(annotated_sentence) - 1 or annotated_sentence[i+1]["term_tag"][0] != "I":
                    #         token["term_tag"] = "S" + token["term_tag"][1:]
                    #     else:
                    #         token["term_tag"] = "B" + token["term_tag"][1:]
                    # el
                    # A token that is last in the sentence, or whose successor
                    # is not tagged "I...", closes its term: "B-TERM" becomes
                    # "S-TERM" and "I-TERM" becomes "E-TERM".
                    if i == len(annotated_sentence) - 1 or annotated_sentence[
                            i + 1]["term_tag"][0] != "I":
                        if token["term_tag"][0] == "B":
                            token["term_tag"] = "S" + token["term_tag"][1:]
                        elif token["term_tag"][0] == "I":
                            token["term_tag"] = "E" + token["term_tag"][1:]
            annotated_sentences.append(annotated_sentence)
        return annotated_sentences

    def write_to_core_nlp_xmls(self, output_folder):
        """Annotate every document and write it as ``<document_id>.xml``.

        Documents whose output file already exists in ``output_folder`` are
        skipped.  Title and content sentences are annotated with
        ``annotate_sentence``; character offsets run over the title followed by
        the content sentences, each followed by one separator character.

        :param output_folder: existing directory receiving the XML files.
        """
        # List the folder once instead of once per document; files written by
        # this run are added to the snapshot below.
        already_written = set(os.listdir(output_folder))
        for document in tqdm(self.iter_documents(), total=len(self)):
            document_id = document.document_id.text
            filename = f"{document_id}.xml"
            if filename in already_written:
                continue

            buffer_offset = 0
            title = document.title.text

            # Three sorted boundaries per term span (begin - 0.5, begin + 0.5,
            # end); annotate_sentence derives the term tag from them.
            term_locs = []
            # has_terms_locations only says that *some* document has terms, so
            # this document's (and each term's) container may still be empty.
            if self.has_terms_locations and document.terms.countchildren() > 0:
                for term in document.terms.term:
                    if term.locations.countchildren() == 0:
                        continue
                    for location in term.locations.location:
                        begin = int(location.begin.text)
                        insort(term_locs, begin - 0.5)
                        insort(term_locs, begin + 0.5)
                        insort(term_locs, int(location.end.text))

            annotated_title = self.annotate_sentence(
                title, buffer_offset, term_locs)
            buffer_offset += len(title) + 1

            annotated_content = []
            if document.content.countchildren() > 0:
                for p in document.content.p:
                    # An empty <p/> has no text at all.
                    text = p.text.strip() if p.text else ""
                    if not text:
                        continue
                    for p_sent in nltk.tokenize.sent_tokenize(text):
                        annotated_content += self.annotate_sentence(
                            p_sent, buffer_offset, term_locs)
                        buffer_offset += len(p_sent) + 1

            core_nlp_document = StanfordCoreNLPDocument()
            core_nlp_document.from_sentences(annotated_title,
                                             annotated_content)
            core_nlp_document.write_xml_to(
                os.path.join(output_folder, filename))
            already_written.add(filename)

    def write_to_jsonl(self, jsonl_path):
        """Write one JSON object per line and per document to ``jsonl_path``.

        Each object has a ``"text"`` key (title and paragraph texts joined with
        ``|``) and a ``"meta"`` key carrying the document id.
        """
        with open(jsonl_path, "w") as out_file:
            for document in self.iter_documents():
                doc_id = document.document_id.text
                pieces = [document.title.text]
                pieces = pieces + [p.text for p in document.content.p]
                record = {"text": "|".join(pieces), "meta": {"doc_id": doc_id}}
                # NOTE(review): html.unescape receives the dict, not the text;
                # this looks like a no-op.  Confirm before "fixing" it, since
                # unescaping the text would shift its character offsets.
                json.dump(html.unescape(record), out_file)
                out_file.write("\n")
Esempio n. 26
0
    def getvalue(self, serialize=True):
        """ Returns the payload's value, as XML or JSON depending on the
        zato_is_xml flag. With serialize=True the result is a string,
        otherwise the underlying element or dict is returned.
        """
        is_xml = self.zato_is_xml
        is_repeated = self.zato_is_repeated

        # Container for the result and the list of source items to convert
        if is_repeated:
            value = Element('item_list') if is_xml else []
            output = self.zato_output
        else:
            value = Element('item') if is_xml else {}
            attr_names = set(dir(self)) & self.zato_all_attrs
            output = [dict((name, getattr(self, name)) for name in attr_names)]

        def build_item(item, is_sa_namedtuple):
            # Converts a single source item into an XML element or a dict
            out_item = Element('item') if is_xml else {}
            for is_required, name in chain(self.zato_required, self.zato_optional):
                leave_as_is = isinstance(name, AsIs)
                if isinstance(name, ForceType):
                    name = name.name

                elem_value = self._getvalue(name, item, is_sa_namedtuple, is_required, leave_as_is)

                if is_xml:
                    setattr(out_item, name, elem_value)
                else:
                    out_item[name] = elem_value
            return out_item

        if output:

            # All elements must be of the same type so checking the first one suffices
            is_sa_namedtuple = isinstance(output[0], NamedTuple)

            for item in output:
                out_item = build_item(item, is_sa_namedtuple)
                if is_repeated:
                    value.append(out_item)
                else:
                    value = out_item

        if is_xml:
            top = Element(self.response_elem)
            top.append(value)
            return etree.tostring(top) if serialize else top

        top = {self.response_elem: value}
        return dumps(top) if serialize else top