Esempio n. 1
0
    def download_db(self):
        """Parse the HMDB XML dump into a DataFrame and save it as gzipped CSV.

        The dump is expected in ``<self.tmp_dir>/HMDB``; when that directory
        is missing or empty it is created and filled via
        ``self._unzip_hmdb``.  Every ``{http://www.hmdb.ca}metabolite``
        element of the first file listed in the directory is converted to a
        row with ``self._create_dict``.  Columns named in the module-level
        ``categories`` are renamed to the part after the ``}`` of their name,
        and the result is written to ``self.out_name``.
        """
        out_dir = os.path.join(self.tmp_dir, 'HMDB')
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        if not os.listdir(out_dir):
            self._unzip_hmdb(out_dir)

        logger.info("Parsing metabolites information from files")
        xml_path = os.path.join(out_dir, os.listdir(out_dir)[0])
        # Only completed elements are used, so "start" events are not
        # requested from the parser.
        df = pd.DataFrame([
            self._create_dict(e)
            for ev, e in ElementTree.iterparse(xml_path, events=("end",))
            if e.tag == '{http://www.hmdb.ca}metabolite' and ev == 'end'
        ])

        # Assign-then-delete (rather than DataFrame.rename) keeps the
        # original behaviour of moving each renamed column to the end.
        for i in categories:
            if i in df.columns:
                df[i.split('}')[1]] = df[i]
                del df[i]

        # ``tupleize_cols`` is intentionally not passed: it only affected
        # MultiIndex columns and was removed from ``to_csv`` in pandas 1.0,
        # where passing it raises TypeError.
        df.to_csv(self.out_name,
                  index=False,
                  encoding='utf-8',
                  compression='gzip')
        logger.info("Done processing HMDB")
  def Parse(self, search_file, table_name, file_prefix=None):
    """Parser entry point.

    Parses the given POI file to POI elements, based on POI elements builds
    SQL statements for creating and populating POI table in POI database and
    triggers the DB updater to implement these SQL statements.

    Args:
      search_file: string containing absolute path to .poi file to be parsed.
      table_name: string containing name to use for POI table creation.
      file_prefix: optional value stored on the parser as _file_prefix and
        written to the log; defaults to None.
    Returns:
      num_fields: number of fields in search schema.
      sql_search: string containing SQL statement to execute for POI query.
      balloon_style: string containing associated balloon style.

    Raises:
      exceptions.SearchSchemaParserException exception.
      psycopg2.Warning/Error exceptions.
    """
    self._table_name = table_name
    self._file_prefix = file_prefix
    logger.info("Ingesting POI file %s into parser...", search_file)
    if file_prefix is None:
      logger.info("File prefix is None")
    else:
      logger.info("File prefix is '%s'", file_prefix)
    self.__StartDocument()
    try:
      context = ET.iterparse(search_file, SearchSchemaParser.EVENTS)
    # "except X as e" is accepted by Python 2.6+ and Python 3, whereas the
    # old "except X, e" form is a SyntaxError on Python 3.
    # NOTE(review): iterparse parses lazily, so a ParseError is presumably
    # raised while iterating `context` rather than by this call -- confirm
    # that the iteration is guarded as well.
    except ET.ParseError as e:
      row, column = e.position
      raise exceptions.SearchSchemaParserException(
          "Unable to parse POI file %s."
          " A parsing error on row %d column %d: %s" % (
              search_file, row, column, e))
Esempio n. 3
0
 def parse(self):
     """Stream ``self.file`` and collect its objects and links.

     ``attribute`` elements are handed to ``self.attribute_text_to_data``
     together with the record currently being built.  When an ``object`` or
     ``link`` element closes, the record receives the element's
     ``operation`` (if present); if the element carries both ``name`` and
     ``ucmdb_id`` the record is stored in ``self.components`` (objects) or
     ``self.relations`` (links) keyed by ``ucmdb_id``.  Links additionally
     copy ``DiscoveryID1`` / ``DiscoveryID2`` from the record's ``data`` to
     ``source_id`` / ``target_id``.  A fresh record is started after every
     object or link, stored or not.
     """
     record = self.new_element()
     for event, node in ET.iterparse(self.file):
         if event == 'end':
             kind = node.tag
             attrs = node.attrib
             if kind == 'attribute':
                 self.attribute_text_to_data(node, record)
             elif kind in ('object', 'link'):
                 if 'operation' in attrs:
                     record['operation'] = attrs['operation']
                 if 'name' in attrs and 'ucmdb_id' in attrs:
                     record['name'] = attrs['name']
                     record['ucmdb_id'] = attrs['ucmdb_id']
                     if kind == 'link':
                         # Endpoints come from attributes gathered earlier.
                         if 'DiscoveryID1' in record['data']:
                             record['source_id'] = \
                                 record['data']['DiscoveryID1']
                         if 'DiscoveryID2' in record['data']:
                             record['target_id'] = \
                                 record['data']['DiscoveryID2']
                         self.relations[record['ucmdb_id']] = record
                     else:
                         self.components[record['ucmdb_id']] = record
                 record = self.new_element()
         # Release the element's content once it has been handled.
         node.clear()
Esempio n. 4
0
def iterparse_elements(element_function, file_or_path, **kwargs):
    """
    Stream an XML document and call element_function on every element.

    The function is invoked as ``element_function(elem, **kwargs)`` each time
    an element's "end" event is reported, i.e. once the element has been read
    completely.  After every call the first element seen (the root) is
    cleared so already-processed content does not pile up in memory.

    file_or_path may be a path or an object; when it has a ``name``
    attribute, that name is what gets handed to iterparse.

    Nothing happens (and None is returned) when element_function has no
    ``__call__`` attribute.
    """

    if not hasattr(element_function, '__call__'):
        return

    source = getattr(file_or_path, 'name', file_or_path)
    document_root = None  # Remembered so it can be cleared after each call

    for kind, element in iterparse(source, events=('start', 'end')):
        # The very first event is the root's "start"; keep hold of it.
        if document_root is None:
            document_root = element
        if kind != 'end':
            continue  # Element is not fully read yet
        element_function(element, **kwargs)
        document_root.clear()  # Descendants will not be accessed again
Esempio n. 5
0
File: s3c.py Progetto: segator/s3ql
    def list(self, prefix='', start_after=''):
        """Yield the keys below *prefix*, with ``self.prefix`` stripped.

        Keys are requested in batches (``max-keys`` of 1000) and parsed
        incrementally from ``self.conn``; further batches are requested for
        as long as the reply's ``IsTruncated`` element reads ``true``.  When
        *start_after* is non-empty it is sent (with ``self.prefix``
        prepended) as the initial marker.

        Raises RuntimeError if the reply has an unexpected content type, if
        its root element uses a namespace other than ``self.xml_ns_prefix``,
        or if no ``IsTruncated`` element was found in the body.
        """
        log.debug('started with %s, %s', prefix, start_after)

        # Without this, a call to list('foo') would result
        # in *prefix* being longer than *marker* - which causes
        # trouble for some S3 implementations (minio).
        marker = self.prefix + start_after if start_after else ''
        prefix = self.prefix + prefix

        keys_remaining = True
        while keys_remaining:
            log.debug('requesting with marker=%s', marker)

            keys_remaining = None
            query = {'prefix': prefix, 'marker': marker, 'max-keys': 1000}
            resp = self._do_request('GET', '/', query_string=query)

            if not XML_CONTENT_RE.match(resp.headers['Content-Type']):
                raise RuntimeError('unexpected content type: %s' %
                                   resp.headers['Content-Type'])

            try:
                events = iter(ElementTree.iterparse(self.conn,
                                                    events=("start", "end")))
                # The first event delivers the root element.
                (_, root) = next(events)

                ns_uri = self._tag_xmlns_uri(root)
                if ns_uri is None:
                    ns_prefix = ''
                else:
                    # Validate the XML namespace
                    ns_prefix = '{%s}' % (ns_uri, )
                    if ns_prefix != self.xml_ns_prefix:
                        log.error('Unexpected server reply to list operation:\n%s',
                                  self._dump_response(resp, body=None))
                        raise RuntimeError('List response has %s as root tag, unknown namespace' % root.tag)

                truncated_tag = ns_prefix + 'IsTruncated'
                contents_tag = ns_prefix + 'Contents'
                key_tag = ns_prefix + 'Key'

                for (kind, node) in events:
                    if kind != 'end':
                        continue

                    if node.tag == truncated_tag:
                        keys_remaining = (node.text == 'true')

                    elif node.tag == contents_tag:
                        # The last key seen becomes the next request's marker.
                        marker = node.findtext(key_tag)
                        yield marker[len(self.prefix):]
                        root.clear()

            except GeneratorExit:
                # Need to read rest of response
                self.conn.discard()
                break

            if keys_remaining is None:
                raise RuntimeError('Could not parse body')
Esempio n. 6
0
File: xml.py Progetto: muchrons/vmc
def iter_elements_by_name(handle, name: str):
    """Yield every completed element of *handle* whose tag equals *name*.

    The document is parsed incrementally.  After the consumer is done with a
    yielded element (i.e. when the generator is resumed) the root element is
    cleared.
    """
    parser = cElementTree.iterparse(handle, events=("start", "end"))
    # The first event hands over the root element, kept for clearing later.
    _, document_root = next(parser)  # pylint: disable=stop-iteration-return
    for kind, node in parser:
        if kind != "end" or node.tag != name:
            continue
        yield node
        document_root.clear()
Esempio n. 7
0
    def download_db(self):
        """Parse the HMDB XML dump into a DataFrame and save it as gzipped CSV.

        The dump is expected in ``<self.tmp_dir>/HMDB``; when that directory
        is missing or empty it is created and filled via
        ``self._unzip_hmdb``.  Every ``{http://www.hmdb.ca}metabolite``
        element of every file in the directory is converted to a row with
        ``self._create_dict``.  Columns named in the module-level
        ``categories`` are renamed to the part after the ``}`` of their name,
        a preview of the frame is printed, and the result is written to
        ``self.out_name``.
        """
        out_dir = os.path.join(self.tmp_dir, 'HMDB')
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        if not os.listdir(out_dir):
            self._unzip_hmdb(out_dir)

        print("Parsing metabolites information from files")
        # Collect rows across *all* files.  Rebinding the list inside the
        # loop kept only the last file's rows and left the name undefined
        # when the directory was empty.
        records = []
        for file_name in os.listdir(out_dir):
            # Only completed elements are used, so "start" events (and the
            # skipping of the root's start event) are not needed.
            context = ElementTree.iterparse(os.path.join(out_dir, file_name),
                                            events=("end",))
            records.extend(
                self._create_dict(e) for ev, e in context
                if e.tag == '{http://www.hmdb.ca}metabolite' and ev == 'end'
            )
        df = pd.DataFrame(records)

        # Assign-then-delete (rather than DataFrame.rename) keeps the
        # original behaviour of moving each renamed column to the end.
        for i in categories:
            if i in df.columns:
                df[i.split('}')[1]] = df[i]
                del df[i]
        print(df.head(10))
        for i in df.columns:
            print(i, df[i].head(10))

        # ``tupleize_cols`` is intentionally not passed: it only affected
        # MultiIndex columns and was removed from ``to_csv`` in pandas 1.0,
        # where passing it raises TypeError.
        df.to_csv(self.out_name, index=False, encoding='utf-8',
                  compression='gzip')
        print("Done processing HMDB")
Esempio n. 8
0
def is_svg(fp):
    """Return True if the first XML element read from *fp* is an SVG root.

    *fp* is rewound to offset 0 before parsing and again before returning.
    Content that fails to parse as XML yields False.
    """
    fp.seek(0)
    first_tag = None
    try:
        # Only the first "start" event is needed to learn the root's tag.
        first_event = next(iter(elementtree.iterparse(fp, ('start',))), None)
        if first_event is not None:
            first_tag = first_event[1].tag
    except elementtree.ParseError:
        pass
    fp.seek(0)
    return first_tag == '{http://www.w3.org/2000/svg}svg'
Esempio n. 9
0
    def list(self, prefix='', start_after=''):
        """Yield the keys below *prefix*, with ``self.prefix`` stripped.

        Keys are requested in batches (``max-keys`` of 1000) and parsed
        incrementally from ``self.conn``; further batches are requested for
        as long as the reply's ``IsTruncated`` element reads ``true``.  When
        *start_after* is non-empty it is sent (with ``self.prefix``
        prepended) as the initial marker; otherwise no marker is sent.

        Raises RuntimeError if the reply has an unexpected content type or
        if no ``IsTruncated`` element was found in the body.  Other
        exceptions are re-raised after disconnecting when
        ``is_temp_network_error`` reports a temporary network error.
        """
        log.debug('started with %s, %s', prefix, start_after)

        keys_remaining = True

        # Only send a marker when the caller asked to resume after a key.
        # Unconditionally using self.prefix + start_after would make a call
        # like list('foo') send a *marker* shorter than *prefix*, which
        # causes trouble for some S3 implementations (minio).
        if start_after:
            marker = self.prefix + start_after
        else:
            marker = ''
        prefix = self.prefix + prefix
        ns_p = self.xml_ns_prefix

        while keys_remaining:
            log.debug('requesting with marker=%s', marker)

            keys_remaining = None
            resp = self._do_request('GET', '/', query_string={ 'prefix': prefix,
                                                              'marker': marker,
                                                              'max-keys': 1000 })

            if not XML_CONTENT_RE.match(resp.headers['Content-Type']):
                raise RuntimeError('unexpected content type: %s' %
                                   resp.headers['Content-Type'])

            try:
                itree = iter(ElementTree.iterparse(self.conn, events=("start", "end")))
                # The first event delivers the root element, which is
                # cleared after each yielded key.
                (event, root) = next(itree)

                for (event, el) in itree:
                    if event != 'end':
                        continue

                    if el.tag == ns_p + 'IsTruncated':
                        keys_remaining = (el.text == 'true')

                    elif el.tag == ns_p + 'Contents':
                        # The last key seen becomes the next request's marker.
                        marker = el.findtext(ns_p + 'Key')
                        yield marker[len(self.prefix):]
                        root.clear()

            except Exception as exc:
                if is_temp_network_error(exc):
                    # We probably can't use the connection anymore
                    self.conn.disconnect()
                raise

            except GeneratorExit:
                # Need to read rest of response
                self.conn.discard()
                break

            if keys_remaining is None:
                raise RuntimeError('Could not parse body')
Esempio n. 10
0
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as badET
import defusedxml.cElementTree as goodET

# Small, well-formed sample document used by both fromstring() calls below.
xmlString = "<note>\n<to>Tove</to>\n<from>Jani</from>\n<heading>Reminder</heading>\n<body>Don't forget me this weekend!</body>\n</note>"

# unsafe: the four entry points (fromstring, parse, iterparse, XMLParser)
# reached through badET, the alias for the stdlib xml.etree.cElementTree.
# NOTE(review): presumably this script is a fixture for a static security
# check rather than something meant to run to completion (the file name
# passed to parse() suggests the file is absent) -- confirm.
tree = badET.fromstring(xmlString)
print(tree)
badET.parse("filethatdoesntexist.xml")
badET.iterparse("filethatdoesntexist.xml")
a = badET.XMLParser()

# safe: the same four entry points reached through goodET, the alias for
# defusedxml.cElementTree.
tree = goodET.fromstring(xmlString)
print(tree)
goodET.parse("filethatdoesntexist.xml")
goodET.iterparse("filethatdoesntexist.xml")
a = goodET.XMLParser()
Esempio n. 11
0
    def _parse_OVF(self, ovf):
        """Parses the OVF file

        Parses the OVF file for specified metadata properties. Interested
        properties must be specified in ovf-metadata.json conf file.

        The text of every element whose namespace is a key of ``CIM_NS`` and
        whose local tag name is in ``self.interested_properties`` is stored
        under ``CIM_NS[ns] + '_' + tag``, so the OVF file's qualified
        namespaces are removed from the included properties.

        The disk filename is the ``href`` of the ``File`` reference whose
        ``id`` equals the ``fileRef`` of the single ``Disk`` in the
        ``DiskSection``.

        :param ovf: a file object containing the OVF file
        :returns: a tuple of disk filename and a properties dictionary; the
                  filename is None when no File reference matches the disk
        :raises RuntimeError: if the OVF holds more than one disk, or is
                              malformed (no References section, a
                              DiskSection without a Disk, or a Disk/File
                              element lacking a required attribute)
        """
        def _get_namespace_and_tag(tag):
            """Separate and return the namespace and tag elements.

            There is no native support for this operation in elementtree
            package. See http://bugs.python.org/issue18304 for details.
            """
            m = re.match(r'\{(.+)\}(.+)', tag)
            if m:
                return m.group(1), m.group(2)
            else:
                return '', tag

        def _get_attribute(element, name):
            """Return the attribute of element whose local name is name.

            Raises RuntimeError (instead of leaking StopIteration from a
            bare next()) when the attribute is absent.
            """
            for key, value in element.items():
                if _get_namespace_and_tag(key)[1] == name:
                    return value
            raise RuntimeError(
                _('Malformed OVF file: element %(tag)s has no '
                  '%(attr)s attribute.') %
                {'tag': _get_namespace_and_tag(element.tag)[1],
                 'attr': name})

        disk_filename, file_elements, file_ref = None, None, None
        properties = {}
        for event, elem in ET.iterparse(ovf):
            if event != 'end':
                continue

            ns, tag = _get_namespace_and_tag(elem.tag)
            if ns in CIM_NS and tag in self.interested_properties:
                properties[CIM_NS[ns] + '_' +
                           tag] = (elem.text.strip() if elem.text else '')

            if tag == 'DiskSection':
                disks = [
                    child for child in elem
                    if _get_namespace_and_tag(child.tag)[1] == 'Disk'
                ]
                if len(disks) > 1:
                    # Currently only single disk image extraction is
                    # supported.
                    # FIXME(dramakri): Support multiple images in OVA package
                    raise RuntimeError(
                        _('Currently, OVA packages '
                          'containing multiple disk are '
                          'not supported.'))
                if not disks:
                    raise RuntimeError(
                        _('Malformed OVF file: DiskSection contains no '
                          'Disk element.'))
                file_ref = _get_attribute(disks[0], 'fileRef')

            if tag == 'References':
                file_elements = list(elem)

            # Clears elements to save memory except for 'File' and 'Disk'
            # references, which we will need to later access
            if tag != 'File' and tag != 'Disk':
                elem.clear()

        if file_elements is None:
            # Iterating None would otherwise surface as a TypeError.
            raise RuntimeError(
                _('Malformed OVF file: no References section found.'))

        # No early exit: if several File entries share the id, the last
        # one wins, as before.
        for file_element in file_elements:
            if _get_attribute(file_element, 'id') != file_ref:
                continue
            disk_filename = _get_attribute(file_element, 'href')

        return (disk_filename, properties)
Esempio n. 12
0
File: s3c.py Progetto: mkhon/s3ql
    def list(self, prefix='', start_after=''):
        """Yield the keys below *prefix*, with ``self.prefix`` stripped.

        Keys are requested in batches (``max-keys`` of 1000) and parsed
        incrementally from ``self.conn``; further batches are requested for
        as long as the reply's ``IsTruncated`` element reads ``true``.  When
        *start_after* is non-empty it is sent (with ``self.prefix``
        prepended) as the initial marker; otherwise no marker is sent.

        Raises RuntimeError if the reply has an unexpected content type, if
        its root element uses a namespace other than ``self.xml_ns_prefix``,
        or if no ``IsTruncated`` element was found in the body.  Other
        exceptions are re-raised after disconnecting when
        ``is_temp_network_error`` reports a temporary network error.
        """
        log.debug('started with %s, %s', prefix, start_after)

        keys_remaining = True

        # Only send a marker when the caller asked to resume after a key.
        # Unconditionally using self.prefix + start_after would make a call
        # like list('foo') send a *marker* shorter than *prefix*, which
        # causes trouble for some S3 implementations (minio).
        if start_after:
            marker = self.prefix + start_after
        else:
            marker = ''
        prefix = self.prefix + prefix

        while keys_remaining:
            log.debug('requesting with marker=%s', marker)

            keys_remaining = None
            resp = self._do_request('GET',
                                    '/',
                                    query_string={
                                        'prefix': prefix,
                                        'marker': marker,
                                        'max-keys': 1000
                                    })

            if not XML_CONTENT_RE.match(resp.headers['Content-Type']):
                raise RuntimeError('unexpected content type: %s' %
                                   resp.headers['Content-Type'])

            try:
                itree = iter(
                    ElementTree.iterparse(self.conn, events=("start", "end")))
                # The first event delivers the root element, which is
                # cleared after each yielded key.
                (event, root) = next(itree)

                root_xmlns_uri = self._tag_xmlns_uri(root)
                if root_xmlns_uri is None:
                    root_xmlns_prefix = ''
                else:
                    # Validate the XML namespace
                    root_xmlns_prefix = '{%s}' % (root_xmlns_uri, )
                    if root_xmlns_prefix != self.xml_ns_prefix:
                        log.error(
                            'Unexpected server reply to list operation:\n%s',
                            self._dump_response(resp, body=None))
                        raise RuntimeError(
                            'List response has %s as root tag, unknown namespace'
                            % root.tag)

                for (event, el) in itree:
                    if event != 'end':
                        continue

                    if el.tag == root_xmlns_prefix + 'IsTruncated':
                        keys_remaining = (el.text == 'true')

                    elif el.tag == root_xmlns_prefix + 'Contents':
                        # The last key seen becomes the next request's marker.
                        marker = el.findtext(root_xmlns_prefix + 'Key')
                        yield marker[len(self.prefix):]
                        root.clear()

            except Exception as exc:
                if is_temp_network_error(exc):
                    # We probably can't use the connection anymore
                    self.conn.disconnect()
                raise

            except GeneratorExit:
                # Need to read rest of response
                self.conn.discard()
                break

            if keys_remaining is None:
                raise RuntimeError('Could not parse body')
Esempio n. 13
0
import xml.etree.cElementTree as badET
import defusedxml.cElementTree as goodET

# Small, well-formed sample document used by both fromstring() calls below.
xmlString = "<note>\n<to>Tove</to>\n<from>Jani</from>\n<heading>Reminder</heading>\n<body>Don't forget me this weekend!</body>\n</note>"

# unsafe: the four entry points (fromstring, parse, iterparse, XMLParser)
# reached through badET, the alias for the stdlib xml.etree.cElementTree.
# NOTE(review): presumably this script is a fixture for a static security
# check rather than something meant to run to completion (the file name
# passed to parse() suggests the file is absent) -- confirm.
tree = badET.fromstring(xmlString)
print(tree)
badET.parse('filethatdoesntexist.xml')
badET.iterparse('filethatdoesntexist.xml')
a = badET.XMLParser()

# safe: the same four entry points reached through goodET, the alias for
# defusedxml.cElementTree.
tree = goodET.fromstring(xmlString)
print(tree)
goodET.parse('filethatdoesntexist.xml')
goodET.iterparse('filethatdoesntexist.xml')
a = goodET.XMLParser()
Esempio n. 14
0
    def _parse_OVF(self, ovf):
        """Parses the OVF file

        Parses the OVF file for specified metadata properties. Interested
        properties must be specified in ovf-metadata.json conf file.

        The text of every element whose namespace is a key of ``CIM_NS`` and
        whose local tag name is in ``self.interested_properties`` is stored
        under ``CIM_NS[ns] + '_' + tag``, so the OVF file's qualified
        namespaces are removed from the included properties.

        The disk filename is the ``href`` of the ``File`` reference whose
        ``id`` equals the ``fileRef`` of the single ``Disk`` in the
        ``DiskSection``.

        :param ovf: a file object containing the OVF file
        :returns: a tuple of disk filename and a properties dictionary; the
                  filename is None when no File reference matches the disk
        :raises RuntimeError: if the OVF holds more than one disk, or is
                              malformed (no References section, a
                              DiskSection without a Disk, or a Disk/File
                              element lacking a required attribute)
        """

        def _get_namespace_and_tag(tag):
            """Separate and return the namespace and tag elements.

            There is no native support for this operation in elementtree
            package. See http://bugs.python.org/issue18304 for details.
            """
            m = re.match(r'\{(.+)\}(.+)', tag)
            if m:
                return m.group(1), m.group(2)
            else:
                return '', tag

        def _get_attribute(element, name):
            """Return the attribute of element whose local name is name.

            Raises RuntimeError (instead of leaking StopIteration from a
            bare next()) when the attribute is absent.
            """
            for key, value in element.items():
                if _get_namespace_and_tag(key)[1] == name:
                    return value
            raise RuntimeError(
                _('Malformed OVF file: element %(tag)s has no '
                  '%(attr)s attribute.') %
                {'tag': _get_namespace_and_tag(element.tag)[1],
                 'attr': name})

        disk_filename, file_elements, file_ref = None, None, None
        properties = {}
        for event, elem in ET.iterparse(ovf):
            if event != 'end':
                continue

            ns, tag = _get_namespace_and_tag(elem.tag)
            if ns in CIM_NS and tag in self.interested_properties:
                properties[CIM_NS[ns] + '_' + tag] = (elem.text.strip()
                                                      if elem.text else '')

            if tag == 'DiskSection':
                disks = [child for child in elem
                         if _get_namespace_and_tag(child.tag)[1] ==
                         'Disk']
                if len(disks) > 1:
                    # Currently only single disk image extraction is
                    # supported.
                    # FIXME(dramakri): Support multiple images in OVA package
                    raise RuntimeError(_('Currently, OVA packages '
                                         'containing multiple disk are '
                                         'not supported.'))
                if not disks:
                    raise RuntimeError(_('Malformed OVF file: DiskSection '
                                         'contains no Disk element.'))
                file_ref = _get_attribute(disks[0], 'fileRef')

            if tag == 'References':
                file_elements = list(elem)

            # Clears elements to save memory except for 'File' and 'Disk'
            # references, which we will need to later access
            if tag != 'File' and tag != 'Disk':
                elem.clear()

        if file_elements is None:
            # Iterating None would otherwise surface as a TypeError.
            raise RuntimeError(_('Malformed OVF file: no References '
                                 'section found.'))

        # No early exit: if several File entries share the id, the last
        # one wins, as before.
        for file_element in file_elements:
            if _get_attribute(file_element, 'id') != file_ref:
                continue
            disk_filename = _get_attribute(file_element, 'href')

        return (disk_filename, properties)
Esempio n. 15
0
# This program searches an XML file for information.
# In this example, the contents of "Users.xml" are used to build an SQL statement.
# Hope this will be useful for you! - ernecto-ca

# Imports to read a xml and time
import defusedxml.cElementTree as goodET
from datetime import datetime

# Create the "tree" element of the xml
tree = goodET.parse('Users.xml')
goodET.iterparse('Users.xml')

# Get the information of the xml
root = tree.getroot()

# To check how long it took
now = datetime.now()
print(now)

# The main part of the sql
sql = "INSERT INTO users VALUES ("

#try: ***when it will insert in a data base****
for user in root:
    sql = sql + "'" + str(user.get('Id')).replace("'", "´") + "',"
    sql = sql + "'" + str(user.get('Reputation')).replace("'", "´") + "',"
    sql = sql + "'" + str(user.get('CreationDate')).replace("'", "´") + "',"
    sql = sql + "'" + str(user.get('DisplayName')).replace("'", "´") + "',"
    sql = sql + "'" + str(user.get('LastAccessDate')).replace("'", "´") + "',"
    sql = sql + "'" + str(user.get('WebsiteUrl')).replace("'", "´") + "',"
    sql = sql + "'" + str(user.get('Location')).replace("'", "´").replace(