Example #1
0
def xlsx(path):
    """Read the first worksheet of an .xlsx file.

    Returns a list of rows, where each row is a list of column values
    (strings); cells missing from a row are filled with "".
    """
    import zipfile
    from xml.etree.ElementTree import iterparse
    rows = []   # list of {column letter: value} dicts, one per non-empty row
    row = {}
    value = ""
    z = zipfile.ZipFile(path)
    # Shared-string table: string cells only store an index into this list.
    strings = [el.text for _, el in iterparse(z.open("xl/sharedStrings.xml"))
               if el.tag.endswith("}t")]
    for _, el in iterparse(z.open("xl/worksheets/sheet1.xml")):
        if el.tag.endswith("}v"):            # <v>84</v>
            value = el.text
        if el.tag.endswith("}c"):            # <c r="A3" t="s"><v>84</v></c>
            # Only t="s" means "shared string"; other t values (e.g. "str")
            # carry the literal value, so don't index the string table.
            if el.attrib.get("t") == "s":
                value = strings[int(value)]
            column = el.attrib["r"].rstrip("0123456789")  # "AZ22" -> "AZ"
            row[column], value = value, ""
        if el.tag.endswith("}row"):
            if any(row.values()):            # skip empty rows
                rows.append(row)
            row = {}
    # Fill empty cells. Order columns by (length, name) so "AA" sorts after
    # "Z" (plain string sort would put "AA" first). This also fixes the
    # original NameError on the undefined CELLS helper.
    columns = sorted({c for r in rows for c in r}, key=lambda c: (len(c), c))
    return [[r.get(c, "") for c in columns] for r in rows]
Example #2
0
def xlsx(fname):
    """Read the first worksheet of an .xlsx file.

    Returns a list of rows, one dict per row mapping column letters
    ("A", "AZ", ...) to cell values. Empty rows yield empty dicts.
    """
    import zipfile
    from xml.etree.ElementTree import iterparse

    zippy = zipfile.ZipFile(fname)
    try:
        words = [el.text for e, el in iterparse(zippy.open("xl/sharedStrings.xml")) if el.tag.endswith("}t")]
    except KeyError:
        # Workbooks with no string cells have no sharedStrings.xml;
        # ZipFile.open raises KeyError for the missing member. (Was a
        # bare except:, which hid every other failure too.)
        words = []
    rows = []
    row = {}
    val = ""
    for e, el in iterparse(zippy.open("xl/worksheets/sheet1.xml")):
        if el.tag.endswith("}v"):  # <v>84</v>
            val = el.text
        if el.tag.endswith("}c"):  # <c r="A3" t="s"><v>84</v></c>
            if el.attrib.get("t") == "s":
                # t="s": the value is an index into the shared-string table.
                val = words[int(val)]
            charac = el.attrib["r"]  # AZ22
            while charac[-1].isdigit():
                charac = charac[:-1]
            row[charac] = val
            val = ""
        if el.tag.endswith("}row"):
            rows.append(row)
            row = {}
    return rows
Example #3
0
def readXlsx(fileName, **args):
    """Parse one worksheet of an .xlsx file.

    Keyword args:
        sheet (int): 1-based worksheet number (default 1).
        header (bool): if True, key row values by the first row's names.

    Returns [header, rows]: header maps column letters to the first
    row's (stripped) values; rows is a list of dicts, one per
    following row, holding only non-empty cells.
    """
    # from: Hooshmand zandi http://stackoverflow.com/a/16544219
    import zipfile
    from xml.etree.ElementTree import iterparse

    sheet = args.get("sheet", 1)
    isHeader = args.get("header", False)

    rows, row, header = [], {}, {}
    z = zipfile.ZipFile(fileName)

    # Shared-string table: string cells store an index into this list.
    strings = [el.text for _, el in iterparse(z.open("xl/sharedStrings.xml"))
               if el.tag.endswith("}t")]
    value = ""

    for _, el in iterparse(z.open("xl/worksheets/sheet%d.xml" % sheet)):
        if el.tag.endswith("}v"):   # <v>84</v>
            value = el.text
        if el.tag.endswith("}c"):   # <c r="A3" t="s"><v>84</v></c>
            # If value is a shared string, use value as an index.
            if el.attrib.get("t") == "s":
                value = strings[int(value)]

            # Strip the digits from "AZ22" to keep just the column letters.
            letter = el.attrib["r"]
            while letter[-1].isdigit():
                letter = letter[:-1]

            if not rows:
                # First row: remember names that COULD serve as headers.
                header[letter] = value.strip()
            elif value != "":
                # Use the header name as the key when requested.
                if isHeader == True and letter in header:
                    row[header[letter]] = value
                else:
                    row[letter] = value

            value = ""
        if el.tag.endswith("}row"):
            rows.append(row)
            row = {}
    z.close()
    return [header, rows]
Example #4
0
def read_xlsx(file, **args):
    """Parse one worksheet of an .xlsx archive.

    Keyword args:
        sheet (int): 1-based worksheet number (default 1).
        header (bool): if True, key row values by the first row's names.

    Returns (header, rows): header maps column letters to the first
    row's stripped values; rows holds one dict per following row with
    only non-empty cells.
    """
    # from: Hooshmand zandi http://stackoverflow.com/a/16544219
    import zipfile
    from xml.etree.ElementTree import iterparse

    sheet_no = args.get('sheet', 1)
    use_header = args.get('header', False)

    archive = zipfile.ZipFile(file)
    # String cells hold an index into the shared-string table.
    shared = [node.text for _, node in iterparse(archive.open('xl/sharedStrings.xml'))
              if node.tag.endswith('}t')]

    header = {}
    rows = []
    current = {}
    val = ''
    for _, node in iterparse(archive.open('xl/worksheets/sheet%d.xml' % sheet_no)):
        tag = node.tag
        if tag.endswith('}v'):                   # <v>84</v>
            val = node.text
        if tag.endswith('}c'):                   # <c r="A3" t="s"><v>84</v></c>
            if node.attrib.get('t') == 's':
                val = shared[int(val)]
            col = node.attrib['r'].rstrip('0123456789')   # "AZ22" -> "AZ"
            if not rows:
                # First row: collect candidate header names.
                header[col] = val.strip()
            elif val != '':
                # Use the header's name as key when asked to.
                if use_header is True and col in header:
                    current[header[col]] = val
                else:
                    current[col] = val
            val = ''
        if tag.endswith('}row'):
            rows.append(current)
            current = {}
    archive.close()
    return header, rows
def readXlsx(fileName, **args):
    """Parse one worksheet of an .xlsx file and return its rows.

    Keyword args:
        sheet (int): 1-based worksheet number (default 1).
        header (bool): if True, key values by the first row's names.

    Returns a list of dicts, one per row after the first; the first
    row is consumed as the (potential) header row.
    """
    import zipfile
    from xml.etree.ElementTree import iterparse

    sheet = args.get("sheet", 1)
    isHeader = args.get("header", False)

    rows = []
    row = {}
    header = {}
    z = zipfile.ZipFile(fileName)
    # Get shared strings: string cells store an index into this list.
    strings = [el.text for _, el in iterparse(z.open('xl/sharedStrings.xml'))
               if el.tag.endswith('}t')]
    value = ''

    # Open specified worksheet
    for _, el in iterparse(z.open('xl/worksheets/sheet%d.xml' % sheet)):
        if el.tag.endswith('}v'):        # <v>84</v>
            value = el.text
        if el.tag.endswith('}c'):        # <c r="A3" t="s"><v>84</v></c>
            if el.attrib.get('t') == 's':
                value = strings[int(value)]
            letter = el.attrib['r'].rstrip('0123456789')  # "AZ22" -> "AZ"
            if not rows:
                # First row: remember names that could serve as headers.
                header[letter] = value
            elif value != '':
                if isHeader == True and letter in header:
                    row[header[letter]] = value
                else:
                    row[letter] = value
            value = ''
        if el.tag.endswith('}row'):
            rows.append(row)
            row = {}
    z.close()
    return rows
def main():
    """Load Stack Overflow Posts.xml / Users.xml dumps into bigdata.db.

    Streams both XML files with iterparse, clearing the root element
    after each handled record to keep memory flat.

    NOTE(review): relies on the module-level names `lite` (presumably
    sqlite3) and `iterparse` being imported elsewhere in this file.
    """
    limited_tags = ['jquery', 'javascript', 'python']
    con = lite.connect('bigdata.db')

    # Stream Posts.xml; keep a handle on the root so we can clear it.
    context = iter(iterparse('Posts.xml', events=("start", "end")))
    event, root = next(context)  # was context.next(): Python 2 only

    with con:
        cur = con.cursor()
        cur.execute("CREATE TABLE SO(Id INTEGER PRIMARY KEY ASC, Tags TEXT, CreationDate TEXT, UserID INTEGER)")
        for event, child in context:
            # Keep only titled, owned posts tagged with one of limited_tags.
            if (event == "end" and 'Title' in child.attrib
                    and 'OwnerUserId' in child.attrib
                    and any(tag in child.attrib['Tags'] for tag in limited_tags)):
                sqlQuery = "INSERT INTO SO VALUES(?,?,?,?)"
                cur.execute(sqlQuery, (child.attrib['Id'], child.attrib['Tags'],
                                       child.attrib['CreationDate'],
                                       child.attrib['OwnerUserId']))
                root.clear()  # drop already-processed elements

    # Stream Users.xml the same way.
    context = iter(iterparse('Users.xml', events=("start", "end")))
    event, root = next(context)

    with con:
        cur = con.cursor()
        cur.execute("CREATE TABLE USERS(UserID INTEGER PRIMARY KEY ASC, Location TEXT)")

        for event, child in context:
            if event == "end" and 'Location' in child.attrib:
                sqlQuery = "INSERT INTO USERS VALUES(?,?)"
                cur.execute(sqlQuery, (child.attrib['Id'], child.attrib['Location']))
                root.clear()
Example #7
0
def wait_for_new_job(sasl_token):
    """Block until Google Cloud Print pushes a new-job notification.

    Speaks raw XMPP to talk.google.com over TLS: authenticates with the
    given SASL token, binds a resource, subscribes to the cloudprint
    push channel, and returns the next pushed stanza.

    NOTE(review): ssl.wrap_socket was removed in Python 3.12 and
    SSLSocket.write expects bytes on Python 3 — this presumably ran on
    Python 2; confirm before reuse.
    """
    # https://developers.google.com/cloud-print/docs/rawxmpp
    import ssl, socket
    from xml.etree.ElementTree import iterparse, tostring

    xmpp = ssl.wrap_socket(socket.socket())
    xmpp.connect(("talk.google.com", 5223))
    parser = iterparse(xmpp, ("start", "end"))

    def msg(msg=" "):
        # Send one stanza, then return the next complete top-level reply
        # element. Depth is tracked via start/end events; the enclosing
        # <stream> element never closes, so it is skipped explicitly.
        xmpp.write(msg)
        stack = 0
        for event, el in parser:
            if event == "start" and el.tag.endswith("stream"):
                continue
            stack += 1 if event == "start" else -1
            if stack == 0:
                # Fail loudly on any failure/error stanza.
                assert (
                    not el.tag.endswith("failure") and not el.tag.endswith("error") and not el.get("type") == "error"
                ), tostring(el)
                return el

    msg('<stream to="gmail.com" version="1.0" xmlns="http://etherx.jabber.org/streams">')
    msg('<auth xmlns="urn:ietf:params:xml:ns:xmpp-sasl" mechanism="X-GOOGLE-TOKEN">%s</auth>' % sasl_token)
    msg('<s:stream to="gmail.com" version="1.0" xmlns:s="http://etherx.jabber.org/streams" xmlns="jabber:client">')
    iq = msg('<iq type="set"><bind xmlns="urn:ietf:params:xml:ns:xmpp-bind"><resource>Armooo</resource></bind></iq>')
    # iq[0][0] is the <jid> element; keep only the bare JID before '/'.
    bare_jid = iq[0][0].text.split("/")[0]
    msg(
        '<iq type="set" to="%s"><subscribe xmlns="google:push"><item channel="cloudprint.google.com" from="cloudprint.google.com"/></subscribe></iq>'
        % bare_jid
    )
    return msg()
Example #8
0
def upgrade_nrml(directory, dry_run):
    """
    Upgrade all the NRML files contained in the given directory to the latest
    NRML version. Works by walking all subdirectories.
    WARNING: there is no downgrade!

    :param directory: root directory to walk
    :param dry_run: if true, only report what would be upgraded
    """
    for cwd, dirs, files in os.walk(directory):
        for f in files:
            path = os.path.join(cwd, f)
            if not f.endswith('.xml'):
                continue
            # Open the file ourselves (closed by `with`) instead of
            # poking at the iterator's private _file attribute.
            with open(path, 'rb') as fh:
                try:
                    fulltag = next(iterparse(fh))[1].tag  # was ip.next(): py2 only
                    xmlns, tag = fulltag.split('}')
                except Exception:  # not a NRML file
                    # The original left xmlns/tag undefined here, which
                    # raised NameError on the check below.
                    xmlns, tag = '', ''
            if xmlns[1:] == NRML05:  # already upgraded
                pass
            elif 'nrml/0.4' in xmlns and 'vulnerability' in f:
                if not dry_run:
                    print('Upgrading', path)
                    try:
                        upgrade_file(path)
                    except Exception as exc:
                        print(exc)
                else:
                    print('Not upgrading', path)
def parse_and_remove(filename, path):
    """Yield each element whose tag path from the root equals *path*
    ('/'-separated), detaching it from its parent afterwards so the
    tree never grows while streaming. Debug state is printed as we go.
    """
    wanted = path.split('/')
    events = iterparse(filename, ('start', 'end'))
    # Skip the root element's start event.
    next(events)

    tags = []
    nodes = []
    for kind, node in events:
        if kind == 'start':
            tags.append(node.tag)
            nodes.append(node)
            print("start.\n")
            print("tag_stack:", tags, "\n")
            print("elem_stack", nodes, "\n")
        elif kind == 'end':
            if tags == wanted:
                print("end.\n")
                print("elem:", node)
                yield node
                print("elem_stack[-2]", nodes[-2])
                # Detach from the parent so memory stays bounded.
                nodes[-2].remove(node)
            try:
                tags.pop()
                nodes.pop()
            except IndexError:
                # The root's end event pops an already-empty stack.
                pass
Example #10
0
 def loadScheme(self):
     """Load schemas/<scheme>/schema.xml for this feed.

     Walks the schema document with iterparse, recording every leaf
     element as an entry (xpath, tag, desc) in self.schema; the root
     element becomes self.schema_root and a depth-2 element becomes
     self.schema_container.

     NOTE(review): `struct` is a project-defined name, presumably a
     plain attribute container — confirm. The unused local `map_file`
     (read from self.feed.map_rules) was dropped.
     """
     que = []
     scheme = self.feed.output_scheme

     if scheme is None:  # was `scheme == None`
         return
     filepath = os.path.join("schemas", scheme, "schema.xml")
     for (event, node) in iterparse(filepath, ['start', 'end']):
         if event == 'end':
             que.pop()
         if event == 'start':
             que.append(node.tag)
             if not list(node):
                 # Leaf element: record its path and description text.
                 o = struct()
                 o.xpath = "/".join(que[1:])
                 o.tag = node.tag
                 o.desc = node.text
                 self.schema.append(o)
             elif len(que) == 1:
                 # Document root.
                 o = struct()
                 o.xpath = "/".join(que)
                 o.tag = node.tag
                 self.schema_root = o
             elif len(que) == 2:
                 # Direct child of the root acting as the container.
                 o = struct()
                 o.xpath = "/".join(que)
                 o.tag = node.tag
                 self.schema_container = o
Example #11
0
def xml2sqlite(bron):
    """Load the XML document *bron* into SQLite tables `elems` and `attrib`.

    Every element gets id = depth * 10000000 + a per-depth running
    counter (depth is 1-based); `id_parent` links to the enclosing
    element (0 for the root). Attributes go to `attrib`, keyed to
    their element's id. Uses the module-level connection `conn`.

    Replaces the original Python-2 exec-built `num_<n>` locals with a
    counters dict, and the %-interpolated SQL (injection-prone) with
    parameterized queries.
    """
    cur = conn.cursor()
    cur.execute("drop table if exists elems")
    cur.execute("drop table if exists attrib")
    cur.execute("Create table elems (id integer,id_parent integer,tag varchar(100),text varchar(100), attrib text)")
    cur.execute("Create table attrib (id integer,id_elem integer,name varchar(100),value varchar(100))")
    niveau = 0
    attrib_id = 0
    counters = {0: 0}  # per-depth element counters (never reset, as before)
    for (event, node) in iterparse(bron, ['start', 'end', 'start-ns', 'end-ns']):
        if event == 'end':
            niveau = niveau - 1
        if event == 'start':
            # Parent id: computed from the *current* depth before descending.
            parent_id = niveau * 10000000 + counters.get(niveau, 0)
            niveau = niveau + 1
            counters[niveau] = counters.get(niveau, 0) + 1
            elem_id = niveau * 10000000 + counters[niveau]
            tag = node.tag
            # At a 'start' event the element's text may not be parsed yet.
            text = node.text.rstrip() if node.text else ''
            if node.keys():
                attrs = {}
                for name in node.keys():
                    attrib_id = attrib_id + 1
                    value = node.attrib.get(name)
                    cur.execute("insert into attrib values (?,?,?,?)",
                                (attrib_id, elem_id, name, value))
                    attrs[name] = value
                attrib2 = str(attrs)
            else:
                attrib2 = ''
            cur.execute("insert into elems values (?,?,?,?,?)",
                        (elem_id, parent_id, tag, text, attrib2))
def parse_and_remove(filename, out):
    """Group the <message> elements of *filename* by their CATEGORY: field.

    A message qualifies when its text mentions QID, TITLE, BODY and
    CATEGORY; the category is whatever follows the last 'CATEGORY:'.
    Prints the per-category counts and writes the grouped questions to
    *out* as ***category*** sections, sorted by category name.
    """
    doc = iterparse(filename, ('start', 'end'))
    categories = {}
    questions = {}

    for event, elem in doc:
        if event != 'end' or elem.tag != 'message':
            continue
        # Guard: an empty <message/> has text None, which used to raise
        # TypeError on the `in` checks below.
        text = elem.text or ''
        if 'QID' in text and 'TITLE' in text and 'BODY' in text and 'CATEGORY' in text:
            start_ind = text.rfind('CATEGORY:')
            if start_ind != -1:
                cat = text[start_ind + len('CATEGORY:'):].strip()
                if not categories.get(cat):
                    categories[cat] = 1
                    questions[cat] = [text]
                else:
                    categories[cat] += 1
                    questions[cat].append(text)

    print(categories)
    with open(out, 'w') as outfile:
        for cat, texts in sorted(questions.items(), key=lambda x: x[0]):
            outfile.write('***%s***\n' % cat)
            for q in texts:
                outfile.write('%s\n' % q)
Example #13
0
def is_attrib_unique(filename, attrib):
    """Return True when no two <way>/<node> elements in *filename* share
    the same value for attribute *attrib*.

    Also prints the total, unique and duplicate value counts.
    (Python-2 print statements converted to print() calls.)
    """
    id_count = 0
    ids = defaultdict(int)
    # First fill a dictionary with a count per attribute value.
    for (_, node) in iterparse(filename, ['start', ]):
        if node.tag in ('way', 'node') and attrib in node.attrib:
            id_count += 1
            ids[node.attrib[attrib]] += 1
        node.clear()  # keep memory flat while streaming
    # Then any value seen more than once is a duplicate.
    dupe_id = sum(1 for v in ids.values() if v > 1)
    unique_id = len(ids) - dupe_id
    print('Uid found: ' + str(id_count))
    print('Unique uids: ' + str(unique_id))
    print('Duplicate uids: ' + str(dupe_id))
    return dupe_id == 0
Example #14
0
def wait_for_new_job(sasl_token):
    """Block until Google Cloud Print pushes a new-job notification.

    Opens a raw XMPP session to talk.google.com over TLS, authenticates
    with the given SASL token, binds a resource, opens a session,
    subscribes to the cloudprint push channel and returns the next
    pushed stanza.

    NOTE(review): ssl.wrap_socket was removed in Python 3.12 and
    SSLSocket.write expects bytes on Python 3 — presumably Python 2
    code; confirm before reuse.
    """
    # https://developers.google.com/cloud-print/docs/rawxmpp
    import ssl, socket
    from xml.etree.ElementTree import iterparse, tostring
    xmpp = ssl.wrap_socket(socket.socket())
    xmpp.connect(('talk.google.com', 5223))
    parser = iterparse(xmpp, ('start', 'end'))
    def msg(msg=' '):
        # Send one stanza and return the next complete top-level reply
        # element; depth is tracked via start/end events because the
        # enclosing <stream> element never closes.
        xmpp.write(msg)
        stack = 0
        for event, el in parser:
            if event == 'start' and el.tag.endswith('stream'):
                continue
            stack += 1 if event == 'start' else -1
            if stack == 0:
                # Fail loudly on any failure/error stanza.
                assert not el.tag.endswith('failure') and not el.tag.endswith('error') and not el.get('type') == 'error', tostring(el)
                return el
    msg('<stream:stream to="gmail.com" xml:lang="en" version="1.0" xmlns:stream="http://etherx.jabber.org/streams" xmlns="jabber:client">')
    msg('<auth xmlns="urn:ietf:params:xml:ns:xmpp-sasl" mechanism="X-GOOGLE-TOKEN" auth:allow-generated-jid="true" auth:client-uses-full-bind-result="true" xmlns:auth="http://www.google.com/talk/protocol/auth">%s</auth>' % sasl_token)
    msg('<stream:stream to="gmail.com" xml:lang="en" version="1.0" xmlns:stream="http://etherx.jabber.org/streams" xmlns="jabber:client">')
    iq = msg('<iq type="set" id="0"><bind xmlns="urn:ietf:params:xml:ns:xmpp-bind"><resource>Armooo</resource></bind></iq>')
    # iq[0][0] is the <jid> element; keep only the bare JID before '/'.
    bare_jid = iq[0][0].text.split('/')[0]
    msg('<iq type="set" id="2"><session xmlns="urn:ietf:params:xml:ns:xmpp-session"/></iq>')
    msg('<iq type="set" id="3" to="%s"><subscribe xmlns="google:push"><item channel="cloudprint.google.com" from="cloudprint.google.com"/></subscribe></iq>' % bare_jid)
    return msg()
Example #15
0
    def __getitem__(self, identifier):
        """
        Access the item with id 'identifier' in the file by iterating the xml-tree.

        Arguments:
            identifier (str): native id of the item to access

        Returns:
            data (str): text associated with the given identifier

        NOTE(review): spectra are matched by int(parsed id) == identifier,
        so an int identifier is presumably expected for spectra while
        chromatograms compare the id string verbatim — confirm.
        """
        # Remember the position so the scan does not disturb other
        # sequential reads of the same handle; restore it before returning.
        old_pos = self.file_handler.tell()
        self.file_handler.seek(0, 0)  # rescan the file from the top
        mzml_iter = iter(iterparse(self.file_handler, events=['end']))
        # NOTE(review): StopIteration propagates from next() when the id
        # is absent — presumably callers treat that as "not found".
        while True:
            event, element = next(mzml_iter)
            if event == 'end':
                if element.tag.endswith('}spectrum'):
                    # SPECTRUM_ID_PATTERN extracts the numeric scan id
                    # from the spectrum's 'id' attribute.
                    if int(regex_patterns.SPECTRUM_ID_PATTERN.search(
                            element.get('id')).group(1)) == identifier:
                        self.file_handler.seek(old_pos, 0)
                        return spec.Spectrum(element, measured_precision=5e-6)
                elif element.tag.endswith('}chromatogram'):
                    if element.get('id') == identifier:
                        self.file_handler.seek(old_pos, 0)
                        return spec.Chromatogram(
                            element,
                            measured_precision=5e-6
                        )
Example #16
0
 def testCHelper(self):
     """Exercise the metadata.HELPER callables: tag extraction and
     splitting, attribute splitting, whitespace-aware text extraction
     and tag suppression (child text describing its parent).
     """
     log.setLevel(llC)
     _log.info("------- Helper test --------")
     a = "{kuku}lala"
     helper = metadata.HELPER[metadata.GET_TAG]
     self.assertEqual(helper(a), "lala")
     helper = metadata.HELPER[metadata.SPLIT_TAG]
     self.assertEqual(helper(a), ["kuku", "lala"])
     e = Element('test', {'{a}b':'c','{d12}d':'{e}[]','{d?"?E#}f':'{}g'})
     helper = metadata.HELPER[metadata.SPLIT_ATTRIBS]
     res = helper(e)
     for k, v in e.attrib.items():
         # Keys keep only the part after the namespace brace.
         t = k.split("}")[1]
         self.assertEqual(v, res[t])
     e.text = "    test\nlala    "
     helper = metadata.HELPER[metadata.GET_TEXT]
     self.assertEqual(helper(e), "    test\nlala    ")
     e.text = "    \n    "
     self.assertEqual(helper(e), "")
     e.text = ""
     c = SubElement(e, metadata.TAG_NAME)
     c.text = "This text describes the parent"
     helper = metadata.HELPER[metadata.TAG_SUPPRESS]
     self.assertEqual(helper(c, e), ('test', 'This text describes the parent'))
     inputData = StringIO("<root><Welookfor><Name>data</Name></Welookfor></root>")
     events = ("start", "end")
     iterator = iterparse(inputData, events=events)
     for event, elem in iterator:
         if event == "end":
             if elem.tag == "Name":
                 # next(iterator) (was iterator.next(): Python 2 only)
                 # yields the parent's end event right after the child's.
                 self.assertEqual(helper(elem, next(iterator)[1]), ('Welookfor', 'data'))
Example #17
0
    def parseXML(self, file_xml, folder_conteudo, export_version):
        """Parse XML.
        https://github.com/zikzakmedia/python-mediawiki

        Streams a MediaWiki export file, collects every <page> that has
        a title, contributor username and text, converts the wiki markup
        to HTML and hands the result to createDocument.

        NOTE(review): uses the `unicode` builtin, so this is Python-2
        only (Plone) code.
        """
        context = aq_inner(self.context)
        utils = getToolByName(context, 'plone_utils')

        # Export tags are namespaced by the schema version, e.g.
        # {http://www.mediawiki.org/xml/export-0.10/}page.
        NS = '{http://www.mediawiki.org/xml/export-' + export_version + '/}'

        conteudo = []

        with open(file_xml.name) as f:
            for event, elem in iterparse(f):
                if elem.tag == '{0}page'.format(NS):
                    title = elem.find("{0}title".format(NS))
                    contr = elem.find(".//{0}username".format(NS))
                    text = elem.find(".//{0}text".format(NS))
                    # Skip incomplete pages (missing title/contributor/text).
                    if (title is not None) and (contr is not None) and (text is not None):
                        text = unicode(text.text).encode('utf-8')
                        text = wiki2html(text, True)
                        conteudo.append(dict(title=title.text, contr=contr.text, text=text))
                    elem.clear()  # free the processed subtree

        self.createDocument(conteudo, folder_conteudo)

        msg = 'Procedimento executado.'
        utils.addPortalMessage(msg, type='info')
Example #18
0
def show_all_event(filename='podcasts.opml'):
    """Event-based parsing demo: print every iterparse event with its
    nesting depth, tag and element id.

    :param filename: XML/OPML file to parse; the default keeps the old
        hard-coded behavior.
    """
    from xml.etree.ElementTree import iterparse

    depth = 0
    prefix_width = 8
    prefix_dots = '.' * prefix_width
    line_template = '{prefix:<0.{prefix_len}}{event:<8}{suffix:<{suffix_len}} {node.tag:<12} {node_id}'

    for (event, node) in iterparse(filename, ['start', 'end', 'start-ns', 'end-ns']):
        if event == 'end':
            depth -= 1

        prefix_len = depth * 2

        # print statement -> print() call so this runs on Python 3.
        print(line_template.format(prefix=prefix_dots,
                                   prefix_len=prefix_len,
                                   suffix='',
                                   suffix_len=(prefix_len - prefix_len),
                                   node=node,
                                   node_id=id(node),
                                   event=event))

        if event == 'start':
            depth += 1
Example #19
0
def upgrade_nrml(directory, dry_run):
    """
    Upgrade all the NRML files contained in the given directory to the latest
    NRML version. Works by walking all subdirectories.
    WARNING: there is no downgrade!

    :param directory: root directory to walk
    :param dry_run: if true, only report what would be upgraded
    """
    for cwd, dirs, files in os.walk(directory):
        for f in files:
            path = os.path.join(cwd, f)
            if not f.endswith('.xml'):
                continue
            # Open the file ourselves (closed by `with`) rather than
            # reaching into the iterator's private _file attribute.
            with open(path, 'rb') as fh:
                ip = iterparse(fh, events=('start',))
                try:
                    next(ip)  # read node zero
                    fulltag = next(ip)[1].tag  # tag of the first node
                    xmlns, tag = fulltag.split('}')
                except Exception:  # not a NRML file
                    xmlns, tag = '', ''
            if xmlns[1:] == NRML05:  # already upgraded
                pass
            elif 'nrml/0.4' in xmlns and (
                    'vulnerability' in tag or 'fragility' in tag or
                    'sourceModel' in tag):
                if not dry_run:
                    print('Upgrading', path)
                    # The original re-raised past an unreachable
                    # print(exc); let failures propagate directly.
                    upgrade_file(path)
                else:
                    print('Not upgrading', path)
Example #20
0
def read_xml_file(file_name, base_trace_num=0):
    ''' Read the xml file and return the root element and a dictionary tuple
        The Dictionary from element to tuple of line_num and trace id

        The trace id is a dotted path like "2.1.3": one counter per
        nesting level, restarting whenever a level is re-entered;
        base_trace_num seeds the top-level counter.

        NOTE(review): the opened file is never closed — leaks one handle
        per call. FileWithLineNum is a project wrapper, presumably
        exposing .line_num for the parser's current input line; confirm.
    '''
    if file_name is None or file_name == '':
        return (None,{})
    file = FileWithLineNum(open(file_name))
    out_dict = {}
    root_element = None
    trace_nums = [0]*102    # Can't conceive of having > 100 levels
    trace_nums[0] = base_trace_num
    trace_idx = -1

    for event, element in iterparse(file, events=["start", "end"]):
        # The first element seen is the document root.
        if root_element is None:
            root_element = element
        if event == "start":
            # Entering a node: bump this depth's counter and record the
            # dotted trace id built from counters down to this depth.
            trace_idx += 1
            trace_nums[trace_idx] += 1
            trace_id = '.'.join([ str(x) for x in trace_nums[:trace_idx+1]])
            out_dict[element] = (file.line_num, trace_id)
            #print out_dict[element]
        else:
            trace_nums[trace_idx+1] = 0 # Restart one level up
            trace_idx -= 1
    return (root_element, out_dict)
Example #21
0
    def parse_and_remove(self, filename, path):
        """Stream *filename*, yielding each element whose tag path from
        the root matches *path* ('/'-separated) and detaching it from
        its parent afterwards. Debug output is printed along the way.
        """
        print('********')
        from xml.etree.ElementTree import iterparse

        wanted = path.split('/')
        doc = iterparse(filename, ('start', 'end')) # Skip the root element
        print(wanted)
        next(doc)
        tags = []
        nodes = []
        for kind, node in doc:
            print(kind)
            print(node)
            if kind == 'start':
                tags.append(node.tag)
                nodes.append(node)
            elif kind == 'end':
                if tags == wanted:
                    yield node
                    # Prune the yielded element from its parent.
                    nodes[-2].remove(node)
                try:
                    tags.pop()
                    nodes.pop()
                except IndexError as err:
                    # The root's end event pops an already-empty stack.
                    print(err)
def importXML(path):
    """Parse a MediaWiki XML export.

    Returns (base, pages): base is the site's <base> URL (as a string)
    and pages is a list of (title, token_counts, raw_text) tuples,
    where token_counts maps lower-cased words of the page text to their
    frequency.

    NOTE(review): RegexpTokenizer comes from nltk, imported elsewhere
    in this file.
    """
    # Pull the default namespace out of the first line's xmlns="..." so
    # tags can be matched fully qualified.
    header = open(path).readline()
    start = header.find('xmlns=') + 7
    NS = "{%s}" % header[start: header.find('"', start)]
    allInfo = []  # concise info for every page
    myBase = ''   # the wiki's base URL
    with open(path) as f:
        for event, elem in iterparse(f):
            if elem.tag == '{0}base'.format(NS):
                myBase = str(elem.text)

            if elem.tag == '{0}page'.format(NS):
                title = elem.find("{0}title".format(NS))
                contr = elem.find(".//{0}username".format(NS))
                content = elem.find(".//{0}text".format(NS))

                # Word frequencies of the page body (lower-cased).
                token_dic = {}
                if content is not None and content.text:
                    tokenizer = RegexpTokenizer(r'\w+')  # drops punctuation
                    for eachword in tokenizer.tokenize(content.text):
                        word = eachword.lower()
                        token_dic[word] = token_dic.get(word, 0) + 1

                # Guard against pages missing <title>/<text>; the original
                # raised AttributeError on .text of None here.
                allInfo.append((title.text if title is not None else None,
                                token_dic,
                                content.text if content is not None else None))
                elem.clear()

    return myBase, allInfo
Example #23
0
def read_corpus(corpus_file_path, sections=['text']):
    """Yield (values, label) pairs for every <item> in the corpus file.

    values holds the text of each requested section; items where any
    section is missing or empty are skipped. label is derived from
    <rating> (1 when >= 3, else 0), falling back to <polarity>
    ('N' -> 0, anything else -> 1), and -1 when neither exists.
    """
    for _, node in iterparse(corpus_file_path):
        if node.tag != 'item':
            continue
        values = [node.find(name).text for name in sections]
        if not all(values):
            continue

        rating_el = node.find('rating')
        if rating_el is not None:
            # Numeric rating: anything below 3 counts as negative.
            label = 0 if float(rating_el.text.strip()) < 3 else 1
        else:
            polarity_el = node.find('polarity')
            if polarity_el is None:
                label = -1
            elif polarity_el.text.strip() == 'N':
                label = 0
            else:
                label = 1

        yield values, label
Example #24
0
def unpack(xml):
    """Recreate on disk the file/folder tree described by *xml*.

    FILE entries are written as `size // contentLength` repetitions of
    `content` plus `size % contentLength` "X" padding bytes; FOLDER
    entries become directories that are entered on 'start' and left on
    'end'; the ROOT entry is wiped and recreated.

    Relies on module-level FOLDER/FILE/ROOT/NAME/SIZE/content/
    contentLength definitions. Returns 0.
    """
    for (event, elem) in iterparse(xml, ['start', 'end', 'start-ns', 'end-ns']):
        if event == 'end':
            if elem.tag == FOLDER:
                os.chdir(os.pardir)
        if event == 'start':
            print("working for ...", elem.attrib[NAME])  # was a py2 print statement
            if elem.tag == FILE:
                size = int(elem.attrib[SIZE])
                # // keeps integer semantics on Python 3 (py2 used /,
                # which would make range() fail with a float here).
                block = size // contentLength
                remdr = size % contentLength
                with open(elem.attrib[NAME], 'a') as out:
                    out.write(content * block)
                    out.write("X" * remdr)
            if elem.tag == FOLDER:
                os.mkdir(elem.attrib[NAME])
                os.chdir(elem.attrib[NAME])
            if elem.tag == ROOT:
                shutil.rmtree(elem.attrib[NAME], ignore_errors=True)
                os.mkdir(elem.attrib[NAME])
                os.chdir(elem.attrib[NAME])
    return 0
Example #25
0
    def scan_eix_xml(self, query, category=None):
        """Run `eix --xml` and yield one dict per package.

        Each yielded dict has keys 'package', 'category', optionally
        'description'/'homepage', and 'versions': a list of
        (cpv, slot, overlay, overlay_path) tuples.

        :param query: exact package name to look up (falsy for all)
        :param category: restrict the search to one category
        """
        cmd = ['eix', '--xml']
        env = os.environ
        env['XML_OVERLAY'] = 'true'
        if query:
            cmd.extend(['--exact', query])
        if category:
            cmd.extend(['-C', category])

        sub = subprocess.Popen(cmd, env=env, stdout=subprocess.PIPE)
        output = sub.stdout

        try:
            parser = iterparse(output, ["start", "end"])
            # was parser.next() (Python 2 only); read the root tag just
            # for testing the output.
            next(parser)
        except ParseError:
            if query:
                msg = "Unknown package '%s'" % query
            else:
                msg = "No packages."
            self.logger.error(self.style.ERROR(msg))
            return

        package = {'versions': []}
        category = ""

        for event, elem in parser:
            if event == "start":  # on tag opening
                if elem.tag == "category":
                    category = elem.attrib["name"]
                elif elem.tag == "package":
                    package["package"] = elem.attrib["name"]
                    package["category"] = category
                elif elem.tag in ["description", "homepage"]:
                    package[elem.tag] = elem.text or ""
                elif elem.tag == "version":
                    # append version data to versions
                    cpv = "%s/%s-%s" % (
                        package["category"],
                        package["package"],
                        elem.attrib["id"]
                    )
                    slot = elem.attrib.get("slot", "0")
                    overlay = elem.attrib.get("repository", "gentoo")
                    overlay_path = elem.attrib.get("overlay", None)
                    package["versions"].append(
                        (cpv, slot, overlay, overlay_path)
                    )

            elif event == "end":  # on tag closing
                if elem.tag == "package":
                    # package finished: hand it to the caller, reset state
                    yield package
                    package = {"versions": []}

                if elem.tag == "category":
                    # clean old data
                    category = ""
            elem.clear()
Example #26
0
def get_concept_treatment_info(cid):
    """Return (disease_name, disease_cid, flag) tuples for diseases the
    concept *cid* treats.

    flag is 1 for a "treats" role, 0.5 for "may_treat"; roles whose concept
    is not of DISEASE_KIND are dropped (flag forced to 0).

    Fixes over the original:
      * lines indented with tabs (TabError under Python 3) re-indented with spaces;
      * ``flag``/``disease``/``disease_cid`` reset per <role> so stale or
        unbound values can never leak into the appended tuple;
      * unreachable ElementTree variant after the ``return`` removed.
    """
    response = get_data("get", cid)
    diseases = []
    # Beautiful Soup version
    soup = BeautifulSoup(response, "xml")
    for tag in soup.find_all('role'):
        # Reset per-role state; previously these could be unbound (NameError)
        # or carry over from the prior <role>.
        flag = 0
        disease = None
        disease_cid = None
        for tag_inner in tag.children:
            if tag_inner.name == "roleName":
                word = tag_inner.string or ""  # guard empty <roleName/>
                if word.startswith("may_treat"):
                    flag = 0.5
                elif word.startswith("treats"):
                    flag = 1
                else:
                    flag = 0
            elif tag_inner.name == "concept":
                for child in tag_inner.children:
                    if child.name == "conceptName":
                        disease = child.string
                    if child.name == "conceptKind":
                        # Only disease concepts qualify.
                        if child.string.strip() != "DISEASE_KIND":
                            flag = 0
                    if child.name == "conceptNui":
                        disease_cid = child.string
                if flag > 0:
                    diseases.append((disease, disease_cid, flag))
    return diseases
Example #27
0
 def get_items(self):
     """Yield one parsed dict per <item> element in ``self.input_file``."""
     self.input_file.seek(0)
     for _event, element in iterparse(self.input_file):
         if element.tag != 'item':
             continue
         item = parse_post(element)
         item['comments'] = get_comments(element)
         item['categories'] = get_categories(element)
         yield item
         element.clear()  # free the element once the consumer has seen it
Example #28
0
 def sax_parse(self, filename):
     """Stream-parse an SVG file, flattening shape elements into self.tree.

     Each entry appended to ``self.tree`` is a dict of inherited attributes
     plus a computed ``"d"`` path string, the cumulative ``"matrix"``
     transform (or None), and the element ``"name"``.  Attribute values and
     transforms inherit down the element hierarchy via a stack of
     (values, matrix) snapshots pushed on 'start' and popped on 'end'.
     The <svg> root's attributes are stored in ``self.root_values``.
     """
     self.root_values = {}
     self.tree = []
     stack = []
     values = {}
     matrix = None
     for event, elem in iterparse(filename, events=('start', 'end')):
         if event == 'start':
             # Snapshot the parent's state so 'end' can restore it.
             stack.append((values, matrix))
             if matrix is not None:
                 matrix = matrix.copy()  # copy of matrix
             current_values = values
             values = {}
             values.update(current_values)  # copy of dictionary
             attrs = elem.attrib
             values.update(attrs)
             # Strip a fixed-length namespace prefix — presumably
             # "{http://www.w3.org/2000/svg}" (28 chars); TODO confirm.
             name = elem.tag[28:]
             if "style" in attrs:
                 # Inline style wins: split "k:v;k:v" pairs into values.
                 for equate in attrs["style"].split(";"):
                     equal_item = equate.split(":")
                     values[equal_item[0]] = equal_item[1]
             if "transform" in attrs:
                 # Compose this element's transform onto the inherited one.
                 transform_matrix = parse_transform(attrs["transform"])
                 if matrix is None:
                     matrix = np.identity(3)
                 matrix = transform_matrix.dot(matrix)
             if "svg" == name:
                 current_values = values
                 values = {}
                 values.update(current_values)
                 self.root_values = current_values
                 continue
             elif "g" == name:
                 # Groups only contribute inherited attrs/transforms.
                 continue
             elif 'path' == name:
                 values['d'] = path2pathd(values)
             elif 'circle' == name:
                 values["d"] = ellipse2pathd(values)
             elif 'ellipse' == name:
                 values["d"] = ellipse2pathd(values)
             elif 'line' == name:
                 values["d"] = line2pathd(values)
             elif 'polyline' == name:
                 values["d"] = polyline2pathd(values['points'])
             elif 'polygon' == name:
                 values["d"] = polygon2pathd(values['points'])
             elif 'rect' == name:
                 values["d"] = rect2pathd(values)
             else:
                 # Unknown element: record nothing.
                 continue
             values["matrix"] = matrix
             values["name"] = name
             self.tree.append(values)
         else:
             # 'end': restore the parent's (values, matrix) snapshot.
             v = stack.pop()
             values = v[0]
             matrix = v[1]
def parse():
    """Stream-parse dblp.xml, writing co-author rows to CoAuthor.csv and
    populating the module-level collections (papers, publishat, authorof,
    dict_platform, dict_researcher).

    Fixes over the original: the mixed tab/space indentation around the
    author loop (a TabError under Python 3) is normalized, and the redundant
    ``f.close()`` after the ``with`` block is removed.
    """
    paper_id = 0

    with open("CoAuthor.csv", "w") as f:
        w = csv.writer(f)
        w.writerow(["names"])
        for event, elem in iterparse('dblp.xml', events=['start'], parser=parser):
            if elem.tag in paper_tags():
                title = ""
                year = -1
                platform = ""

                # First valid <title> wins.
                for t in elem.findall('title'):
                    if valid_title(t.text):
                        title = strip_comma(t.text)
                        break

                # First valid <year> wins.
                for y in elem.findall('year'):
                    if valid_text(y.text):
                        year = y.text
                        break

                # Prefer <journal>, falling back to <booktitle>.
                for p in elem.findall('journal') or elem.findall('booktitle'):
                    if valid_platform(p.text):
                        platform = strip_comma(p.text)

                        if platform not in dict_platform:
                            dict_platform.add(platform)

                        if valid_title(title):
                            publishat.append(PublishAt(paper_id, platform, "PublishAt"))

                        break

                authors = []
                for a in elem.findall('author'):
                    if valid_name(a.text):
                        author = a.text
                        if author not in dict_researcher:
                            dict_researcher.add(author)
                        authors.append(author)

                        if valid_title(title):
                            authorof.append(AuthorOf(author, paper_id, "AuthorOf"))

                w.writerow(authors)

                if valid_title(title):
                    papers.append(Paper(paper_id, title, year, 0.0, "Paper"))
                    paper_id += 1

            elem.clear()  # keep memory bounded while streaming the dump
Example #30
0
 def parse(self, xml_stream):
     """Iterate *xml_stream* with iterparse, delegating 'start' events to
     startElement and yielding instances produced from every other event.

     A truncated gzip stream surfaces as struct.error, which is logged and
     swallowed so callers see a clean end of iteration.
     """
     try:
         for event, element in iterparse(xml_stream, self.events):
             if event != 'start':
                 for instance in self.yieldInstances(element):
                     yield instance
             else:
                 self.startElement(element)
     except struct.error:
         logger.exception('compressed gzip file is corrupt')
Example #31
0
def readXlsx(file, **args):
    """Read one worksheet of an .xlsx workbook.

    Args:
        file: path or binary file-like object of the .xlsx archive.
        sheet (keyword, int): 1-based worksheet number to read (default 1).
        header (keyword, bool): when True, cells in data rows are keyed by
            the first row's stripped values instead of column letters.

    Returns:
        [header, rows] — ``header`` maps column letter -> first-row value;
        ``rows`` is a list of dicts, one per row (empty cells omitted).
        The first (header) row itself appears in ``rows`` as an empty dict.

    Fixes over the original: empty ``<v/>`` cells no longer crash
    (``el.text`` can be None), the zip is closed even on error via a
    context manager, and the keyword handling uses ``dict.get``.
    """
    # from: Hooshmand zandi http://stackoverflow.com/a/16544219
    import zipfile
    from xml.etree.ElementTree import iterparse

    sheet = args.get("sheet", 1)
    isHeader = args.get("header", False)

    rows = []
    row = {}
    header = {}

    with zipfile.ZipFile(file) as z:
        # Shared-string table: string-typed cells store an index into this list.
        strings = [
            el.text for e, el in iterparse(z.open('xl/sharedStrings.xml'))
            if el.tag.endswith('}t')
        ]
        value = ''

        # Open specified worksheet
        for e, el in iterparse(z.open('xl/worksheets/sheet%d.xml' % (sheet))):
            # get value or index to shared strings
            if el.tag.endswith('}v'):  # <v>84</v>
                value = el.text if el.text is not None else ''  # guard empty <v/>
            if el.tag.endswith('}c'):  # <c r="A3" t="s"><v>84</v></c>
                # If value is a shared string, use value as an index
                if el.attrib.get('t') == 's':
                    value = strings[int(value)]

                # Strip the row number off the cell reference, e.g. AZ22 -> AZ.
                letter = el.attrib['r']
                while letter[-1].isdigit():
                    letter = letter[:-1]

                # The first row builds the header map; later rows fill `row`.
                if not rows:
                    header[letter] = value.strip()
                elif value != '':
                    # With header=True, key data cells by the header name.
                    if isHeader and letter in header:
                        row[header[letter]] = value
                    else:
                        row[letter] = value

                value = ''
            if el.tag.endswith('}row'):
                rows.append(row)
                row = {}
    return [header, rows]
Example #32
0
#!/home/jinho93/miniconda3/bin/python
"""Annotate checkforce output: force components of selectively-frozen atom
axes (per vasprun.xml selective-dynamics flags) are marked 'conv'."""
from xml.etree.ElementTree import iterparse
import sh
import numpy as np
import sys

# `checkforce -v` prints one line per atom; the last 4 lines are summary text.
out = str(sh.perl('/home/jinho93/bin/checkforce', '-v'))
arr = [r.split() for r in out.split('\n')[:-4]]

# Read the per-atom T/F selective-dynamics flags from vasprun.xml.
selective = []
for event, elem in iterparse('vasprun.xml', events=('end', )):
    if 'name' in elem.attrib.keys() and elem.attrib['name'] == 'selective':
        for i in elem:
            # BUG FIX: the original compared with `r is 'T'` — an identity
            # check that is not guaranteed for strings; use equality.
            selective.append([r == 'T' for r in str(i.text).split()])
        break

for i, j in zip(selective, arr):
    if j:
        for k in range(3):
            if not i[k]:
                # Axis k is frozen, so its force is irrelevant to convergence.
                j[3 + k] = 'conv'
        print(' '.join(j))
Example #33
0
"LastJobStatus",
"DiskUsage" # KB
]

if len(sys.argv) != 3:
  print("Se requieren dos argumentos, el archivo que tiene el historico en XML y el archivo de salida en CSV")
  sys.exit(-1)

filenameXML = sys.argv[1]
if not os.path.isfile(filenameXML):
  print("No se encontro archivo [%s]"%(filenameXML))
  sys.exit(-1)

filenameCSV = sys.argv[2]

doc=iterparse(filenameXML,('start','end'))
csvfile = open(filenameCSV,"w")
writer = csv.DictWriter(csvfile, fieldnames = tags)
writer.writeheader()
# evita el primer encabezado
next(doc)
# Totales
numTasks = 0
totalBytesSent = 0
totalBytesRecv = 0
totalElapsedTime = 0
diskUsed = 0
# Parciales por tarea
user=""
completionDate = 0
jobCurrentStartDate = 0
Example #34
0
# NOTE(review): this fragment looks stitched together from several snippets:
# `potholes_by_zip` is used here before the Counter is created below, and
# `parse_and_remove`, `iterparse` and `elem_stack` are defined elsewhere —
# verify against the original source before running.
for zipcode, num in potholes_by_zip.most_common():
    print(zipcode, num)

from collections import Counter
potholes_by_zip = Counter()


# Count pothole records per ZIP code while streaming the XML.
data = parse_and_remove('potholes.xml', 'row/row')
for pothole in data:
    potholes_by_zip[pothole.findtext('zip')] += 1


for zipcode, num in potholes_by_zip.most_common():
    print(zipcode, num)

# Manual stepping of the iterparse event stream, one (event, elem) at a time.
data = iterparse('potholes.xml',('start','end'))
next(data)

next(data)

next(data)

next(data)

next(data)

next(data)

next(data)

# Detach the element from its parent to keep memory bounded
# (presumably `elem_stack`/`elem` come from a surrounding loop — TODO confirm).
elem_stack[-2].remove(elem)
Example #35
0
    def convert_to_mm(self, inputfile,outputfile):
        """Convert the OPML file *inputfile* into a Freemind/Freeplane .mm
        mind-map written to *outputfile*.

        Walks the OPML 'outline' tags with iterparse, mirroring their nesting
        depth as <node> elements under ``self.mm``; '_note' attributes become
        'richcontent' children.  Reads/writes ``self.mm``, ``self.nodetree``
        and ``self.previous_level``.

        NOTE(review): Python 2 only — uses `print` statements and the `<>`
        operator.
        """

        # Create the tree self.mm and add the map element
        self.mm = ET.Element("map",version="1.5.9")

        depth = 0

        # Iterate through the opml file looking for 'outline' tags
        for (event, node) in iterparse(inputfile, ['start', 'end', 'start-ns', 'end-ns']):

            # end of outline tag encountered
            if event == 'end':
                if node.tag=='outline':
                    # drop back a level
                    depth -= 1

            # start of outline tag encountered
            if event == 'start' and node.tag=='outline':

                #bump the depth 
                depth += 1

                # get the outline tags text
                # may be in the node.text field or the text attribute
                if node.text==None or node.text.strip()=='':
                    try:
                        nodetext=node.attrib['text'].strip()
                    except:
                        nodetext=''
                else:
                    nodetext=node.text.strip()

                # log where we're at
                print depth*' ',depth,'Added',node.tag,'text => '+nodetext+''

                # if at new level create a node element
                if depth > self.previous_level:
                    self.nodetree.append("")
                    attributes={}
                    attributes['TEXT']=nodetext
                    self.nodetree[depth] = ET.SubElement(self.nodetree[depth-1], "node",attrib=attributes)

                    # if theres a note in the 'outline' tag ie attribute with tag '_node'
                    # create the note element
                    try:
                        # obtain note
                        node_note=node.attrib['_note']

                        # remove any non ascii characters to avoid unicode problems
                        # node_note=self.removeNonAscii(node_note)
                    except:
                        # couldn't get a note for this node so set blank
                        node_note=''

                    #if we have a note then add the richcontent element Freemind and Freeplane expect
                    if node_note<>'':
                            try:                                
                                # create richnote tag with note details embedded 
                                attributes={}
                                attributes['TYPE']='DETAILS'
                                note_element='<html><head></head><body>'+ \
                                        node_note + \
                                    '</body></html>'
                                self.nodetree[depth] = ET.SubElement(self.nodetree[depth], "richcontent",attrib=attributes)                                

                                # inserting the note into node tag
                                # if note contains html and it is valid note is added as html
                                # 
                                # however if ElementTree rejects note due to parsing errors
                                # such as badly formed html then the exception below will be 
                                # triggered and note is added with raw 'escaped' text
                                # for example <b> is &lt:b&gt;

                                self.nodetree[depth].append(ET.fromstring(note_element))  

                                # log result
                                print depth*' ','++ Added Note',node_note
                            except:
                                # ElementTree could not parse the opml note in the current outline tag
                                # so no note is added

                                # note data is invalid xml so add the note data as xml CDATA tag
                                print '!!Warning: Invalid data. Note added as raw character data\nNote data=',node_note

                                # unescape html characters to avoid clashes with Freemind/Freeplane parsers
                                node_note=HTMLParser.unescape.__func__(HTMLParser, node_note)

                                # escape utf-8 characters
                                node_note=escape(node_note).encode('ascii', 'xmlcharrefreplace')

                                # remove any non ASCII characters from note
                                # node_note=self.removeNonAscii(node_note)

                                # wrap escaped note in CDATA tag
                                # note_element='<html><head></head><body>'+ \
                                #     '<![CDATA['+ \
                                #         node_note + \
                                #     ']]>' + \
                                #     '</body></html>'
                                # 
                                note_element='<html><head></head><body>'+ \
                                        node_note + \
                                    '</body></html>'

                                self.nodetree[depth].append(ET.fromstring(note_element))  

                else:
                    # finished at current level so jump back a level
                    self.previous_level = depth-1

            if event == 'start' and node.tag=='title':
                # log title found
                print 'Added tag ',node.tag,'==>',node.text

                # add title tag as the first node
                self.nodetree[0]=ET.SubElement(self.mm, "node", attrib={'TEXT':node.text})

        # get the output data
        tree = ET.ElementTree(self.mm)
        root=tree.getroot()
        outputdata=ET.tostring(root)
        # print outputdata

        # create the output .mm file
        f=open(outputfile,'w')
        f.write(self.removeNonAscii(outputdata))
        f.close()

        return
 def convert_xml2csv(self, csv_file, xmlfile):
     """Export TestLink-style <testcase> elements from *xmlfile* into a
     semicolon-delimited CSV at *csv_file*.

     Writes one row per case (plus one row per extra step); columns are
     TCID, CASE_NAME, IMPORTANCE, STATUS, SUMMARY, STEP, Result.  Relies on
     module-level ``importance_map``/``status_map`` and ``self.strip_tags``.

     NOTE(review): Python 2 only — 'wb' csv mode, ``getchildren()``,
     ``.encode('UTF-8')`` on str.
     """
     csvfile = open(csv_file, 'wb')
     spamwriter = csv.writer(csvfile, dialect='excel', delimiter=';')
     spamwriter.writerow([
         'TCID', 'CASE_NAME', 'IMPORTANCE', 'STATUS', 'SUMMARY', 'STEP',
         'Result'
     ])
     for (event, node) in iterparse(xmlfile, events=['end']):
         if node.tag == "testcase":
             # Index layout: 0=TCID, 1=name, 2=importance, 3=status,
             # 4=summary, 5=step action, 6=expected result.
             case_list = ['', '', '', '', '', '', '']
             steps_list = ['', '', '', '', '', '', '']
             case_list[1] = node.attrib['name']
             for child in node:
                 if child.tag == "externalid":
                     # Strip markup/newlines before using as the TCID.
                     text = re.sub('\n|<p>|</p>|\t', '', str(child.text))
                     # print self.strip_tags(text)
                     TCID = self.strip_tags(text)
                 elif child.tag == "summary":
                     text = re.sub('\n|<p>|</p>|\t', '', str(child.text))
                     # print self.strip_tags(text)
                     case_list[4] = self.strip_tags(text)
                 elif child.tag == "importance":
                     # text = re.sub('\n|<p>|</p>|\t', '', str(child.text))
                     # print self.strip_tags(text)
                     # 1-based code in the XML -> human-readable label.
                     case_list[2] = importance_map[
                         int(self.strip_tags(child.text)) - 1]
                 elif child.tag == "status":
                     # text = re.sub('\n|<p>|</p>|\t', '', str(child.text))
                     # print self.strip_tags(text)
                     case_list[3] = status_map[
                         int(self.strip_tags(child.text)) - 1]
                     # A case without <steps> is written immediately.
                     if "steps" not in [item.tag for item in node]:
                         case_list[0] = TCID
                         spamwriter.writerow(case_list)
                         break
                 elif child.tag == "steps":
                     if len(child) > 0:
                         for i in range(len(child)):
                             if i == 0:
                                 # First step rides on the main case row.
                                 for n in range(len(
                                         child.getchildren()[i])):
                                     if child.getchildren()[i].getchildren(
                                     )[n].text is not None:
                                         text = self.strip_tags(
                                             child.getchildren()
                                             [i].getchildren()
                                             [n].text).encode('UTF-8')
                                     else:
                                         text = ''
                                     # print text
                                     if child.getchildren()[i].getchildren(
                                     )[n].tag == 'actions':
                                         case_list[5] = text
                                     elif child.getchildren(
                                     )[i].getchildren(
                                     )[n].tag == 'expectedresults':
                                         case_list[6] = text
                                 case_list[0] = TCID
                                 spamwriter.writerow(case_list)
                             else:
                                 # Subsequent steps get their own sparse rows.
                                 for n in range(len(
                                         child.getchildren()[i])):
                                     if child.getchildren()[i].getchildren(
                                     )[n].text is not None:
                                         text = self.strip_tags(
                                             child.getchildren()
                                             [i].getchildren()
                                             [n].text).encode('UTF-8')
                                     else:
                                         text = ''
                                     # print text
                                     if child.getchildren()[i].getchildren(
                                     )[n].tag == 'actions':
                                         steps_list[5] = text
                                     elif child.getchildren(
                                     )[i].getchildren(
                                     )[n].tag == 'expectedresults':
                                         steps_list[6] = text
                                 steps_list[0] = TCID
                                 spamwriter.writerow(steps_list)
     csvfile.close()
Example #37
0
def skip_exceptions(it):
    """Yield items from iterator *it*, logging and skipping any item whose
    retrieval raises, until the iterator is exhausted.

    Fix: the original re-raised StopIteration inside the generator, which
    PEP 479 (Python 3.7+) turns into a RuntimeError; exhaustion must be
    signalled with ``return`` instead.
    """
    while True:
        try:
            yield next(it)
        except StopIteration:
            return  # normal exhaustion; re-raising would be a RuntimeError
        except Exception as e:
            logging.info(
                'Skipping iteration because of exception {}'.format(e))


try:
    count = 0
    for evt, elem in skip_exceptions(
            iterparse('pmc_result_sm.xml')):  # , events=('start', 'end')):
        if elem.tag == 'article':
            try:
                output = extract_text2(elem)

                if (len(output[0]) < 50) | (len(output[1]) < 1):
                    print("too short or abstract error")

                else:
                    count += 1
                    print('Article found. Count = ' + str(count))
                    with open("text//title_to_text.txt", 'a+') as text_file:
                        text_file.write(output[0].lower().lstrip() + '\n')

                    with open("text//abstract_to_text.txt", 'a+') as text_file:
                        text_file.write(output[1].lower() + '\n')
    def generate_page(self, document="Q", page="2r.json"):
        """Generate a single display page.

        Loads ``<page_location>/<document>/<page>`` (JSON), cleans its
        'text' field, re-parses it as XML, and renders HTML by dispatching
        each element to a ``process_<event>_<tag>`` method when one exists.
        The result is written back into the same JSON file under 'html'
        (when ``self.expanded``) or 'html_abbrev'.
        """
        # Reset per-page rendering state before parsing.
        self.past_first_chapter_div = False
        self.past_first_ab = False
        self.column_number = 0
        self.current_main_column = 'a'
        self.current_subcolumn = None
        self.subcolumn = False
        self.waiting_for_column = []
        self.in_rubric = False

        self.app_pos = 0
        self.app_open = False

        self.choice_pos = 0
        self.choice_open = False
        self.choice_hovers = []

        self.ex_open = False
        self.ex_text = []
        self.am_open = False
        self.am_text = []
        self.amex_pos = 0

        self.abbr_open = False
        self.expan_open = False

        filename = os.path.join(self.page_location, document, page)
        with open(filename, encoding="utf-8") as file_p:
            data = json.load(file_p)
        self.column_structure = self.count_columns(data, document, page)

        # Pre-clean the raw text (app/choice markup) before XML parsing.
        if data['text']:
            cleaned = self.process_app(data['text'].replace('\n', ''),
                                       document, page)
            cleaned = self.process_choice(cleaned, document, page)
        else:
            cleaned = data['text']
        datastream = io.StringIO(cleaned)

        # iterparse is not deprecated
        # https://github.com/PyCQA/pylint/issues/947
        # pylint: disable=deprecated-method
        parser = iterparse(datastream, events=("start", "end"))
        output_text = []

        for event, element in parser:
            try:
                # Dynamic dispatch, e.g. process_start_p / process_end_div.
                new_text = getattr(self, "process_%s_%s" %
                                   (event, element.tag))(element)
            except AttributeError:
                # No handler for this tag: pass its text through unchanged.
                if self.debug:
                    print("Skipping %s." % element.tag)
                if element.text:
                    self.update_text(output_text, element.text)
                    #output_text += element.text
                    new_text = None
            else:
                # NOTE(review): NO_TAIL appears to be a sentinel meaning
                # "suppress this element's tail text" — confirm its definition.
                if new_text == NO_TAIL:
                    pass
                elif new_text:
                    self.update_text(output_text, new_text)
                    #output_text += new_text
            if element.tail and event == 'end':
                if new_text != NO_TAIL:
                    self.update_text(output_text, element.tail)
                    #output_text += element.tail

        if self.expanded:
            data['html'] = ''.join(output_text)
        else:
            data['html_abbrev'] = ''.join(output_text)

        with open(filename, 'w', encoding="utf-8") as file_p:
            json.dump(data, file_p, ensure_ascii=False, indent=4)
Example #39
0
#

# Connect to DB
#

# NOTE(review): the user/password values were redacted to '******' in this
# copy, which makes the line below a syntax error — restore the original
# credentials (or read them from config) before running.  `database` and
# `dbPort` are presumably defined earlier in the file.
conn_string = 'host=localhost dbname=' + database + ' user='******' password='******' port=' + dbPort
#print conn_string

conn = psycopg2.connect(conn_string)
cursor = conn.cursor()

print "\n\nConnected to database", database, "on localhost"

depth = 0

# NOTE(review): Python 2 `print` statements throughout this fragment;
# `cloudhistoryxmlpath`, `xmlschemaversion`, `previous_endtag` and the
# instance fields printed below come from earlier, unseen code.
for (event, node) in iterparse(cloudhistoryxmlpath, ['start', 'end']):
	#
	# Start Event in iterparse <some tag>
	# (NOTE(review): despite the comment above, this branch handles 'end'.)
	if event == 'end':
		print "\n End tag", node.tag, " Previous tag: ",previous_endtag
		if node.tag == "{http://ec2.amazonaws.com/doc/"+xmlschemaversion+"/}item" and previous_endtag == "{http://ec2.amazonaws.com/doc/"+xmlschemaversion+"/}iamInstanceProfile":
			print "\n\n\n\n END instance previous end event tag",previous_endtag
			print "All instance data",reservationId \
				,ownerId,groupId		\
				,instanceId,imageId,name	\
                        	,privateDnsName,dnsName		\
                        	,keyName,amiLaunchIndex,instanceType \
                        	,launchTime, availabilityZone	\
                        	,kernelId,ramdiskId		\
                        	,privateIpAddress,ipAddress,groupName	\
				,rootDeviceType,rootDeviceName,eucanodeip,virtualizationType
Example #40
0
def extract_pages(f, filter_namespaces=False, filter_articles=None):
    """Extract pages from a MediaWiki database dump.

    Parameters
    ----------
    f : file
        File-like object.
    filter_namespaces : list of str or bool
         Namespaces that will be extracted.

    Yields
    ------
    tuple of (str or None, str, str)
        Title, text and page id.

    """
    elem_iter = (node for _, node in iterparse(f, events=("end",)))

    # The dump's XML namespace changes with nearly every format revision,
    # so derive it from the first element we see (part of the metadata)
    # and build the element paths from it.
    first = next(elem_iter)
    namespace = get_namespace(first.tag)
    ns = {"ns": namespace}
    page_tag = "{%(ns)s}page" % ns
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns
    title_path = "./{%(ns)s}title" % ns
    ns_path = "./{%(ns)s}ns" % ns
    pageid_path = "./{%(ns)s}id" % ns

    for node in elem_iter:
        if node.tag != page_tag:
            continue

        title = node.find(title_path).text
        text = node.find(text_path).text

        if filter_namespaces and node.find(ns_path).text not in filter_namespaces:
            text = None

        if filter_articles is not None and not filter_articles(
                node,
                namespace=namespace,
                title=title,
                text=text,
                page_tag=page_tag,
                text_path=text_path,
                title_path=title_path,
                ns_path=ns_path,
                pageid_path=pageid_path):
            text = None

        pageid = node.find(pageid_path).text
        yield title, text or "", pageid  # empty page will yield None

        # Prune the element tree, as per
        # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
        # (no backlink pruning needed since we don't use LXML).  Pruning
        # only <page> elements is enough — they comprise the bulk of the
        # file — and we must keep them alive until ./revision/text is read.
        node.clear()
Example #41
0
#!/usr/bin/env python
"""
Code for etsy job application. Requires python 2.5 (or ElementTree
http://effbot.org/zone/element-index.htm)

Simply execute this file to count the number of users in each city and print 
out a running total.

Bill Mill
12/13/07
http://billmill.org
"""

from urllib import urlopen
from xml.etree.ElementTree import iterparse

# Etsy user ids whose public profile feeds will be fetched.
ids = [42346, 77290, 729]
prefix = "http://api.etsy.com/feeds/xml_user_details.php?id="
# Open one XML feed per user id.
# NOTE(review): Python 2 code -- urllib.urlopen and the bare print statement
# below do not exist in Python 3; the loop variable `id` also shadows the
# builtin of the same name.
docs = [urlopen(prefix + str(id)) for id in ids]

# Running count of how many of these users live in each <city> seen so far;
# the running totals are printed after every <city> element encountered.
totals = {}
for doc in docs:
    for event, elem in iterparse(doc):
        if elem.tag == "city":
            totals[elem.text] = totals.get(elem.text, 0) + 1
            print totals
Example #42
0
 def parse(self):
     """Stream-parse a DrugBank XML dump and populate the instance lookup tables.

     Walks the document with iterparse(), maintaining state_stack as the list
     of tags of all currently-open elements, so state_stack[-2]/[-3] identify
     the parent/grandparent of the element being handled.  Fills the many
     self.drug_to_* and self.target_to_* mappings as elements close.

     NOTE(review): Python 2 only -- uses context.next(); the .encode('ascii',
     'ignore') calls would produce bytes keys under Python 3.
     """
     # get an iterable
     context = iterparse(self.file_name, ["start", "end"])
     # turn it into an iterator
     context = iter(context)
     # get the root element
     event, root = context.next()
     state_stack = [root.tag]
     # Parser state carried between events; each is reset when the matching
     # opening tag is seen and filled when the closing tag is handled.
     drug_id = None
     drug_type = None
     drug_id_partner = None
     current_target = None
     resource = None
     current_property = None
     target_types = set(
         map(lambda x: self.NS + x,
             ["target", "enzyme", "carrier", "transporter"]))
     target_types_plural = set(map(lambda x: x + "s", target_types))
     for (event, elem) in context:
         # Opening tag: push onto the path stack and reset per-scope state.
         if event == "start":
             state_stack.append(elem.tag)
             if len(state_stack) <= 2 and elem.tag == self.NS + "drug":
                 if "type" in elem.attrib:
                     drug_type = elem.attrib["type"]
                 else:
                     drug_type = None
             elif elem.tag == self.NS + "drugbank-id":
                 if "primary" in elem.attrib and state_stack[
                         -3] == self.NS + "drugbank" and state_stack[
                             -2] == self.NS + "drug":
                     drug_id = None
                 elif len(state_stack) > 3 and state_stack[
                         -3] == self.NS + "drug-interactions" and state_stack[
                             -2] == self.NS + "drug-interaction":
                     drug_id_partner = None
             elif elem.tag == self.NS + "resource":
                 resource = None
             elif elem.tag == self.NS + "property":
                 current_property = None
             elif elem.tag in target_types:
                 if state_stack[-2] in target_types_plural:
                     current_target = None
         # Closing tag: record the element's text into the relevant table.
         if event == "end":
             if len(state_stack) <= 2 and elem.tag == self.NS + "drug":
                 if "type" in elem.attrib:
                     drug_type = elem.attrib["type"]
                 else:
                     drug_type = None
             if elem.tag == self.NS + "drugbank-id":
                 if state_stack[-2] == self.NS + "drug":
                     if "primary" in elem.attrib:
                         drug_id = elem.text
                         self.drugs.add(drug_id.upper())
                         if drug_type is not None:
                             self.drug_to_type[drug_id] = drug_type
                         #print drug_id, drug_type
                 elif len(state_stack) > 3 and state_stack[
                         -3] == self.NS + "drug-interactions" and state_stack[
                             -2] == self.NS + "drug-interaction":
                     self.drug_to_interactions.setdefault(drug_id, {})
                     drug_id_partner = elem.text
                     if drug_id_partner not in self.drug_to_interactions[
                             drug_id]:
                         self.drug_to_interactions[drug_id][
                             drug_id_partner] = []
             elif elem.tag == self.NS + "name":
                 # <name> means different things depending on the parent:
                 # the drug itself, a product, a brand, a mixture or a target.
                 if len(state_stack
                        ) <= 3 and state_stack[-2] == self.NS + "drug":
                     self.drug_to_name[drug_id] = elem.text.strip()
                 elif state_stack[-2] == self.NS + "product" and state_stack[
                         -3] == self.NS + "products":
                     product = elem.text
                     product = product.strip().encode('ascii', 'ignore')
                     if product != "":
                         self.drug_to_products.setdefault(
                             drug_id, set()).add(product)
                 elif state_stack[
                         -2] == self.NS + "international-brand" and state_stack[
                             -3] == self.NS + "international-brands":
                     brand = elem.text
                     #idx = brand.find(" [")
                     #if idx != -1:
                     #    brand = brand[:idx]
                     brand = brand.strip().encode('ascii', 'ignore')
                     if brand != "":
                         self.drug_to_brands.setdefault(drug_id,
                                                        set()).add(brand)
                 #elif state_stack[-3] == self.NS+"targets" and state_stack[-2] == self.NS+"target":
                 elif state_stack[-2] == self.NS + "mixture" and state_stack[
                         -3] == self.NS + "mixtures":
                     mixture = elem.text
                     mixture = mixture.strip().encode('ascii', 'ignore')
                     if mixture != "":
                         self.drug_to_mixtures.setdefault(
                             drug_id, set()).add(mixture)
                 elif state_stack[-3] in target_types_plural and state_stack[
                         -2] in target_types:
                     self.target_to_name[current_target] = elem.text
             elif elem.tag == self.NS + "ingredients":
                 if state_stack[-3] == self.NS + "mixtures" and state_stack[
                         -2] == self.NS + "mixture":
                     ingredients = elem.text
                     ingredients = ingredients.strip().encode(
                         'ascii', 'ignore')
                     # `mixture` still holds the last <name> seen inside
                     # this <mixture> element (elements close inner-first).
                     if ingredients != "" and mixture != "":
                         self.mixture_to_ingredients[mixture] = ingredients
             elif elem.tag == self.NS + "description":
                 if state_stack[-2] == self.NS + "drug":
                     self.drug_to_description[drug_id] = elem.text
                 if len(state_stack) > 3 and state_stack[
                         -3] == self.NS + "drug-interactions" and state_stack[
                             -2] == self.NS + "drug-interaction":
                     self.drug_to_interactions[drug_id][
                         drug_id_partner].append(elem.text)
             elif elem.tag == self.NS + "group":
                 if state_stack[-2] == self.NS + "groups":
                     self.drug_to_groups.setdefault(drug_id,
                                                    set()).add(elem.text)
             elif elem.tag == self.NS + "indication":
                 if state_stack[-2] == self.NS + "drug":
                     self.drug_to_indication.setdefault(drug_id, [])
                     self.drug_to_indication[drug_id].append(elem.text)
             elif elem.tag == self.NS + "pharmacodynamics":
                 if state_stack[-2] == self.NS + "drug":
                     self.drug_to_pharmacodynamics[drug_id] = elem.text
             elif elem.tag == self.NS + "mechanism-of-action":
                 if state_stack[-2] == self.NS + "drug":
                     self.drug_to_moa[drug_id] = elem.text
             elif elem.tag == self.NS + "toxicity":
                 if state_stack[-2] == self.NS + "drug":
                     self.drug_to_toxicity[drug_id] = elem.text
             elif elem.tag == self.NS + "synonym":
                 if state_stack[-2] == self.NS + "synonyms" and state_stack[
                         -3] == self.NS + "drug":
                     synonym = elem.text
                     # Drop a trailing " [...]" qualifier from the synonym.
                     idx = synonym.find(" [")
                     if idx != -1:
                         synonym = synonym[:idx]
                     synonym = synonym.strip().encode('ascii', 'ignore')
                     if synonym != "":
                         self.drug_to_synonyms.setdefault(
                             drug_id, set()).add(synonym)
             elif elem.tag == self.NS + "category":
                 if state_stack[-2] == self.NS + "categories":
                     self.drug_to_categories.setdefault(drug_id, set()).add(
                         elem.text)
             elif elem.tag == self.NS + "atc-code":
                 if state_stack[-2] == self.NS + "atc-codes":
                     self.drug_to_atc_codes.setdefault(drug_id, set()).add(
                         elem.attrib["code"])
             elif elem.tag == self.NS + "id":
                 if state_stack[-3] in target_types_plural and state_stack[
                         -2] in target_types:
                     current_target = elem.text
                     # Value layout: [target category tag, known-action
                     # flag (bool), list of action strings].
                     self.drug_to_target_to_values.setdefault(drug_id, {})
                     self.drug_to_target_to_values[drug_id][
                         current_target] = [state_stack[-2], False, []]
                     #print current_target
             elif elem.tag == self.NS + "action":
                 if state_stack[-3] in target_types and state_stack[
                         -2] == self.NS + "actions":
                     self.drug_to_target_to_values[drug_id][current_target][
                         2].append(elem.text)
             elif elem.tag == self.NS + "known-action":
                 if state_stack[-2] in target_types:
                     if elem.text == "yes":
                         self.drug_to_target_to_values[drug_id][
                             current_target][1] = True
                         if len(self.drug_to_target_to_values[drug_id]
                                [current_target][2]) == 0:
                             #print "Inconsistency with target action: {} {}".format(drug_id, current_target)
                             pass
             elif elem.tag == self.NS + "gene-name":
                 if state_stack[-3] in target_types and state_stack[
                         -2] == self.NS + "polypeptide":
                     self.target_to_gene[current_target] = elem.text
             elif elem.tag == self.NS + "kind":
                 if state_stack[
                         -3] == self.NS + "calculated-properties" and state_stack[
                             -2] == self.NS + "property":
                     current_property = elem.text  # InChIKey or SMILES
             elif elem.tag == self.NS + "value":
                 if state_stack[
                         -3] == self.NS + "calculated-properties" and state_stack[
                             -2] == self.NS + "property":
                     if current_property == "InChIKey":
                         inchi_key = elem.text  # strip InChIKey=
                         if inchi_key.startswith("InChIKey="):
                             inchi_key = inchi_key[len("InChIKey="):]
                         self.drug_to_inchi_key[drug_id] = inchi_key
                     if current_property == "SMILES":
                         self.drug_to_smiles[drug_id] = elem.text
             elif elem.tag == self.NS + "resource":
                 if state_stack[
                         -3] == self.NS + "external-identifiers" and state_stack[
                             -2] == self.NS + "external-identifier":
                     resource = elem.text
             elif elem.tag == self.NS + "identifier":
                 if state_stack[
                         -3] == self.NS + "external-identifiers" and state_stack[
                             -2] == self.NS + "external-identifier":
                     # External ids of a target polypeptide vs. of the drug.
                     if state_stack[-5] in target_types and state_stack[
                             -4] == self.NS + "polypeptide":
                         if resource == "UniProtKB":
                             self.target_to_uniprot[
                                 current_target] = elem.text
                         if resource == "UniProt Accession":
                             self.target_to_uniprotentry[
                                 current_target] = elem.text
                     elif state_stack[-4] == self.NS + "drug":
                         if resource == "PubChem Compound":
                             self.drug_to_pubchem[drug_id] = elem.text
                         elif resource == "PubChem Substance":
                             self.drug_to_pubchem_substance[
                                 drug_id] = elem.text
                         elif resource == "ChEBI":
                             self.drug_to_chebi[drug_id] = elem.text
                         elif resource == "ChEMBL":
                             self.drug_to_chembl[drug_id] = elem.text
                         elif resource == "KEGG Drug":
                             self.drug_to_kegg[drug_id] = elem.text
                         elif resource == "KEGG Compound":
                             self.drug_to_kegg_compound[drug_id] = elem.text
                         elif resource == "UniProtKB":
                             self.drug_to_uniprot[drug_id] = elem.text
                         elif resource == "PharmGKB":
                             self.drug_to_pharmgkb[drug_id] = elem.text
             # Free the element and pop its tag for every closing event.
             elem.clear()
             state_stack.pop()
     root.clear()
     return
Example #43
0
#! /usr/bin/env/python
# -*- coding:utf-8 -*-
from xml.etree.ElementTree import iterparse
import csv
import sys

# Convert podcasts.opml into CSV on stdout: one row per podcast entry,
# tagged with the name of the group (parent outline) it appears under.
writer = csv.writer(sys.stdout, quoting=csv.QUOTE_NONNUMERIC)
group_name = ''

for event, node in iterparse('podcasts.opml', events=['start']):
    # Anything that is not an <outline> element is irrelevant here.
    if node.tag != 'outline':
        continue
    xml_url = node.attrib.get('xmlUrl')
    if xml_url:
        # Leaf outline: an actual podcast entry -- emit one CSV row.
        writer.writerow((group_name,
                         node.attrib['text'],
                         xml_url,
                         node.attrib.get('htmlUrl'),
                         ''))
    else:
        # Group outline: remember its name for the entries beneath it.
        group_name = node.attrib['text']
Example #44
0
    def __getitem__(self, identifier):
        """
        Access the item with id 'identifier'.

        Either use linear, binary or interpolated search.

        Arguments:
            identifier (str): native id of the item to access;
                'TIC' (any case) selects the total-ion-current chromatogram

        Returns:
            data (str): text associated with the given identifier

        Raises:
            StopIteration: if identifier is 'TIC' but the file contains no
                chromatogram element with id 'TIC'
        """
        #############################################################################
        # DOES NOT HOLD IF NUMBERS DONT START WITH ONE AND/OR DONT INCREASE BY ONE  #
        # TODO FIXME                                                                #
        #############################################################################

        # Rewind -- every lookup strategy below reads from the start or seeks.
        self.file_handler.seek(0)

        spectrum = None
        if str(identifier).upper() == 'TIC':
            # Linear scan for the chromatogram whose id attribute is 'TIC'.
            # print(str(identifier).upper())
            found = False
            mzmliter = iter(iterparse(self.file_handler, events=['end']))
            while found is False:
                # ('STOP', 'STOP') is a sentinel default from next() that
                # signals the iterator is exhausted.
                event, element = next(mzmliter, ('STOP', 'STOP'))
                if event == 'end':
                    if element.tag.endswith('}chromatogram'):
                        if element.get('id') == 'TIC':
                            found = True
                            spectrum = spec.Chromatogram(
                                element,
                                measured_precision = 5e-6
                            )
                elif event == 'STOP':
                    # File exhausted without finding a TIC chromatogram.
                    raise StopIteration

        elif identifier in self.offset_dict:
            # Known byte offset: seek to it, slice the raw element text out
            # of the file and parse just that fragment.

            start = self.offset_dict[identifier]
            with open(self.path, 'rb') as seeker:
                seeker.seek(start[0])
                start, end = self._read_to_spec_end(seeker)
            self.file_handler.seek(start, 0)
            data     = self.file_handler.read(end - start)
            if data.startswith('<spectrum'):
                spectrum = spec.Spectrum(
                    XML(data),
                    measured_precision = 5e-6
                )
            elif data.startswith('<chromatogram'):
                spectrum = spec.Chromatogram(
                    XML(data)
                )
        elif type(identifier) == str:
            # String id with no cached offset: fall back to a string search.
            return self._search_string_identifier(
                identifier
            )
        else:
            # Numeric id: interpolation search over the spectrum numbers.
            # NOTE(review): per the TODO banner above, this assumes ids start
            # at one and increase by one -- confirm for this file type.
            spectrum = self._interpol_search(identifier)

        return spectrum
Example #45
0
def parse_monsters(file, out):
    """Stream <monster> elements from *file*, appending converted dicts to *out*.

    Uses iterparse start events to track which element we are inside
    (parent_tag) and end events to copy each field into the monster dict
    built by make_monster().
    """
    # Ability-score tags -> keys in m['Abilities'].
    ability_keys = {'str': 'Str', 'dex': 'Dex', 'con': 'Con',
                    'int': 'Int', 'wis': 'Wis', 'cha': 'Cha'}
    # Sub-entity tags -> the monster list each one is appended to.
    entity_lists = {'trait': 'Traits', 'action': 'Actions',
                    'reaction': 'Reactions', 'legendary': 'LegendaryActions'}

    m = None
    parent_tag = ''
    monster_type = ['', '', '']
    monster_passive_perception = 0
    monster_entity_name = ''
    monster_entity_text = []

    for event, elem in iterparse(file, ('start', 'end')):
        tag = elem.tag
        value = elem.text or ''

        if event == 'start':
            if tag == 'monster':
                parent_tag = tag
                m = make_monster()
            elif tag in entity_lists:
                parent_tag = tag
                monster_entity_text = []
        else:
            if tag == 'monster':
                # Assemble the combined type line and finish the record.
                m['Type'] = '{0} {1}, {2}'.format(
                    monster_type[SIZE], monster_type[TYPE],
                    monster_type[ALIGNMENT])
                m['Senses'].append('passive Perception {0}'.format(
                    monster_passive_perception))
                out.append(m)
            elif tag == 'name':
                # A <name> either names the monster or a sub-entity,
                # depending on what we are currently inside.
                if parent_tag == 'monster':
                    m['Name'] = value
                else:
                    monster_entity_name = value
            elif tag == 'size':
                monster_type[SIZE] = size_convert[value]
            elif tag == 'type':
                # The source book is appended after the last ', '.
                monster_type[TYPE], m['Source'] = value.rsplit(', ', 1)
            elif tag == 'alignment':
                monster_type[ALIGNMENT] = value
            elif tag == 'ac':
                m['AC'] = parse_value_notes(value)
            elif tag == 'hp':
                m['HP'] = parse_value_notes(value)
            elif tag == 'speed':
                m['Speed'] = parse_array(value)
            elif tag in ability_keys:
                m['Abilities'][ability_keys[tag]] = int(value)
            elif tag == 'save':
                m['Saves'] = parse_name_modifier(value)
            elif tag == 'skill':
                m['Skills'] = parse_name_modifier(value)
            elif tag == 'resist':
                m['DamageResistances'] = parse_array(value, True)
            elif tag == 'vulnerable':
                m['DamageVulnerabilities'] = parse_array(value)
            elif tag == 'immune':
                m['DamageImmunities'] = parse_array(value, True)
            elif tag == 'conditionImmune':
                m['ConditionImmunities'] = parse_array(value)
            elif tag == 'senses':
                m['Senses'] = parse_array(value)
            elif tag == 'passive':
                monster_passive_perception = int(value)
            elif tag == 'languages':
                m['Languages'] = parse_array(value)
            elif tag == 'cr':
                m['Challenge'] = value
            elif tag in entity_lists:
                # Close out the accumulated trait/action/reaction/legendary.
                parent_tag = 'monster'
                m[entity_lists[tag]].append({
                    'Name': monster_entity_name,
                    'Content': '<br />'.join(monster_entity_text),
                    'Usage': '',
                })
            elif tag == 'text':
                monster_entity_text.append(value)

            elem.clear()
Example #46
0
		return 0
#
# is Image In DB def ends
#


# Connect to DB
#

# NOTE(review): the user/password portion of this connection string was
# redacted in the original source ("******").  dbUser/dbPassword are assumed
# to be defined earlier in the file alongside `database` and `dbPort` --
# confirm the actual variable names before deploying.
conn_string = ('host=localhost dbname=' + database +
               ' user=' + dbUser + ' password=' + dbPassword +
               ' port=' + dbPort)

conn = psycopg2.connect(conn_string)
cursor = conn.cursor()

# iterparse using default end event since end,start caused None child elements
context = iterparse(cloudhistoryxmlpath, events=("start", "end"))
root = None

for event, node in context:
    if event == "start" and root is None:
        root = node  # the first element is root
    if event == 'end':
        # Only EC2 <item> elements describe one image record.
        if node.tag == "{http://ec2.amazonaws.com/doc/" + xmlschemaversion + "/}item":
            # Insert the image row only if it is not already stored.
            # NOTE(review): imageId and the other fields are populated by
            # parsing code truncated from this excerpt -- confirm they are
            # set before this point.
            if imageNotAlreadyInDb(imageId):
                insertToDb(sampledatetime, imageId,
                           imageLocation, imageState,
                           imageOwnerId, isPublic, architecture,
                           platform, imageType, name, description,
                           rootDeviceType, rootDeviceName, virtualizationType)
Example #47
0
    def parse(self, filepath, bin_size=1000000, resume=False) -> None:
        """Stream leg/vehicle events from an XML event file into the database.

        Iterates <event> elements with iterparse, buffers up to *bin_size*
        parsed rows, and flushes each batch through self.database.  With
        resume=True, rows already stored in earlier runs (leg count plus
        vehicle count) are skipped before real parsing continues.

        Arguments:
            filepath: path of the XML events file to parse
            bin_size: number of events to buffer before each SQL flush
            resume: skip events already pushed by a previous run
        """

        parser = iterparse(filepath, events=('end', 'start'))
        # First item is the document root's start event; kept so processed
        # children can be cleared to bound memory use.
        evt, root = next(parser)

        # Event types that produce database rows.
        types: Tuple[str, ...] = ('entered link', 'left link',
                                  'PersonEntersVehicle', 'PersonLeavesVehicle')
        links: Dict[str, int] = {}

        # Buffered rows: (leg_id, vehicle, None, link, time, entered-flag).
        leg_evts: List[Tuple[int, int, None, int, int, int]] = list()
        # Buffered rows: (veh_id, vehicle, person, time, entered-flag).
        veh_evts: List[Tuple[int, int, int, int, int]] = list()
        leg_id: int = 0
        veh_id: int = 0
        time: int = 0
        bin_count: int = 0
        total_count: int = 0

        pr.print('Fetching network link data.', time=True)
        links = dict(self.database.fetch_network())
        pr.print('Network link data fetch completed.', time=True)

        if resume:
            pr.print('Finding where we left off parsing last.', time=True)
            leg_id = self.database.get_leg_count()
            veh_id = self.database.get_veh_count()
            # Number of already-stored events to skip past.
            offset = leg_id + veh_id
            pr.print(f'Skipping to event {offset} of XML file.', )
        else:
            # NOTE(review): this branch runs when NOT resuming, yet the
            # message says "Resuming" -- probably meant "Beginning"; confirm
            # before changing the string.
            pr.print('Resuming XML leg/vehicle event parsing.', time=True)

        pr.print(f'Event Parsing Progress',
                 progress=0,
                 persist=True,
                 replace=True,
                 frmt='bold')

        for evt, elem in parser:
            if elem.tag == 'event' and evt == 'end':
                etype = elem.attrib['type']
                # Resume mode: count events without storing them until we
                # reach the offset recorded in the database.
                if resume and etype in types:
                    bin_count += 1
                    total_count += 1
                    if bin_count >= bin_size:
                        time = int(float(elem.attrib['time']))
                        root.clear()
                        bin_count = 0
                        pr.print(f'Skipped to event {total_count}.')
                        pr.print(f'Event Parsing Progress',
                                 progress=time / 86400,
                                 persist=True,
                                 replace=True,
                                 frmt='bold')
                    if total_count == offset:
                        # Caught up with what was stored; switch to parsing.
                        time = int(float(elem.attrib['time']))
                        root.clear()
                        bin_count = 0
                        resume = False
                        pr.print(f'Skipped to event {total_count}.', time=True)
                        pr.print('Event skipping complete.', time=True)
                        pr.print('Resuming XML leg/vehicle event parsing.',
                                 time=True)
                        pr.print(f'Event Parsing Progress',
                                 progress=time / 86400,
                                 persist=True,
                                 replace=True,
                                 frmt='bold')
                    continue

                if etype == 'entered link':
                    time = int(float(elem.attrib['time']))
                    leg_evts.append((leg_id, int(elem.attrib['vehicle']), None,
                                     links[elem.attrib['link']], time, 1))
                    bin_count += 1
                    leg_id += 1
                elif etype == 'left link':
                    time = int(float(elem.attrib['time']))
                    leg_evts.append((leg_id, int(elem.attrib['vehicle']), None,
                                     links[elem.attrib['link']], time, 0))
                    bin_count += 1
                    leg_id += 1
                elif etype == 'PersonEntersVehicle':
                    time = int(float(elem.attrib['time']))
                    veh_evts.append((veh_id, int(elem.attrib['vehicle']),
                                     int(elem.attrib['person']), time, 1))
                    bin_count += 1
                    veh_id += 1
                elif etype == 'PersonLeavesVehicle':
                    time = int(float(elem.attrib['time']))
                    veh_evts.append((veh_id, int(elem.attrib['vehicle']),
                                     int(elem.attrib['person']), time, 0))
                    bin_count += 1
                    veh_id += 1

                # Flush a full buffer to SQL and release parsed elements.
                # NOTE(review): progress = time/86400 assumes one simulated
                # day; total_count adds bin_size rather than bin_count --
                # confirm both are intended.
                if bin_count >= bin_size:
                    total_count += bin_size
                    pr.print(f'Pushing {bin_count} events to SQL database.',
                             time=True)
                    self.database.write_leg_evts(leg_evts)
                    self.database.write_veh_evts(veh_evts)
                    root.clear()
                    leg_evts = []
                    veh_evts = []
                    bin_count = 0
                    pr.print(f'Resuming XML leg/vehicle event parsing.',
                             time=True)
                    pr.print(f'Event Parsing Progress',
                             progress=time / 86400,
                             persist=True,
                             replace=True,
                             frmt='bold')

        # Final flush for the partial last batch.
        total_count += bin_size
        pr.print(f'Pushing {bin_count} events to SQL database.', time=True)
        self.database.write_leg_evts(leg_evts)
        self.database.write_veh_evts(veh_evts)
        pr.print(f'Event Parsing Progress',
                 progress=1,
                 persist=True,
                 replace=True,
                 frmt='bold')
        pr.push()
        pr.print('XML leg/vehicle event parsing complete.', time=True)
        pr.print(f'A total of {total_count} events were parsed.', time=True)
Example #48
0
def extract_definitions():
    """Build a mapping of dictionary headword -> list of cleaned definitions.

    Streams cache_abs_path('gcide_entries.xml') with iterparse, collects the
    <def> text of every <entry>, splits it on ';' and strips non-ASCII
    characters before storing each piece under the entry's 'key' attribute.
    """
    input_file = cache_abs_path('gcide_entries.xml')

    # Headword -> list of cleaned definition strings.
    webster_dictionary = defaultdict(list)
    key_list = []          # every 'key' attribute seen (kept for debugging)
    definitions_list = []  # raw <def> text values (kept for debugging)

    for _event, node in iterparse(input_file):
        if node.tag != 'entry':
            continue
        key = node.attrib.get('key')
        key_list.append(key)
        for child in node.iter():
            if child.tag != 'def':
                continue
            definition = ''.join(itertext(child))
            definitions_list.append(child.text)
            if definition is None:
                continue
            # Normalize separators so they do not split definitions:
            # "; as" introduces an example, "; --" introduces a note.
            definition = definition.replace("; as", " as")
            definition = definition.replace("; --", " ").strip('\n')
            # Individual definitions are ';'-separated inside one <def>.
            for piece in definition.split(';'):
                # Drop non-ASCII characters (they broke downstream code),
                # then trailing spaces and newlines.
                ascii_text = ''.join(ch for ch in piece if ord(ch) < 128)
                ascii_text = ascii_text.rstrip(' ').rstrip('\n')
                webster_dictionary[key].append(ascii_text)
        # Release the parsed entry to keep memory bounded.
        node.clear()

    return webster_dictionary
from xml.etree.ElementTree import iterparse

# Print one line per iterparse event showing nesting depth (as a dotted
# prefix), the event name, the element tag and the Element object's id.
depth = 0
prefix_width = 8
prefix_dots = '.' * prefix_width
line_template = ''.join([
    '{prefix:<0.{prefix_len}}',
    '{event:<8}',
    '{suffix:<{suffix_len}} ',
    '{node.tag:<12} ',
    '{node_id}',
])

EVENT_NAMES = ['start', 'end', 'start-ns', 'end-ns']

for (event, node) in iterparse('podcasts.opml', EVENT_NAMES):
    if event == 'end':
        depth -= 1

    prefix_len = depth * 2

    print(line_template.format(
        prefix=prefix_dots,
        prefix_len=prefix_len,
        suffix='',
        suffix_len=(prefix_width - prefix_len),
        node=node,
        node_id=id(node),
        event=event,
    ))

    # BUG FIX: the increment on 'start' events was missing, so depth only
    # ever decreased -- the first 'end' event made prefix_len negative and
    # line_template.format() raised ValueError on the negative precision.
    # Incrementing after the print keeps the start line at the parent depth.
    if event == 'start':
        depth += 1
Example #50
0

class XMLNamespaces:
    """Expand short namespace prefixes in paths to '{uri}' Clark notation."""

    def __init__(self, **uris):
        # Maps prefix -> namespace URI pre-wrapped in braces.
        self.namespaces = {}
        for prefix, uri in uris.items():
            self.register(prefix, uri)

    def register(self, name, uri):
        """Add (or replace) a prefix -> namespace mapping."""
        self.namespaces[name] = '{%s}' % uri

    def __call__(self, path):
        """Return *path* with every '{prefix}' placeholder expanded."""
        return path.format_map(self.namespaces)


# Demonstrate XMLNamespaces: query namespaced elements without spelling out
# the full '{uri}' prefix in every path.
doc = parse('data/sample.xml')
ns = XMLNamespaces(html='http://www.w3.org/1999/xhtml')

html_path = ns('content/{html}html')
e = doc.find(html_path)
print(e)

title_path = ns('content/{html}html/{html}head/{html}title')
text = doc.findtext(title_path)
print(text)

print()

# you can get a bit more information about the scope of namespace processing
# if you use the iterparse() function
ns_events = ('end', 'start-ns', 'end-ns')
for evt, elem in iterparse('data/sample.xml', ns_events):
    print(evt, elem)
Example #51
0
def extractArticles(filename, collection, articlesNeeded=float('inf')):
    """Stream a MediaWiki-style XML dump and insert new articles into MongoDB.

    filename -- path to the XML dump, parsed incrementally with iterparse.
    collection -- pymongo collection; articles are insert_one()'d into it.
    articlesNeeded -- stop after storing this many articles (default: all).

    Skips redirects, pages whose title carries a namespace prefix
    ("Xxx:..."), pages missing title or text, and pages whose id is not
    greater than the largest _id already in the collection.  Returns a
    dict with "numStored" and "numSkipped" counts.

    NOTE(review): Python 2 only (print statements, doc.next(), long);
    relies on module-level NOPAGE/INPAGE constants and extractTag().
    """
    # Initialize variables.
    currentState = NOPAGE
    articleDict = {}
    skipArticle = False
    stats = {"numStored": 0, "numSkipped": 0}
    # Largest _id already stored; used to resume without re-inserting.
    lastId = collection.find().sort([("_id", pymongo.DESCENDING)
                                     ]).limit(1)[0]["_id"]

    # Loop through every tag in the document.  Keep the root element so
    # processed subtrees can be freed with root.clear() after each page.
    doc = iter(iterparse(filename, ('start', 'end')))
    _, root = doc.next()
    for event, elem in doc:
        if event == 'start':
            extractedTag = extractTag(elem.tag)

            # Tags informing state: step the counter forward when a page
            # opens, and again when a revision opens inside a page.
            if currentState == NOPAGE and extractedTag == "page":
                currentState += 1
            if currentState == INPAGE and extractedTag == "revision":
                currentState += 1

        elif event == 'end':
            # Parse XML end events
            extractedTag = extractTag(elem.tag)

            # Tags informing state.
            if extractedTag == "page":
                # Update stats and potentially save article.
                if skipArticle or "text" not in articleDict or "title" not in articleDict:
                    stats["numSkipped"] += 1
                else:
                    collection.insert_one(articleDict)
                    stats["numStored"] += 1
                    if stats["numStored"] >= articlesNeeded:
                        return stats

                # Report progress every 1000 stored/skipped pages.
                if not skipArticle and stats["numStored"] % 1000 == 0:
                    print "Stored {} articles so far...".format(
                        stats["numStored"])
                    sys.stdout.flush()
                elif skipArticle and stats["numSkipped"] % 1000 == 0:
                    print "Skipped {} articles so far...".format(
                        stats["numSkipped"])
                    sys.stdout.flush()

                # Page ended, reset state.
                currentState = NOPAGE
                articleDict.clear()
                skipArticle = False

                # Clean memory.
                root.clear()

            elif extractedTag == "revision":
                currentState -= 1

            # Skip further processing if skipping article.
            if skipArticle:
                continue

            # Tags producing information.
            if extractedTag == "title":
                articleDict["title"] = elem.text
                # Empty titles and "Namespace:Title" pages are skipped.
                if not articleDict["title"] or re.match(
                        "[^ ]*:", articleDict["title"]):
                    skipArticle = True
            elif currentState == INPAGE and extractedTag == "id":
                # Only the page-level <id> is taken (revision ids arrive
                # while currentState is past INPAGE).
                articleDict["_id"] = long(elem.text)
                if articleDict["_id"] <= lastId:
                    skipArticle = True
            elif extractedTag == "timestamp":
                articleDict["timestamp"] = elem.text
            elif extractedTag == "text":
                articleDict["text"] = elem.text
                if not articleDict["text"] or articleDict["text"].startswith(
                        "#REDIRECT"):
                    skipArticle = True

    # Ran out of articles, return some stats
    return stats
Example #52
0
    def run(self, config):
        """Parse the network XML file named in ``config`` and load its
        nodes and links into the database in ``bin_size`` batches.

        config -- mapping with a 'run' section providing 'force',
            'network_file', 'bin_size' and 'create_idxs'.

        NOTE(review): assumes the <nodes> section precedes <links> in the
        file (nodes are flushed when the <links> tag opens) -- confirm
        for producers other than the one this was written for.
        """
        # Typo fix in the log message: was "Prallocating".
        pr.print('Preallocating process files and tables.', time=True)
        force = config['run']['force']
        self.create_tables('links', 'nodes', force=force)

        pr.print(f'Loading process metadata and resources.', time=True)
        network_path = config['run']['network_file']
        bin_size = config['run']['bin_size']

        # Transparently support gzip-compressed network files.
        if network_path.split('.')[-1] == 'gz':
            network_file = gzip.open(network_path, mode='rb')
        else:
            network_file = open(network_path, mode='rb')

        # Keep a handle on the document root so parsed subtrees can be
        # released with root.clear() between batches.
        parser = iter(iterparse(network_file, events=('start', 'end')))
        evt, root = next(parser)

        links = []
        nodes = []
        count = 0

        for evt, elem in parser:
            if evt == 'start':
                if elem.tag == 'nodes':
                    pr.print('Starting road node parsing.', time=True)
                elif elem.tag == 'links':
                    # The <links> section starts: flush any still-buffered
                    # nodes and reset the counter for link batching.
                    pr.print(
                        f'Pushing {count % bin_size} nodes to the '
                        'database.',
                        time=True)
                    self.database.write_nodes(nodes)
                    nodes = []
                    root.clear()
                    count = 0
                    pr.print('Starting road link parsing.', time=True)
            elif evt == 'end':
                if elem.tag == 'node':
                    # Node position is stored as a WKT point string.
                    nodes.append((str(elem.get('id')),
                                  f'POINT({elem.get("x")} {elem.get("y")})'))
                    count += 1
                    if count % bin_size == 0:
                        pr.print(
                            f'Pushing {bin_size} nodes to '
                            'the database.',
                            time=True)
                        self.database.write_nodes(nodes)
                        nodes = []
                        root.clear()
                        pr.print(f'Continuing nodes parsing.', time=True)
                elif elem.tag == 'link':
                    links.append(
                        (str(elem.get('id')), str(elem.get('from')),
                         str(elem.get('to')), float(elem.get('length')),
                         float(elem.get('freespeed')),
                         float(elem.get('capacity')),
                         float(elem.get('permlanes')), int(elem.get('oneway')),
                         str(elem.get('modes'))))
                    count += 1
                    if count % bin_size == 0:
                        pr.print(
                            f'Pushing {bin_size} links to '
                            'the database.',
                            time=True)
                        self.database.write_links(links)
                        links = []
                        root.clear()
                        pr.print(f'Continuing link parsing.', time=True)

        # Flush the final partial batch of links (a batch ending exactly on
        # a bin boundary was already written inside the loop).
        if count % bin_size != 0:
            pr.print(f'Pushing {count % bin_size} links to the database.',
                     time=True)
            self.database.write_links(links)
            links = []
            root.clear()

        network_file.close()

        pr.print('Network road parsing complete.', time=True)

        if config['run']['create_idxs']:
            pr.print(f'Creating indexes for module tables.', time=True)
            self.create_idxs()
            pr.print(f'Index creation complete.', time=True)
Example #53
0
def parse(data):
    """Lazily yield elements whose tag is in TAGS_I_CARE_ABOUT.

    Each yielded element is cleared once the consumer advances, keeping
    memory bounded while streaming large documents.
    """
    for _, node in iterparse(data):
        if node.tag not in TAGS_I_CARE_ABOUT:
            continue
        yield node
        node.clear()
Example #54
0
# encoding: utf-8
#
# Copyright (c) 2010 Doug Hellmann.  All rights reserved.
#
"""Show the events encountered while processing an XML input
"""
#end_pymotw_header

from xml.etree.ElementTree import iterparse

# Track nesting depth so each printed event is indented to its position
# in the XML tree.
depth = 0
prefix_width = 8
prefix_dots = '.' * prefix_width
# '{prefix:<0.{prefix_len}}' truncates the dots to the current indent width;
# the empty suffix pads back out so the tag column stays aligned.
line_template = '{prefix:<0.{prefix_len}}{event:<8}{suffix:<{suffix_len}} {node.tag:<12} {node_id}'

for (event, node) in iterparse('podcasts.opml',
                               ['start', 'end', 'start-ns', 'end-ns']):
    if event == 'end':
        depth -= 1

    prefix_len = depth * 2

    # Parenthesized so this single-argument print works under both
    # Python 2 (statement) and Python 3 (function).
    print(line_template.format(
        prefix=prefix_dots,
        prefix_len=prefix_len,
        suffix='',
        suffix_len=(prefix_width - prefix_len),
        node=node,
        node_id=id(node),
        event=event,
    ))

    # Bug fix (restores the PyMOTW original): without this increment the
    # depth only ever decreased and went negative on the first 'end'.
    if event == 'start':
        depth += 1
Example #55
0
    def parse(self, filepath, bin_size=100000):
        """Stream-parse an XML plans file and push plans, activities and
        routes to the SQL database in batches of ``bin_size`` plans.

        Only the plan marked selected="yes" for each person is recorded;
        act/leg elements of unselected plans are ignored.

        NOTE(review): ``selected`` is first assigned when a <plan> start
        tag is seen; an end tag processed before any <plan> would raise
        NameError -- confirm the input always nests act/leg inside plan.
        """

        pr.print(f'Beginning XML input plan parsing from {filepath}.',
                 time=True)
        pr.print('Plan parsing progress:',
                 progress=0,
                 persist=True,
                 frmt='bold')

        # XML parser; keep the document root so parsed subtrees can be
        # released with root.clear() after each batch.
        parser = iterparse(filepath, events=('start', 'end'))
        parser = iter(parser)
        evt, root = next(parser)

        # bin counter (plans in the current batch) and running total
        bin_count = 0
        total_count = 0

        # tabular data buffered for the next database write
        plans = []
        activities = []
        routes = []

        # indexes: current agent id plus per-plan route/activity counters
        agent = 0
        route = 0
        activity = 0

        # other important info: distinct travel modes seen in this plan
        modes = set()

        # iterate over XML tags
        for evt, elem in parser:
            if evt == 'start':
                if elem.tag == 'person':
                    agent = int(elem.attrib['id'])
                if elem.tag == 'plan':
                    # Gate all 'end' handling on the selected flag.
                    if elem.attrib['selected'] != 'yes':
                        selected = False
                    else:
                        selected = True
            elif evt == 'end' and selected:
                if elem.tag == 'plan':
                    plans.append([  # PLANS
                        agent,  # agent_id
                        route + activity,  # size
                        len(modes)  # mode_count
                    ])

                    # Reset per-plan state for the next plan.
                    modes = set()
                    route = 0
                    activity = 0
                    bin_count += 1

                    if bin_count >= bin_size:
                        pr.print(f'Pushing {bin_count} plans to SQL server.',
                                 time=True)

                        self.database.write_plans(plans)
                        self.database.write_activities(activities)
                        self.database.write_routes(routes)

                        # Free parsed elements and start a fresh batch.
                        root.clear()
                        plans = []
                        activities = []
                        routes = []
                        total_count += bin_count
                        bin_count = 0

                        pr.print('Resuming XML input plan parsing.', time=True)
                        # NOTE(review): 2947013 is a hard-coded expected total
                        # plan count for this dataset; the progress fraction
                        # is wrong for other inputs.
                        pr.print('Plan parsing progress:',
                                 progress=total_count / 2947013,
                                 persist=True,
                                 frmt='bold')

                elif elem.tag == 'act':
                    # Start time is end_time minus duration; when 'dur' is
                    # absent the duration equals end_time, so start becomes 0.
                    end_time = self.parse_time(elem.attrib['end_time'])
                    dur_time = end_time if 'dur' not in elem.attrib else self.parse_time(
                        elem.attrib['dur'])
                    act_type = self.encoding['activity'][elem.attrib['type']]

                    activities.append([  # ACTIVITIES
                        agent,  # agent_id
                        activity,  # act_index
                        end_time - dur_time,  # start_time
                        end_time,  # end_time
                        act_type,  # act_type
                        elem.attrib['x'],  # x
                        elem.attrib['y'],  # y
                        None  # maz
                    ])
                    activity += 1

                elif elem.tag == 'leg':
                    dep_time = self.parse_time(elem.attrib['dep_time'])
                    dur_time = self.parse_time(elem.attrib['trav_time'])
                    mode = self.encoding['mode'][elem.attrib['mode']]
                    modes.add(mode)

                    routes.append([  # ROUTES
                        agent,  # agent_id
                        route,  # route_index
                        dep_time,  # dep_time
                        dur_time,  # dur_time
                        mode,  # mode
                        None,  # src_maz
                        None  # term_maz
                    ])
                    route += 1

        # Flush whatever remains after the last full batch.
        pr.print(f'Pushing {bin_count} plans to SQL server.', time=True)
        pr.print('Plan parsing progress:',
                 progress=1,
                 persist=True,
                 frmt='bold')

        self.database.write_plans(plans)
        self.database.write_activities(activities)
        self.database.write_routes(routes)

        pr.print('Completed XML input plan parsing.', time=True)

        root.clear()
        plans = []
        activities = []
        routes = []
Example #56
0
# doesn't work: 'head' and 'title' are unqualified, but in an XHTML document
# they inherit the default namespace, so ElementTree finds no match.
# (The original "works"/"doesn't work" labels were swapped.)
print(doc.findtext('content/{http://www.w3.org/1999/xhtml}html/head/title'))
# works: every step of the path carries the full namespace URI.
print(
    doc.findtext(
        'content/{http://www.w3.org/1999/xhtml}html/'
        '{http://www.w3.org/1999/xhtml}head/{http://www.w3.org/1999/xhtml}title'
    ))
# works


class XMLNamespaces:
    """Map short '{alias}' placeholders in a path to full '{uri}' prefixes."""

    def __init__(self, **kwargs):
        # Each keyword argument becomes a registered alias -> URI pair.
        self.namespaces = {}
        for name, uri in kwargs.items():
            self.register(name, uri)

    def register(self, name, uri):
        # Store the alias in ElementTree's '{uri}' qualified-tag form.
        self.namespaces[name] = '{%s}' % uri

    def __call__(self, path):
        return path.format_map(self.namespaces)


# Resolve the same namespaced lookups through the alias helper: shorter
# paths, identical results to the fully qualified form.
ns = XMLNamespaces(html='http://www.w3.org/1999/xhtml')
print(doc.find(ns('content/{html}html')))
print(doc.findtext(ns('content/{html}html/{html}head/{html}title')))

# iterparse also surfaces namespace scope events ('start-ns'/'end-ns').
for evt, elem in iterparse('st7.xml', ('end', 'start-ns', 'end-ns')):
    print(evt, elem)
Example #57
0
        if roots.find('language') is not None:
            roots.remove(roots.find('language'))
            roots.getchildren().index(roots.find('description'))

        # oops,fail to insert.
        el = Element('spam')
        el.text = "this is a test"
        roots.insert(2, el)

        # ns = XMLNammespaces(html="http://purl.org/dc/elements/1.1/")
        # ht = doc.find(ns('content/{html}/html'))
        # print(ht)
        # title =doc.findtext(ns('content/{html}html/{html}head/{html}title'))
        # print(title)
        print("============read xml by iterparse============")
        for evt, elem in iterparse(filepath, ('end', 'start-ns', 'end-ns')):
            print(evt, elem)

    print("============read xml============")
    doc = parse(u)
    print(doc)

    e = doc.find('channel/link')
    print(e.get('title'))
    print("e.tag:{},e.text:{}".format(e.tag, e.text))

    print("============for loop============")
    i = 0
    for item in doc.iterfind("channel/item"):
        title = item.findtext("title")
        date = item.findtext("pubDate")