Exemple #1
0
 def search(self, terms):
     """
     Search for a set of terms and return the list of matching IDs.

     ``terms`` may be a list (its items are joined with spaces) or any
     other object (converted with ``str``). The query replaces the
     ``[[TERMS]]`` placeholder in ``self.esearch_url``; the text of every
     ``Id`` element in the XML response is read with ``self._get_text``.
     The resulting list is meant to be fed to ``self.fetch`` for data
     retrieval. On any failure the error is logged and ``[]`` is returned.
     """
     import traceback
     import urllib
     from xml.dom import pulldom

     id_list = []

     try:
         if isinstance(terms, list):
             # Fixed: the original wrote ``str[term]`` (indexing the type),
             # which raised TypeError for every list of terms.
             query = ' '.join([str(term) for term in terms])
         else:
             query = str(terms)
         url = self.esearch_url.replace('[[TERMS]]',
                                        urllib.quote_plus(query))
         xmls = urllib.urlopen(url).read()
         events = pulldom.parseString(xmls)
         for event, node in events:
             if event == pulldom.START_ELEMENT and node.tagName == 'Id':
                 # Pull the element's children into the node so that its
                 # text content can be read.
                 events.expandNode(node)
                 id_list.append(self._get_text(node))
     except Exception as e:
         # The message now has a placeholder for the exception, and the
         # traceback of the caught exception (not the current call stack)
         # is what gets logged.
         self.logger.error('Unable to search Pubmed: %s', e)
         self.logger.error(traceback.format_exc())
         return []

     # Fixed: the success path used to fall off the end and return None.
     return id_list
def extract(input_xml):
    """Process entire input XML document, firing on events.

    Pulls parse events for ``input_xml`` one at a time and returns the
    concatenation of the text emitted for each event. How an element is
    rendered depends on which of the collections ``ignore``,
    ``inlineEmpty``, ``inlineContent`` or ``blockElement`` contains its
    local name; elements found in none of them contribute no markup.
    Character data is passed through ``normalizeSpace``.
    """
    # Start pulling; it continues automatically
    doc = pulldom.parseString(input_xml)
    pieces = []  # joined once at the end instead of repeated string +=
    for event, node in doc:
        if event == pulldom.START_ELEMENT:
            name = node.localName
            if name in ignore:
                # elements to ignore: xml
                continue
            if name in inlineEmpty:
                # empty inline elements: pb, milestone
                pieces.append(node.toxml())
            elif name in inlineContent:
                # non-empty inline elements: note, hi, head, l, lg, div, p, ab.
                # The node has no children yet, so toxml() gives an empty
                # tag; regexEmptyTag presumably matches its "/>" ending so
                # the tag is left open (pattern is defined elsewhere).
                pieces.append(regexEmptyTag.sub('>', node.toxml()))
            elif name in blockElement:
                pieces.append('\n<' + name + '>\n')
        elif event == pulldom.END_ELEMENT:
            name = node.localName
            if name in inlineContent:
                pieces.append('</' + name + '>')
            elif name in blockElement:
                pieces.append('\n</' + name + '>')
        elif event == pulldom.COMMENT:
            # copy comments intact. A comment node is already complete when
            # its event is delivered; the earlier doc.expandNode(node) call
            # here would have kept reading subsequent events while looking
            # for an end event that a comment never gets.
            pieces.append(node.toxml())
        elif event == pulldom.CHARACTERS:
            pieces.append(normalizeSpace(node.data))
    return ''.join(pieces)
Exemple #3
0
def openAPIparse(string, parser=None):
    """Parse an XML string, eagerly or as a pull-event stream.

    With ``parser`` left as ``None`` the string is parsed with
    ``xml.dom.expatbuilder`` and the resulting DOM document is returned.
    Otherwise ``parser`` is handed to ``xml.dom.pulldom.parseString`` and
    the returned event stream is given back to the caller.
    """
    if parser is None:
        from xml.dom import expatbuilder
        return expatbuilder.parseString(string)

    from xml.dom import pulldom
    # Fixed: parseString() used to be called without any arguments, so
    # this branch always raised TypeError.
    # NOTE(review): assumes ``parser`` is a SAX parser object, which is
    # what pulldom.parseString expects -- confirm against callers.
    return pulldom.parseString(string, parser)
Exemple #4
0
def products_search_xml_api():
    """Search products using a query supplied in an XML request body.

    The request body is parsed with pulldom, re-serialised, converted with
    ``xmltodict`` and the ``search``/``query`` value is used to filter
    ``Product`` rows by name or description (at most 100 results). Parse
    problems and query problems are both answered with a 400 error
    response.
    """
    # NOTE(review): external general entities are switched ON for a parser
    # that reads request data, which allows XML external entity expansion.
    # Confirm this is intentional before shipping.
    sax_parser = make_parser()
    sax_parser.setFeature(feature_external_ges, True)
    try:
        events = pulldom.parseString(request.data.decode(), parser=sax_parser)
        serialized = ''
        for kind, element in events:
            if kind != pulldom.START_ELEMENT:
                continue
            expanded = events.expandNode(element)
            if expanded:
                serialized += expanded
            serialized += element.toxml()
        data = xmltodict.parse(serialized)
        search_section = data.get('search')
        query = search_section.get('query')
    except (SAXException, ValueError) as e:
        return error_response(400, 'XML parse error - %s' % e)
    except Exception as e:
        return error_response(400, e)

    try:
        name_or_description = (
            (Product.name.contains(query))
            | (Product.description.contains(query)))
        matches = Product.query.filter(name_or_description).limit(100).all()
        payload = [{
            'id': product.id,
            'name': product.name,
            'price': product.price,
            'description': product.description,
            'image': product.image,
            'stock': product.stock
        } for product in matches]
        return jsonify(payload)
    except Exception:
        return error_response(400, 'Malformed Query %s' % query)
Exemple #5
0
def cv_event_handler(event):
    """Store the XML content of an uploaded object in its table record.

    Reads the object named by ``event.key`` from the ``CV_BUCKET`` bucket,
    parses it as XML, and writes the serialised document, base64-encoded,
    into the ``file_content`` field of the ``CV_TBL`` item whose
    ``filename`` equals the key. Nothing is returned; every failure is
    printed rather than raised.
    """
    print("Received event for bucket: {}, key: {}".format(event.bucket, event.key))
    file_name = event.key
    s3 = boto3.resource('s3')
    try:
        # NOTE(review): the reason for this fixed delay is not visible in
        # this block -- confirm it is still needed.
        sleep(2)
        email = get_user_for_event(file_name)
        if not email:
            raise Exception("Unable to find email")

        docfile = s3.Object(CV_BUCKET, file_name)
        docbody = docfile.get()['Body'].read()
        doc = parseString(docbody.decode('utf-8'))
        content = ''
        # The loop variable is no longer called ``event``; it used to
        # shadow the function's parameter.
        for _, node in doc:
            doc.expandNode(node)
            content = node.toxml()
        try:
            print("content", content)
            cv_table = dynamo.Table(CV_TBL)
            response = cv_table.get_item(Key={'filename': file_name})
            item = response['Item']
            item['file_content'] = b64encode(content.encode()).decode('utf-8')
            cv_table.put_item(Item=item)
            app.log.debug(response)
        except Exception as e:
            # Fixed: exceptions have no ``.message`` attribute on Python 3,
            # so ``print(e.message)`` raised AttributeError and hid the
            # real error behind it.
            print(e)

    except Exception as e:
        print(e)
Exemple #6
0
def parseXML(stream, parser=None):
    """Build an ``XMLNode`` tree from an XML string or stream.

    Strings go through ``pulldom.parseString`` and anything else through
    ``pulldom.parse``; ``parser`` is forwarded unchanged. Returns the
    "DOCUMENT" node once the end-of-document event has been seen,
    otherwise the first node still on the stack.
    """
    if isinstance(stream, six.string_types):
        open_events = pulldom.parseString
    else:
        open_events = pulldom.parse

    document = None
    stack = []  # currently open nodes, innermost last
    for kind, dom_node in open_events(stream, parser):
        if kind == "START_DOCUMENT":
            stack.append(XMLNode("DOCUMENT", {}))
        elif kind == "START_ELEMENT":
            element = XMLNode.fromDOMNode(dom_node)
            if stack:
                # attach to the innermost open node
                stack[-1].children.append(element)
            stack.append(element)
        elif kind == "END_ELEMENT":
            stack.pop()
        elif kind == "CHARACTERS":
            stack[-1].data += dom_node.data
        elif kind == "END_DOCUMENT":
            document = stack.pop()
    return document or stack[0]
Exemple #7
0
def parseXML(stream, parser=None):
    """Convert XML given as a string or a stream into an ``XMLNode`` tree.

    ``parser`` is passed straight to pulldom. A stack of open nodes is
    kept while events are consumed; the "DOCUMENT" node popped at the
    end-of-document event is returned, falling back to the bottom of the
    stack when no such node was recorded.
    """
    is_text = isinstance(stream, six.string_types)
    events = (pulldom.parseString(stream, parser) if is_text
              else pulldom.parse(stream, parser))

    document = None
    open_nodes = []
    for kind, dom_node in events:
        if kind == "START_DOCUMENT":
            open_nodes.append(XMLNode("DOCUMENT", {}))
        elif kind == "START_ELEMENT":
            child = XMLNode.fromDOMNode(dom_node)
            if open_nodes:
                open_nodes[-1].children.append(child)
            open_nodes.append(child)
        elif kind == "END_ELEMENT":
            open_nodes.pop()
        elif kind == "CHARACTERS":
            # text accumulates on the innermost open node
            open_nodes[-1].data += dom_node.data
        elif kind == "END_DOCUMENT":
            document = open_nodes.pop()
    return document or open_nodes[0]
 def test_expandItem(self):
     """Check that expandNode() leaves the event stream in a usable state."""
     stream = pulldom.parseString(SMALL_SAMPLE)
     # Advance to the "title" start tag and expand it in place.
     for kind, title in stream:
         if kind != pulldom.START_ELEMENT or title.tagName != 'title':
             continue
         stream.expandNode(title)
         self.assertEqual(1, len(title.childNodes))
         break
     else:
         self.fail('No "title" element detected in SMALL_SAMPLE!')
     # Move on to the next start tag, whatever it is.
     for kind, node in stream:
         if kind == pulldom.START_ELEMENT:
             break
     self.assertEqual('hr', node.tagName,
         'expandNode did not leave DOMEventStream in the correct state.')
     # Expand that element too, then check what follows it.
     stream.expandNode(node)
     kind, _ = next(stream)
     self.assertEqual(kind, pulldom.CHARACTERS)
     kind, node = next(stream)
     self.assertEqual(node.tagName, 'p')
     stream.expandNode(node)
     next(stream)  # one event is skipped without being inspected
     kind, node = next(stream)
     self.assertEqual(node.tagName, 'html')
     with self.assertRaises(StopIteration):
         next(stream)
     # clear() must drop the parser and stream references.
     stream.clear()
     self.assertIsNone(stream.parser)
     self.assertIsNone(stream.stream)
Exemple #9
0
 def test_expandItem(self):
     """Check that expandNode() leaves the event stream in a usable state."""
     stream = pulldom.parseString(SMALL_SAMPLE)
     # Walk the events until the "title" start tag shows up, then expand it:
     for kind, title in stream:
         if kind != pulldom.START_ELEMENT or title.tagName != "title":
             continue
         stream.expandNode(title)
         self.assertEqual(1, len(title.childNodes))
         break
     else:
         self.fail("No \"title\" element detected in SMALL_SAMPLE!")
     # Continue to the following start-element event:
     for kind, node in stream:
         if kind == pulldom.START_ELEMENT:
             break
     self.assertEqual("hr", node.tagName,
         "expandNode did not leave DOMEventStream in the correct state.")
     # Expanding a standalone element must work as well:
     stream.expandNode(node)
     kind, _ = next(stream)
     self.assertEqual(kind, pulldom.CHARACTERS)
     kind, node = next(stream)
     self.assertEqual(node.tagName, "p")
     stream.expandNode(node)
     next(stream)  # Skip character data
     kind, node = next(stream)
     self.assertEqual(node.tagName, "html")
     with self.assertRaises(StopIteration):
         next(stream)
     # clear() must drop the parser and stream references:
     stream.clear()
     self.assertIsNone(stream.parser)
     self.assertIsNone(stream.stream)
Exemple #10
0
def _flatten(line: str) -> str:
    """Clean and flatten one line of stress-tagged input.

    Keyword argument:
    line -- line of poetry to process, as well-formed XML, with <stress> tags

    Text inside <stress> elements is upper-cased, all other text is
    lower-cased, the tags themselves are dropped, and every match of
    ``_PUNC_RE`` is removed from the joined result.

    Raises Exception when ``line`` does not match ``_XML_RE``.
    """
    if not _XML_RE.match(line):
        raise Exception(line, "is not tagged correctly")

    stressed = False  # are we inside a <stress> element?
    pieces = []  # accumulate output string
    for event, node in pulldom.parseString(line):
        if event == pulldom.CHARACTERS:
            text = node.data
            pieces.append(text.upper() if stressed else text.lower())
        elif (event in (pulldom.START_ELEMENT, pulldom.END_ELEMENT)
              and node.localName == 'stress'):
            # entering a <stress> element switches the flag on, leaving off
            stressed = event == pulldom.START_ELEMENT
    return _PUNC_RE.sub("", "".join(pieces))
def xxe_pulldom():
    """Render the pulldom XXE demo page, optionally feeding pulldom a test document.

    When the ``attack`` form field equals "true" (case-insensitive) a fixed
    document declaring two external entities is handed to
    ``pulldom.parseString`` and the page reports that the attack was
    attempted; otherwise the result text is empty.
    """
    attack = request.form['attack']
    test_string = (
        '<!DOCTYPE doc [ '
        '<!ENTITY pulldom SYSTEM "file:///tmp/marker"> '
        '<!ENTITY pulldom2 SYSTEM "http://www.google.com/marker"> '
        ']>\n'
        '<root>\n'
        '<element>&pulldom;</element>\n'
        '<element>&pulldom2;</element>\n'
        '</root>\n'
    )
    attempted = str(attack).lower() == 'true'
    if attempted:
        # NOTE(review): the returned event stream is never iterated here,
        # so only the call itself is exercised -- confirm that is enough
        # for this demo.
        pulldom.parseString(test_string)
    result = 'PullDOM XXE Attack Attempted' if attempted else ''
    return render_template('xxe_pulldom.html', result=result)
Exemple #12
0
def xxe():
    """Render index.html with the serialised ``items`` element of the posted XML.

    The ``xxe`` form field is parsed with pulldom; every ``items`` start
    element is expanded and serialised, the last one winning. When the
    document has no ``items`` element the page is rendered without a
    ``nodes`` value.
    """
    doc = parseString(request.form['xxe'])
    # Start unset: previously a document without an <items> element made
    # the final render call fail with UnboundLocalError.
    nodes = None
    for event, node in doc:
        if event == START_ELEMENT and node.localName == "items":
            doc.expandNode(node)
            nodes = node.toxml()
    if nodes is None:
        return render_template("index.html")
    return render_template("index.html", nodes=nodes)
Exemple #13
0
 def test_expandItem(self):
     """Check that expandNode() leaves the event stream in a usable state."""
     stream = pulldom.parseString(SMALL_SAMPLE)
     # Find the "title" start tag and expand it:
     for kind, title in stream:
         if kind != pulldom.START_ELEMENT or title.tagName != "title":
             continue
         stream.expandNode(title)
         self.assertEqual(1, len(title.childNodes))
         break
     else:
         self.fail("No \"title\" element detected in SMALL_SAMPLE!")
     # Advance to the next start-element event:
     for kind, node in stream:
         if kind == pulldom.START_ELEMENT:
             break
     self.assertEqual("hr", node.tagName,
         "expandNode did not leave DOMEventStream in the correct state.")
     # A standalone element can be expanded too:
     stream.expandNode(node)
     kind, _ = next(stream)
     self.assertEqual(kind, pulldom.CHARACTERS)
     kind, node = next(stream)
     self.assertEqual(node.tagName, "p")
     stream.expandNode(node)
     next(stream)  # Skip character data
     kind, node = next(stream)
     self.assertEqual(node.tagName, "html")
     with self.assertRaises(StopIteration):
         next(stream)
     # After clear() the parser and stream references must be gone:
     stream.clear()
     self.assertIsNone(stream.parser)
     self.assertIsNone(stream.stream)
Exemple #14
0
    def search(self, terms):
        """
        Search for a set of terms and return the list of matching IDs.

        ``terms`` may be a list (its items are joined with spaces) or any
        other object (converted with ``str``). The query replaces the
        ``[[TERMS]]`` placeholder in ``self.esearch_url``; the text of
        every ``Id`` element in the XML response is read with
        ``self._get_text``. The resulting list is meant to be fed to
        ``self.fetch`` for data retrieval. On any failure the error is
        logged and ``[]`` is returned.
        """
        import traceback
        import urllib
        from xml.dom import pulldom

        id_list = []

        try:
            if isinstance(terms, list):
                # Fixed: the original wrote ``str[term]`` (indexing the
                # type), which raised TypeError for every list of terms.
                query = ' '.join([str(term) for term in terms])
            else:
                query = str(terms)
            url = self.esearch_url.replace('[[TERMS]]',
                                           urllib.quote_plus(query))
            xmls = urllib.urlopen(url).read()
            events = pulldom.parseString(xmls)
            for event, node in events:
                if event == pulldom.START_ELEMENT and node.tagName == 'Id':
                    # Pull the element's children into the node so that
                    # its text content can be read.
                    events.expandNode(node)
                    id_list.append(self._get_text(node))
        except Exception as e:
            # The message now has a placeholder for the exception, and the
            # traceback of the caught exception (not the current call
            # stack) is what gets logged.
            self.logger.error('Unable to search Pubmed: %s', e)
            self.logger.error(traceback.format_exc())
            return []

        # Fixed: the success path used to fall off the end and return None.
        return id_list
Exemple #15
0
    def test_parse_semantics(self):
        """Walk SMALL_SAMPLE event by event and check each event in turn."""

        items = pulldom.parseString(SMALL_SAMPLE)

        def expect(kind, tag=None):
            # Pull the next event, check its type and, when a tag is
            # given, the node's tag name; hand the node back.
            evt, node = next(items)
            self.assertEqual(kind, evt)
            if tag is not None:
                self.assertEqual(tag, node.tagName)
            return node

        evt, document = next(items)
        # Just check the node is a Document:
        self.assertTrue(hasattr(document, "createElement"))
        self.assertEqual(pulldom.START_DOCUMENT, evt)

        html = expect(pulldom.START_ELEMENT, "html")
        self.assertEqual(2, len(html.attributes))
        self.assertEqual(
            html.attributes.getNamedItem("xmlns:xdc").value,
            "http://www.xml.com/books")
        expect(pulldom.CHARACTERS)  # Line break
        # XXX - A comment should be reported here!
        # Line break after swallowed comment:
        expect(pulldom.CHARACTERS)

        # Only the tag name is checked for this event.
        evt, title_node = next(items)
        self.assertEqual("title", title_node.tagName)
        text = expect(pulldom.CHARACTERS)
        self.assertEqual("Introduction to XSL", text.data)
        closing = expect(pulldom.END_ELEMENT, "title")
        # The end event must carry the very same node object.
        self.assertTrue(title_node is closing)

        expect(pulldom.CHARACTERS)
        expect(pulldom.START_ELEMENT, "hr")
        expect(pulldom.END_ELEMENT, "hr")
        expect(pulldom.CHARACTERS)
        expect(pulldom.START_ELEMENT, "p")
        expect(pulldom.START_ELEMENT, "xdc:author")
        expect(pulldom.CHARACTERS)
        expect(pulldom.END_ELEMENT, "xdc:author")
        expect(pulldom.END_ELEMENT)
        expect(pulldom.CHARACTERS)
        expect(pulldom.END_ELEMENT)
Exemple #16
0
 def test_comment(self):
     """Fail unless the event stream reports at least one COMMENT event."""
     events = pulldom.parseString(SMALL_SAMPLE)
     # any() stops at the first COMMENT event, as the original break did.
     if not any(kind == pulldom.COMMENT for kind, _ in events):
         self.fail("No comment was encountered")
Exemple #17
0
 def test_comment(self):
     """Fail unless the event stream reports at least one COMMENT event."""
     events = pulldom.parseString(SMALL_SAMPLE)
     saw_comment = False
     for kind, _ in events:
         if kind == pulldom.COMMENT:
             saw_comment = True
             break
     if not saw_comment:
         self.fail("No comment was encountered")
Exemple #18
0
    def test_parse_semantics(self):
        """Walk SMALL_SAMPLE event by event and check each event in turn."""

        items = pulldom.parseString(SMALL_SAMPLE)

        def expect(kind, tag=None):
            # Pull the next event, check its type and, when a tag is
            # given, the node's tag name; hand the node back.
            evt, node = next(items)
            self.assertEqual(kind, evt)
            if tag is not None:
                self.assertEqual(tag, node.tagName)
            return node

        evt, document = next(items)
        # Just check the node is a Document:
        self.assertTrue(hasattr(document, "createElement"))
        self.assertEqual(pulldom.START_DOCUMENT, evt)

        html = expect(pulldom.START_ELEMENT, "html")
        self.assertEqual(2, len(html.attributes))
        self.assertEqual(html.attributes.getNamedItem("xmlns:xdc").value,
              "http://www.xml.com/books")
        expect(pulldom.CHARACTERS)  # Line break
        # XXX - A comment should be reported here!
        # Line break after swallowed comment:
        expect(pulldom.CHARACTERS)

        # Only the tag name is checked for this event.
        evt, title_node = next(items)
        self.assertEqual("title", title_node.tagName)
        text = expect(pulldom.CHARACTERS)
        self.assertEqual("Introduction to XSL", text.data)
        closing = expect(pulldom.END_ELEMENT, "title")
        # The end event must carry the very same node object.
        self.assertTrue(title_node is closing)

        expect(pulldom.CHARACTERS)
        expect(pulldom.START_ELEMENT, "hr")
        expect(pulldom.END_ELEMENT, "hr")
        expect(pulldom.CHARACTERS)
        expect(pulldom.START_ELEMENT, "p")
        expect(pulldom.START_ELEMENT, "xdc:author")
        expect(pulldom.CHARACTERS)
        expect(pulldom.END_ELEMENT, "xdc:author")
        expect(pulldom.END_ELEMENT)
        expect(pulldom.CHARACTERS)
        expect(pulldom.END_ELEMENT)
Exemple #19
0
def fetchQuadrangle(dataset, yearMonth, resolution, sequence):
    """Fetch one sequence of a dataset and return its table values.

    Builds the URL ``serviceURI + dataset/YYYY-MM/resolution/sequence``
    from the ``year`` and ``month`` attributes of ``yearMonth``, downloads
    it (un-gzipping when the response says so) and collects, as floats,
    the text of every non-empty ``td`` inside a ``table`` whose ``typeof``
    attribute is "IndexedTable".

    Returns a dict with the request parameters and the ``data`` list, or
    ``None`` when the request fails with an HTTP error or no values were
    found.
    """
    # Format a URI
    strYearMonth = "{}-{:02d}".format(yearMonth.year, yearMonth.month)
    url = "/".join(
        [serviceURI + dataset, strYearMonth, str(resolution), str(sequence)])
    print(url)

    # Open an HTTP Request
    try:
        response = urllib2.urlopen(url)
    except urllib2.HTTPError:
        return None

    # Unpack the response
    if response.headers.get('content-encoding', '') == 'gzip':
        compressed = StringIO.StringIO(response.read())
        html = gzip.GzipFile(fileobj=compressed).read()
    else:
        html = response.read()

    # Parse the markup
    parser = sax.make_parser()
    parser.setFeature(sax.handler.feature_namespaces, 1)
    doc = pulldom.parseString(html, parser)

    def textContent(parent):
        # Concatenate the data of the direct children that carry any.
        return "".join(n.data for n in parent.childNodes if n.data is not None)

    # Process the markup as a stream and detect the table of data
    values = []
    inTable = False
    for event, node in doc:
        if event == pulldom.START_ELEMENT:
            if node.tagName == 'table':
                if node.getAttribute("typeof") == "IndexedTable":
                    inTable = True
            if inTable and node.tagName == 'td':
                doc.expandNode(node)
                if len(node.childNodes) > 0:
                    values.append(float(textContent(node)))
        elif event == pulldom.END_ELEMENT and node.tagName == 'table':
            inTable = False

    if not values:
        return None

    # Return the sequence number data object
    return {
        "dataset": dataset,
        "yearMonth": strYearMonth,
        "resolution": resolution,
        "sequence": sequence,
        "data": values,
    }
Exemple #20
0
def xxe():
    """Render index.html with the serialised ``items`` element of the posted XML.

    The ``xxe`` form field is parsed with pulldom; every ``items`` start
    element is expanded and serialised, the last one winning. The page is
    rendered without a ``nodes`` value when no such element exists or when
    the XML cannot be parsed.
    """
    doc = parseString(request.form['xxe'])
    # An explicit "not found" marker replaces the earlier reliance on
    # UnboundLocalError to detect a missing <items> element.
    nodes = None
    try:
        for event, node in doc:
            if event == START_ELEMENT and node.localName == "items":
                doc.expandNode(node)
                nodes = node.toxml()
    except xml.sax.SAXParseException:
        # Same class as before, reached through its public name instead of
        # the private ``xml.sax._exceptions`` module.
        return render_template("index.html")
    if nodes is None:
        return render_template("index.html")
    return render_template("index.html", nodes=nodes)
Exemple #21
0
def fetchQuadrangle(dataset, yearMonth, resolution, sequence):
    """Fetch one sequence of a dataset and return its table values.

    Builds the URL ``serviceURI + dataset/yearMonth/resolution/sequence``
    (``yearMonth`` is used as given), downloads it (un-gzipping when the
    response says so) and collects, as floats, the text of every non-empty
    ``td`` inside a ``table`` whose ``typeof`` attribute is "IndexedTable".

    Returns a dict with the request parameters and the ``data`` list, or
    ``None`` when the request fails with an HTTP error or no values were
    found.
    """
    url = "/".join(
        [serviceURI + dataset, yearMonth, str(resolution), str(sequence)])
    print(url)
    try:
        response = urllib2.urlopen(url)
    except urllib2.HTTPError:
        return None

    # Decompress the body when the server sent it gzip-encoded.
    if response.headers.get('content-encoding', '') == 'gzip':
        compressed = StringIO.StringIO(response.read())
        html = gzip.GzipFile(fileobj=compressed).read()
    else:
        html = response.read()

    parser = sax.make_parser()
    parser.setFeature(sax.handler.feature_namespaces, 1)
    doc = pulldom.parseString(html, parser)

    def textContent(parent):
        # Concatenate the data of the direct children that carry any.
        return "".join(n.data for n in parent.childNodes if n.data is not None)

    values = []
    inTable = False
    for event, node in doc:
        if event == pulldom.START_ELEMENT:
            if node.tagName == 'table':
                if node.getAttribute("typeof") == "IndexedTable":
                    inTable = True
            if inTable and node.tagName == 'td':
                # Pull the cell's children in so its text can be read.
                doc.expandNode(node)
                if len(node.childNodes) > 0:
                    values.append(float(textContent(node)))
        elif event == pulldom.END_ELEMENT and node.tagName == 'table':
            inTable = False

    if not values:
        return None

    return {
        "dataset": dataset,
        "yearMonth": yearMonth,
        "resolution": resolution,
        "sequence": sequence,
        "data": values,
    }
Exemple #22
0
def upload():
    """Show the upload page (GET) or echo an uploaded XML file back (POST).

    On POST the uploaded ``file`` is parsed with pulldom, every node the
    stream yields is expanded, and the serialisation of the last one is
    passed to the template as ``data``. Other methods return ``None``.
    """
    if request.method == 'GET':
        return render_template("index.html")
    if request.method != 'POST':
        return None

    uploaded = request.files.get('file')
    events = parseString(uploaded.read())
    content = ''
    for _, node in events:
        events.expandNode(node)
        content = node.toxml()
    return render_template("index.html", data=content)
Exemple #23
0
def XML_validator():
    """Render the validator page for the posted ``customers`` XML.

    The ``customers`` form field is parsed with pulldom. The first
    ``customers`` start element is expanded and its serialisation is
    rendered as ``nodes``. If parsing fails, or no such element exists,
    the page is rendered with a "Validation failed" error instead.
    """
    doc = parseString(request.form['customers'])
    try:
        for event, node in doc:
            if event == START_ELEMENT and node.localName == "customers":
                doc.expandNode(node)
                nodes = node.toxml()
                return render_template("validator/index.html", nodes=nodes)
    except Exception:
        # Narrowed from a bare ``except:`` so that SystemExit and
        # KeyboardInterrupt are no longer swallowed here.
        return render_template("validator/index.html",
                               error="Validation failed")
    return render_template("validator/index.html", error="Validation failed")
Exemple #24
0
def make_parser(stream_or_string):
    """Return a pulldom event stream for a text value or a stream object."""
    if not is_text(stream_or_string):
        return pulldom.parse(stream_or_string)

    # XXX: the pulldom.parseString() function doesn't seem to
    # like operating on unicode strings, hence the str() conversion.
    as_str = str(stream_or_string)
    return pulldom.parseString(as_str)
 def test_parse_semantics(self):
     """Walk SMALL_SAMPLE event by event and check each event in turn."""
     items = pulldom.parseString(SMALL_SAMPLE)

     def expect(kind, tag=None):
         # Pull the next event, check its type and, when a tag is given,
         # the node's tag name; hand the node back.
         evt, node = next(items)
         self.assertEqual(kind, evt)
         if tag is not None:
             self.assertEqual(tag, node.tagName)
         return node

     evt, document = next(items)
     # The first node only has to look like a Document.
     self.assertTrue(hasattr(document, 'createElement'))
     self.assertEqual(pulldom.START_DOCUMENT, evt)

     html = expect(pulldom.START_ELEMENT, 'html')
     self.assertEqual(2, len(html.attributes))
     self.assertEqual(html.attributes.getNamedItem('xmlns:xdc').value,
         'http://www.xml.com/books')
     expect(pulldom.CHARACTERS)
     expect(pulldom.CHARACTERS)

     # Only the tag name is checked for this event.
     evt, title_node = next(items)
     self.assertEqual('title', title_node.tagName)
     text = expect(pulldom.CHARACTERS)
     self.assertEqual('Introduction to XSL', text.data)
     closing = expect(pulldom.END_ELEMENT, 'title')
     # The end event must carry the very same node object.
     self.assertTrue(title_node is closing)

     expect(pulldom.CHARACTERS)
     expect(pulldom.START_ELEMENT, 'hr')
     expect(pulldom.END_ELEMENT, 'hr')
     expect(pulldom.CHARACTERS)
     expect(pulldom.START_ELEMENT, 'p')
     expect(pulldom.START_ELEMENT, 'xdc:author')
     expect(pulldom.CHARACTERS)
     expect(pulldom.END_ELEMENT, 'xdc:author')
     expect(pulldom.END_ELEMENT)
     expect(pulldom.CHARACTERS)
     expect(pulldom.END_ELEMENT)
 def test_end_document(self):
     """Check that END_DOCUMENT is the event delivered right after </html>."""
     stream = pulldom.parseString(SMALL_SAMPLE)
     # Consume everything up to and including the closing html tag.
     for kind, node in stream:
         if kind != pulldom.END_ELEMENT:
             continue
         if node.tagName == 'html':
             break
     following = next(stream, None)
     if following is None:
         self.fail(
             'Ran out of events, but should have received END_DOCUMENT')
     self.assertEqual(pulldom.END_DOCUMENT, following[0])
Exemple #27
0
def get_nodes_from_xml(src):
    """Yield every ``node`` element of an XML document, fully expanded.

    ``src`` is either the XML text itself (a ``str`` or a subclass of it)
    or a file-like object / path accepted by ``pulldom.parse``. Each
    yielded element has its children attached. An exception raised while
    pulling events is printed to stderr and ends the iteration quietly.
    """
    # isinstance() instead of ``type(src)==str`` so that str subclasses
    # are parsed as XML text rather than being treated as a file path.
    if isinstance(src, str):
        events = pulldom.parseString(src)
    else:
        # file like object
        events = pulldom.parse(src)
    try:
        for event, node in events:
            if event == pulldom.START_ELEMENT and node.tagName == "node":
                events.expandNode(node)
                yield node
    except Exception as e:
        print(e, file=sys.stderr)
Exemple #28
0
def make_parser(stream_or_string):
    """Return a pulldom event stream for a string or a stream object."""
    if not isinstance(stream_or_string, six.string_types):
        return pulldom.parse(stream_or_string)

    # XXX: the pulldom.parseString() function doesn't seem to
    # like operating on unicode strings, hence the str() conversion.
    as_str = str(stream_or_string)
    return pulldom.parseString(as_str)
Exemple #29
0
 def test_end_document(self):
     """Check that END_DOCUMENT is the event delivered right after </html>."""
     stream = pulldom.parseString(SMALL_SAMPLE)
     # Consume every event up to and including </html>:
     for kind, node in stream:
         if kind != pulldom.END_ELEMENT:
             continue
         if node.tagName == "html":
             break
     # Whatever comes next has to be END_DOCUMENT:
     following = next(stream, None)
     if following is None:
         self.fail(
             "Ran out of events, but should have received END_DOCUMENT")
     self.assertEqual(pulldom.END_DOCUMENT, following[0])
Exemple #30
0
def xxe_parse(request):
    """Store the text of the posted ``<text>`` element as comment 1 and render the lab page.

    The request body is parsed with a SAX parser that has external general
    entities enabled. The last ``text`` element found is expanded and
    serialised, and whatever sits between its first ``>`` and the next
    ``<`` is written to the ``comments`` row with ``id=1``.
    """
    # NOTE(review): external entities are enabled on request data; the
    # template path suggests this is an intentional XXE lab -- confirm.
    sax_parser = make_parser()
    sax_parser.setFeature(feature_external_ges, True)
    events = parseString(request.body.decode('utf-8'), parser=sax_parser)
    for kind, element in events:
        if kind != START_ELEMENT or element.tagName != 'text':
            continue
        events.expandNode(element)
        markup = element.toxml()
    # Keep only what lies between the opening tag's ">" and the next "<".
    open_end = markup.find('>')
    next_open = markup.find('<', open_end)
    inner = markup[open_end + 1:next_open]
    comments.objects.filter(id=1).update(comment=inner)

    return render(request, 'Lab/XXE/xxe_lab.html')
Exemple #31
0
 def test_end_document(self):
     """Check that END_DOCUMENT is the event delivered right after </html>."""
     stream = pulldom.parseString(SMALL_SAMPLE)
     # Read every event up to and including </html>:
     for kind, node in stream:
         if kind != pulldom.END_ELEMENT:
             continue
         if node.tagName == "html":
             break
     # The event after that has to be END_DOCUMENT:
     following = next(stream, None)
     if following is None:
         self.fail(
             "Ran out of events, but should have received END_DOCUMENT")
     self.assertEqual(pulldom.END_DOCUMENT, following[0])
Exemple #32
0
 def injection(self):
     """Serve the XML page; on POST, echo the first parsed node of the input.

     A POST with an empty ``input_data`` field redirects back to the same
     URL. Otherwise the field is parsed and the serialisation of the first
     node the stream yields is returned. Every other case renders
     ``xml.html``.
     """
     if request.method != 'POST':
         return render_template('xml.html')

     # Post forms have all params defined, possibly empty, which may
     # cause unexpected behaviour -- so check the data is not empty.
     if request.form['input_data'] == '':
         return redirect(request.url)

     # Instantiate an XML parser that allows unsafe external sources to
     # be parsed by parseString.
     sax_parser = make_parser()
     sax_parser.setFeature(feature_external_ges, True)
     events = parseString(request.form['input_data'], parser=sax_parser)
     for _, first_node in events:
         events.expandNode(first_node)
         return (first_node.toxml())
     return render_template('xml.html')
Exemple #33
0
    def __call__(self):
        """Assemble the Cloud XML document and run it through the stylesheets.

        The XML produced by ``self.aggregateServiceDataToXML()`` is stored
        in ``self.readXML``. The contents of its "HeadNode" elements and
        its "Node" elements are regrouped under ``<HeadNode>`` and
        ``<WorkerNodes>`` inside a ``<Cloud>`` root, and the result is
        passed through three stylesheets in turn; the final output is
        returned.
        """
        self.readXML = self.aggregateServiceDataToXML()
        doc = parseString(self.readXML)

        finalXML = StringIO()
        finalXML.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
        finalXML.write(
            "<Cloud xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:noNamespaceSchemaLocation=\""
            + XSD + "\">")

        headNodeXML = StringIO()
        workerNodeXML = StringIO()
        # The pulldom API is used to pull out the XML under any "HeadNode"
        # and "Node" tags so it can be written to finalXML for XSLT processing.
        for event, node in doc:
            if event != xml.dom.pulldom.START_ELEMENT:
                continue
            if node.localName == "HeadNode":
                doc.expandNode(node)
                # The [10:-11] slice strips the enclosing
                # <HeadNode></HeadNode> tags from the output.
                headNodeXML.write(node.toxml()[10:-11])
            if node.localName == "Node":
                doc.expandNode(node)
                workerNodeXML.write(node.toxml())

        # The CloudName tag is the "Optional Cloud Name" of the public XML
        # schema. An 'id' attribute MUST be specified or the XSLs will
        # remove this tag from the final XML; the 'id' value is arbitrary.
        cloud_name_tag = ("<CloudName id='arbitrary11235813'>" +
                          ConfigMapping[CLOUD_NAME] + "</CloudName>")
        for chunk in ("<HeadNode>",
                      cloud_name_tag,
                      headNodeXML.getvalue(),
                      "</HeadNode>",
                      "<WorkerNodes>",
                      workerNodeXML.getvalue(),
                      "</WorkerNodes>",
                      "</Cloud>"):
            finalXML.write(chunk)

        # The stylesheets are applied "serially" to the final XML to
        # prepare it for publishing: innermost call of the original first.
        result = finalXML.getvalue()
        for sheet in (REMOVE_DUP_XSL, MERGE_NODES_XSL, ATTRIBUTE_STRIP_XSL):
            result = self.applyStyleSheet(
                ConfigMapping[NAGIOS_LOCATION] + sheet, result)
        return result
def cv_event_handler(event):
    """Copy an uploaded CV's XML into the ``cv_data`` table, base64-encoded.

    The object named by ``event.key`` is read from the ``training-cv-uploader``
    bucket, parsed with pulldom, serialized back to XML and stored in the
    ``file_content`` attribute of the ``cv_data`` item whose ``filename``
    matches the key.

    Args:
        event: object exposing ``bucket`` and ``key`` attributes.

    Returns:
        None. Every failure is caught and printed; nothing is raised.
    """
    print("Received event for bucket: {}, key: {}".format(
        event.bucket, event.key))
    file_name = event.key
    s3 = boto3.resource('s3')
    try:
        # NOTE(review): presumably gives the uploader time to register the
        # file before the lookup below - confirm the delay is still needed.
        sleep(2)
        email = get_user_for_event(file_name)
        if not email:
            raise Exception("Unable to find email")

        docfile = s3.Object('training-cv-uploader', file_name)
        docbody = docfile.get()['Body'].read()
        doc = parseString(docbody)
        content = ''
        # The loop names are distinct from the ``event`` parameter so the
        # handler's argument is not clobbered while iterating.
        for _token, node in doc:
            # Expanding pulls the rest of the subtree into ``node`` so that
            # toxml() serializes it completely.
            doc.expandNode(node)
            content = node.toxml()
        try:
            cv_table = dynamo.Table('cv_data')
            response = cv_table.get_item(Key={'filename': file_name})
            item = response['Item']
            # b64encode needs bytes; toxml() returns text.
            item['file_content'] = b64encode(content.encode('utf-8'))
            cv_table.put_item(Item=item)
            app.log.debug(response)
        except Exception as e:
            # ``e.message`` does not exist on Python 3 exceptions; printing
            # the exception itself works everywhere.
            print(e)

    except Exception as e:
        print(e)
Exemple #35
0
    def _parse_response(self, content):
        bugs = {}
        stream = pulldom.parseString(content)
        for (event, node) in stream:
            if event == "START_ELEMENT" and node.tagName == "bug":
                stream.expandNode(node)
                error = node.getAttribute("error")
                if error:
                    raise IssueError(error)

                bugs['alias'] = node.getElementsByTagName("bug_id")[0].firstChild.data
                bugs['name'] = node.getElementsByTagName("short_desc")[0].firstChild.data
                bugs['status'] = node.getElementsByTagName("bug_status")[0].firstChild.data
                bugs['resolution'] = node.getElementsByTagName("resolution") or ""
                if bugs['resolution']:
                    bugs['resolution'] = bugs['resolution'][0].firstChild.data

        return bugs
Exemple #36
0
    def __process_event(self, eventdata):
        """
        Private method called while nmap process is running. It enables the
        library to handle specific data/events produced by nmap process.
        So far, the following events are supported:

        1. task progress: updates estimated time to completion and percentage
           done while scan is running. Could be used in combination with a
           callback function which could then handle this data while scan is
           running.
        2. nmap run: header of the scan. Usually displayed when nmap is started
        3. finished: when nmap scan ends.

        :return: True is event is known.

        :todo: handle parsing directly via NmapParser.parse()
        """
        rval = False
        try:
            edomdoc = pulldom.parseString(eventdata)
            for xlmnt, xmlnode in edomdoc:
                if xlmnt is not None and xlmnt == pulldom.START_ELEMENT:
                    if (xmlnode.nodeName == 'taskprogress' and
                            xmlnode.attributes.keys()):
                        percent_done = xmlnode.attributes['percent'].value
                        etc_done = xmlnode.attributes['etc'].value
                        self.__progress = percent_done
                        self.__etc = etc_done
                        rval = True
                    elif (xmlnode.nodeName == 'nmaprun' and
                            xmlnode.attributes.keys()):
                        self.__starttime = xmlnode.attributes['start'].value
                        self.__version = xmlnode.attributes['version'].value
                        rval = True
                    elif (xmlnode.nodeName == 'finished' and
                            xmlnode.attributes.keys()):
                        self.__endtime = xmlnode.attributes['time'].value
                        self.__elapsed = xmlnode.attributes['elapsed'].value
                        self.__summary = xmlnode.attributes['summary'].value
                        rval = True
        except:
            pass
        return rval
Exemple #37
0
    def __process_event(self, eventdata):
        """
        Private method called while nmap process is running. It enables the
        library to handle specific data/events produced by nmap process.
        So far, the following events are supported:

        1. task progress: updates estimated time to completion and percentage
           done while scan is running. Could be used in combination with a
           callback function which could then handle this data while scan is
           running.
        2. nmap run: header of the scan. Usually displayed when nmap is started
        3. finished: when nmap scan ends.

        :return: True is event is known.

        :todo: handle parsing directly via NmapParser.parse()
        """
        rval = False
        try:
            edomdoc = pulldom.parseString(eventdata)
            for xlmnt, xmlnode in edomdoc:
                if xlmnt is not None and xlmnt == pulldom.START_ELEMENT:
                    if (xmlnode.nodeName == 'taskprogress' and
                            xmlnode.attributes.keys()):
                        percent_done = xmlnode.attributes['percent'].value
                        etc_done = xmlnode.attributes['etc'].value
                        self.__progress = percent_done
                        self.__etc = etc_done
                        rval = True
                    elif (xmlnode.nodeName == 'nmaprun' and
                            xmlnode.attributes.keys()):
                        self.__starttime = xmlnode.attributes['start'].value
                        self.__version = xmlnode.attributes['version'].value
                        rval = True
                    elif (xmlnode.nodeName == 'finished' and
                            xmlnode.attributes.keys()):
                        self.__endtime = xmlnode.attributes['time'].value
                        self.__elapsed = xmlnode.attributes['elapsed'].value
                        self.__summary = xmlnode.attributes['summary'].value
                        rval = True
        except:
            pass
        return rval
Exemple #38
0
    def _fetchhead(self):
        """
        Fetches the head information. If there are no variables in the
        <head>, then we also fetch the boolean result.
        """
        self.events = pulldom.parseString(self.__xml)

        for (event, node) in self.events:
            if event == pulldom.START_ELEMENT:
                if node.tagName == 'variable':
                    self.variables.append(node.attributes['name'].value)
                elif node.tagName == 'boolean':
                    self.events.expandNode(node)
                    self._hasResult = (node.firstChild.data == 'true')
                elif node.tagName == 'result':
                    return # We should not arrive here
            elif event == pulldom.END_ELEMENT:
                if node.tagName == 'head' and self.variables:
                    return
                elif node.tagName == 'sparql':
                    return
Exemple #39
0
    def _fetchhead(self):
        """
        Fetches the head information. If there are no variables in the
        <head>, then we also fetch the boolean result.
        """
        self.events = pulldom.parseString(self.__xml)

        for (event, node) in self.events:
            if event == pulldom.START_ELEMENT:
                if node.tagName == 'variable':
                    self.variables.append(node.attributes['name'].value)
                elif node.tagName == 'boolean':
                    self.events.expandNode(node)
                    self._hasResult = (node.firstChild.data == 'true')
                elif node.tagName == 'result':
                    return  # We should not arrive here
            elif event == pulldom.END_ELEMENT:
                if node.tagName == 'head' and self.variables:
                    return
                elif node.tagName == 'sparql':
                    return
Exemple #40
0
def xxecomment(username, request) -> None:
    """
        parse xml unsafely (allowing external entities) and add comment to database

        The text of the last <text> element in the request body is stored in
        the xxe_comments table of pygoat.db together with the username.

        username - username of PyGoat user
        request - a flask request object

        raises ValueError if the XML contains no <text> element
    """

    # SECURITY: external general entities are enabled on purpose (see the
    # docstring); this makes the parser vulnerable to XXE. Do not copy this
    # configuration into code that must be safe.
    parser = make_parser()
    parser.setFeature(feature_external_ges, True)
    doc = parseString(request.data.decode('utf-8'), parser=parser)
    text = None
    for event, node in doc:
        if event == START_ELEMENT and node.tagName == 'text':
            doc.expandNode(node)
            text = node.toxml()
    if text is None:
        # Previously this surfaced as an UnboundLocalError further down.
        raise ValueError("no <text> element found in comment XML")
    # Keep what lies between the opening tag and the next '<'.
    startInd = text.find('>')
    endInd = text.find('<', startInd)
    text = text[startInd + 1:endInd:]
    conn = sqlite3.connect('pygoat.db')
    try:
        c = conn.cursor()
        c.execute('''INSERT INTO xxe_comments VALUES (?,?)''', (username, text))
        conn.commit()
    finally:
        # Release the database handle even when the insert fails.
        conn.close()
    def __call__(self):
        """
        Build the aggregate <Cloud> XML document and return it after XSLT processing.

        The XML produced by self.aggregateServiceDataToXML() (also kept on
        self.readXML) is scanned with pulldom. The children of every
        "HeadNode" element are gathered under a single <HeadNode>, and every
        "Node" element is copied whole under <WorkerNodes>.

        Returns the result of self.applyStyleSheet after the REMOVE_DUP,
        MERGE_NODES and ATTRIBUTE_STRIP stylesheets have been applied, in
        that order.
        """

        self.readXML = self.aggregateServiceDataToXML()
        doc = parseString(self.readXML)

        finalXML = StringIO()
        finalXML.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
        finalXML.write(
            "<Cloud xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:noNamespaceSchemaLocation=\""
            + XSD + "\">")

        headNodeXML = StringIO()
        workerNodeXML = StringIO()
        # Here the pulldom API is used to extract the XML nodes under any "HeadNode" tags and write them to the finalXML for XSLT processing
        for event, node in doc:
            if event != xml.dom.pulldom.START_ELEMENT:
                continue

            if node.localName == "HeadNode":
                doc.expandNode(node)
                # Serialize the children instead of slicing fixed offsets off
                # node.toxml(): the slice only worked for a bare <HeadNode>
                # tag and corrupted the output as soon as the tag carried
                # attributes or a namespace prefix.
                headNodeXML.write(
                    "".join(child.toxml() for child in node.childNodes))
            elif node.localName == "Node":
                doc.expandNode(node)
                workerNodeXML.write(node.toxml())

        finalXML.write("<HeadNode>")
        # This tag is added for the "Optional Cloud Name" of the public XML schema. An 'id' attribute MUST be specified or the XSLs will remove this CloudName tag from the final XML. The 'id' is arbitrary
        finalXML.write("<CloudName id='arbitrary11235813'>"
                       + ConfigMapping[CLOUD_NAME] + "</CloudName>")
        finalXML.write(headNodeXML.getvalue())
        finalXML.write("</HeadNode>")
        finalXML.write("<WorkerNodes>")
        finalXML.write(workerNodeXML.getvalue())
        finalXML.write("</WorkerNodes>")
        finalXML.write("</Cloud>")

        # The various stylesheets are applied "serially" to the final XML to prepare it for publishing
        stylesheet_dir = ConfigMapping[NAGIOS_LOCATION]
        deduplicated = self.applyStyleSheet(
            stylesheet_dir + REMOVE_DUP_XSL, finalXML.getvalue())
        merged = self.applyStyleSheet(
            stylesheet_dir + MERGE_NODES_XSL, deduplicated)
        return self.applyStyleSheet(
            stylesheet_dir + ATTRIBUTE_STRIP_XSL, merged)
Exemple #42
0
def add_product():
    """Create a product from the XML sent with the current request.

    The XML comes from the uploaded ``file`` if the request has files,
    otherwise from the raw request body. The text of each child element of
    every ``<product>`` element is collected by tag name; ``name``, ``owner``
    and ``price`` are required.

    Returns:
        The string ``"<name>;<owner>;<price>"`` for the stored product.

    Raises:
        KeyError: if ``name``, ``owner`` or ``price`` is missing from the XML.
    """
    if not request.files:
        xml = request.data
    else:
        upload = request.files['file']
        xml = upload.read()
    data = parseString(xml)
    prod_details = dict()
    for event, node in data:
        if event == START_ELEMENT and node.tagName == 'product':
            data.expandNode(node)
            for details in node.childNodes:
                # Text and comment nodes have no tagName; only elements count.
                if not getattr(details, 'tagName', None):
                    continue
                prod_details[details.tagName] =\
                "".join(detail.nodeValue for detail in details.childNodes)
    product = Product(
        name=prod_details['name'],
        owner=prod_details['owner'],
        price=prod_details['price']
    )

    with psycopg2.connect(CONNECTION_DATA) as conn:
        with closing(conn.cursor()) as cur:
            # The values come straight from client-supplied XML, so they are
            # passed as query parameters; formatting them into the SQL text
            # allowed SQL injection.
            cur.execute(
                "INSERT INTO product (name, price, owner) VALUES (%s, %s, %s)",
                (
                    product.name[:254],
                    product.price[:254],
                    product.owner[:254],
                )
            )
            conn.commit()

    return "{NAME};{OWNER};{PRICE}".format(
        NAME=product.name, OWNER=product.owner, PRICE=product.price
    )
Exemple #43
0
 def parseString(self, st):
     """Generator: parse the XML string ``st`` and yield what the parse produces.

     Two debug messages are logged, then every item produced by
     ``self._parse_evt_stream`` for the pulldom event stream is yielded.
     Nothing runs until the generator is first iterated.
     """
     self.log.debug("Setting up parser...")
     events = pulldom.parseString(st)
     prefix = 'Sub-' if self.active else ''
     tags_by_parsable = {p: self.__parsables[p].tag() for p in self.__parsables}
     self.log.debug("{}Parsing started with Parsable set {}".format(prefix, tags_by_parsable))
     for item in self._parse_evt_stream(events):
         yield item
Exemple #44
0
 def loadString(self,xml_string):
     """Open a pulldom event stream over ``xml_string`` and pass it to ``self.parse``."""
     self.parse(pulldom.parseString(xml_string))
def convert_xml_string_into_tokens(xml_string):
    """Parse ``xml_string`` and return what ``convert_xml_doc_into_tokens`` makes of it."""
    return convert_xml_doc_into_tokens(parseString(xml_string))
Exemple #46
0
 def __init__(self, stream_or_string):
     self.event_stream = pulldom.parseString(stream_or_string)
Exemple #47
0
from xml.dom.pulldom import CHARACTERS, START_ELEMENT, parseString, END_ELEMENT

# Use djb development version of collatex (https://github.com/djbpitt/collatex, "experimental" branch)
sys.path.append('/Users/djb/collatex/collatex-pythonport/')
from collatex import *

class Stack(list):
    """A list with stack-style helper names; the end of the list is the top."""

    def push(self, item):
        """Put ``item`` on top of the stack."""
        self.append(item)

    def peek(self):
        """Return the top item without removing it (IndexError if empty)."""
        top = self[-1]
        return top

# Initialize input and output
# Read the whole document through a context manager so the file handle is
# closed straight away instead of whenever the garbage collector gets to it.
with open('pizarnik.xml', 'r') as source_file:
    source = source_file.read()
doc = parseString(source)
witnesses = {}

# Only process content inside witnesses
inWitness = False
inLine = False

# Split text into tokens; each token keeps the whitespace in front of it
# (whitespace after the last token is processed separately)
def tokenize(contents):
    """Return the tokens of ``contents``, each with its leading whitespace."""
    return [found.group() for found in re.finditer(r'\s*\S+', contents)]

# Regex
# startWhite: a run of whitespace. Meant to be used with match(), which only
# matches at the start of the string, to strip leading whitespace.
# endWhite: a non-whitespace character followed by whitespace up to the end
# of the string; tests for trailing whitespace to include in the output.
startWhite = re.compile(r'\s+') # strip leading whitespace; match() is automatically anchored at the start
endWhite = re.compile(r'\S\s+$') # test for trailing whitespace to include in output

for event, node in doc:
Exemple #48
0
 def __init__(self, xml):
     self._events = pulldom.parseString(xml)
Exemple #49
0
    def __process_event(self, eventdata):
        """
        Private method called while nmap process is running. It enables the
        library to handle specific data/events produced by nmap process.
        So far, the following events are supported:

        1. task progress: updates estimated time to completion and percentage
           done while scan is running. Could be used in combination with a
           callback function which could then handle this data while scan is
           running.
        2. nmap run: header of the scan. Usually displayed when nmap is started
        3. finished: when nmap scan ends.

        :return: True is event is known.

        :todo: handle parsing directly via NmapParser.parse()
        """
        rval = False
        try:
            edomdoc = pulldom.parseString(eventdata)
            for xlmnt, xmlnode in edomdoc:
                if xlmnt is not None and xlmnt == pulldom.START_ELEMENT:
                    if (xmlnode.nodeName == 'taskbegin' and
                            xmlnode.attributes.keys()):
                        xt = xmlnode.attributes
                        taskname = xt['task'].value
                        starttime = xt['time'].value
                        xinfo = ''
                        if 'extrainfo' in xt.keys():
                            xinfo = xt['extrainfo'].value
                        newtask = NmapTask(taskname, starttime, xinfo)
                        self.__nmap_tasks[newtask.name] = newtask
                        self.__current_task = newtask.name
                        rval = True
                    elif (xmlnode.nodeName == 'taskend' and
                            xmlnode.attributes.keys()):
                        xt = xmlnode.attributes
                        tname = xt['task'].value
                        xinfo = ''
                        self.__nmap_tasks[tname].endtime = xt['time'].value
                        if 'extrainfo' in xt.keys():
                            xinfo = xt['extrainfo'].value
                        self.__nmap_tasks[tname].extrainfo = xinfo
                        self.__nmap_tasks[tname].status = "ended"
                        rval = True
                    elif (xmlnode.nodeName == 'taskprogress' and
                            xmlnode.attributes.keys()):
                        xt = xmlnode.attributes
                        tname = xt['task'].value
                        percent = xt['percent'].value
                        etc = xt['etc'].value
                        remaining = xt['remaining'].value
                        updated = xt['time'].value
                        self.__nmap_tasks[tname].percent = percent
                        self.__nmap_tasks[tname].progress = percent
                        self.__nmap_tasks[tname].etc = etc
                        self.__nmap_tasks[tname].remaining = remaining
                        self.__nmap_tasks[tname].updated = updated
                        rval = True
                    elif (xmlnode.nodeName == 'nmaprun' and
                            xmlnode.attributes.keys()):
                        self.__starttime = xmlnode.attributes['start'].value
                        self.__version = xmlnode.attributes['version'].value
                        rval = True
                    elif (xmlnode.nodeName == 'finished' and
                            xmlnode.attributes.keys()):
                        self.__endtime = xmlnode.attributes['time'].value
                        self.__elapsed = xmlnode.attributes['elapsed'].value
                        self.__summary = xmlnode.attributes['summary'].value
                        rval = True
        except:
            pass
        return rval
Exemple #50
0
 def test_external_ges_default(self):
     """A default pulldom stream must have external general entities switched off."""
     stream = pulldom.parseString(SMALL_SAMPLE)
     enabled = stream.parser.getFeature(feature_external_ges)
     self.assertEqual(enabled, False)
Exemple #51
0
 def test_getitem_deprecation(self):
     """Indexing the event stream must emit the 'use iterator protocol' DeprecationWarning."""
     stream = pulldom.parseString(SMALL_SAMPLE)
     expected_message = r'Use iterator protocol instead'
     with self.assertWarnsRegex(DeprecationWarning, expected_message):
         # This should have returned 'END_ELEMENT'.
         event_kind = stream[-1][0]
         self.assertEqual(event_kind, pulldom.START_DOCUMENT)
Exemple #52
0
def buscarcep(cep):
    """
    Look up the CEP (Brazilian postal code) given as argument using the
    service provided by the site www.buscarcep.com.br.

    Returns a one-element list holding a dictionary with the information
    obtained. The keys are: 'cep', 'uf', 'cidade', 'bairro',
    'tipo_logradouro' and 'logradouro', plus 'resultado' (int) and
    'resultado_txt'. Fields absent from the reply keep their defaults
    ("" and 0).

    To evaluate the outcome, check the 'resultado' and 'resultado_txt'
    keys. For more details see the service's site at www.buscarcep.com.br.
    """

    url = urllib.urlopen("http://www.buscarcep.com.br/?cep=" + cep + "&formato=xml")

    cepinfo = {
        "cep": "",
        "uf": "",
        "cidade": "",
        "bairro": "",
        "tipo_logradouro": "",
        "logradouro": "",
        "resultado": 0,
        "resultado_txt": "",
    }

    if url:
        try:
            texto = url.read()
        finally:
            # Close the response even if reading it fails.
            url.close()

        # Element path in the reply -> key in cepinfo.
        fields = dict(("/webservicecep/retorno/" + key, key) for key in cepinfo)

        events = pulldom.parseString(texto)
        xpath = ""
        chunks = []

        for event, node in events:
            if event == pulldom.START_ELEMENT:
                xpath += "/" + node.nodeName
                chunks = []

            elif event == pulldom.END_ELEMENT:
                key = fields.get(xpath)
                if key is not None and chunks:
                    # The parser may deliver one text value in several
                    # CHARACTERS events; join them so that no fragment is
                    # lost (assigning per event kept only the last one).
                    value = "".join(chunks)
                    cepinfo[key] = int(value) if key == "resultado" else value
                chunks = []
                pos = xpath.rfind("/")
                xpath = xpath[0:pos]

            elif event == pulldom.CHARACTERS:
                if xpath in fields:
                    chunks.append(node.nodeValue)

    else:
        # connection error
        # NOTE(review): urlopen normally raises on failure instead of
        # returning a false value, so this branch is probably never
        # reached - confirm before relying on it.
        cepinfo["resultado"] = 0
        cepinfo["resultado_txt"] = "Erro na conexão"

    return [cepinfo]