def fields(self, flat=False):
    """Fetch the data set's field definitions from ``self.fields_url``.

    With ``flat=True`` the result maps each variable name to a
    "concept: label" string; otherwise variables are grouped into a
    nested dict keyed by concept name.
    """
    resp = requests.get(self.fields_url)
    doc = XML(resp.text)
    if flat:
        return {
            var.attrib['name']: "%s: %s" % (var.attrib['concept'], var.text)
            for var in doc.iter('variable')
        }
    grouped = {}
    for concept_node in doc.iter('concept'):
        grouped[concept_node.attrib['name']] = {
            var.attrib['name']: var.text
            for var in concept_node.iter('variable')
        }
    return grouped
def fields(self, year, flat=False):
    """Fetch the data set's field definitions for *year*.

    Raises CensusException when no definition URL is registered for this
    dataset/year pair. With ``flat=True`` the result maps each variable
    name to a "concept: label" string; otherwise variables are grouped
    into a nested dict keyed by concept name.
    """
    fields_url = DEFINITIONS[self.dataset].get(str(year))
    if not fields_url:
        raise CensusException('%s is not available for %s' % (self.dataset, year))
    doc = XML(requests.get(fields_url).text)
    if flat:
        return {
            var.attrib['name']: "%s: %s" % (var.attrib['concept'], var.text)
            for var in doc.iter('variable')
        }
    grouped = {}
    for concept_node in doc.iter('concept'):
        grouped[concept_node.attrib['name']] = {
            var.attrib['name']: var.text
            for var in concept_node.iter('variable')
        }
    return grouped
def xlsx2text(path):
    """Extract text from an .xlsx workbook.

    Emits each non-empty shared string, then walks sheet1.xml, sheet2.xml,
    ... until a sheet is missing, appending cell formulas (and their cached
    values, when present). Results are joined with newlines.
    """
    archive = zipfile.ZipFile(path)
    pieces = []
    shared = XML(archive.read('xl/sharedStrings.xml'))
    for strings in shared.iter(STIN):
        joined = ''.join(node.text for node in strings.iter(TEXT) if node.text)
        if joined:
            pieces.append(joined)
    sheet_no = 1
    while True:
        member = 'xl/worksheets/sheet%d.xml' % sheet_no
        if member not in archive.namelist():
            break
        sheet = XML(archive.read(member))
        for row in sheet.iter(ROW):
            for cell in row.iter(CELL):
                functs = [n.text for n in cell.iter(FUNCT) if n.text]
                if functs:
                    values = [n.text for n in cell.iter(VALUE) if n.text]
                    pieces.append(''.join(functs))
                    if values:
                        pieces.append(''.join(values))
        sheet_no += 1
    archive.close()
    return '\n'.join(pieces)
def fetch(self):
    """Populate mimetypes, gridsets and meta tile size from the layer XML.

    Raises FailedRequestError when the HTTP status is not 200.
    """
    response, content = self.gwc.http.request(self.href)
    if response.status != 200:
        raise FailedRequestError(content)
    doc = XML(content)
    self.mimetypes = [node.text for node in doc.iter('string')]
    self.gridsets = [node.text for node in doc.iter('gridSetName')]
    self.metaWidth, self.metaHeight = [int(node.text) for node in doc.iter('int')]
def fetch(self):
    """Load this layer's mimetypes, gridsets and meta tile dimensions
    from its GWC description document.

    Raises FailedRequestError when the HTTP status is not 200.
    """
    response, content = self.gwc.http.request(self.href)
    if response.status == 200:
        doc = XML(content)
        mimetypes = []
        for node in doc.iter('string'):
            mimetypes.append(node.text)
        self.mimetypes = mimetypes
        self.gridsets = list(node.text for node in doc.iter('gridSetName'))
        dimensions = [int(node.text) for node in doc.iter('int')]
        self.metaWidth, self.metaHeight = dimensions
    else:
        raise FailedRequestError(content)
def fetch(self):
    """Populate layer attributes from the GWC layer description.

    Reads mimetypes, gridsets, and the meta tile width/height (defaulting
    to 1x1 when the description omits metaWidthHeight). Raises
    FailedRequestError when the HTTP status is not 200.
    """
    response, content = self.gwc.http.request(self.href)
    if response.status == 200:
        xml = XML(content)
        self.mimetypes = [mimetype.text for mimetype in xml.iter('string')]
        self.gridsets = [gridset.text for gridset in xml.iter('gridSetName')]
        wh = xml.iter('metaWidthHeight')
        try:
            # next(wh) works on both Python 2 and 3; the original
            # iterator.next() call is Python-2 only.
            els = next(wh).iter('int')
            self.metaWidth, self.metaHeight = [int(el.text) for el in els]
        except Exception:
            # metaWidthHeight may be absent from the layer description;
            # fall back to a 1x1 meta tile. Narrowed from a bare "except:"
            # which would also trap KeyboardInterrupt/SystemExit.
            self.metaWidth, self.metaHeight = 1, 1
    else:
        raise FailedRequestError(content)
def fetch(self):
    """Populate layer attributes from the GWC layer description.

    Reads mimetypes, gridsets, and the meta tile width/height (defaulting
    to 1x1 when the description omits metaWidthHeight). Raises
    FailedRequestError, including the response status, on non-200 replies.
    """
    response, content = self.gwc.http.request(self.href)
    if response.status == 200:
        xml = XML(content)
        self.mimetypes = [mimetype.text for mimetype in xml.iter('string')]
        self.gridsets = [gridset.text for gridset in xml.iter('gridSetName')]
        wh = xml.iter('metaWidthHeight')
        try:
            # next(wh) works on both Python 2 and 3; the original
            # iterator.next() call is Python-2 only.
            els = next(wh).iter('int')
            self.metaWidth, self.metaHeight = [int(el.text) for el in els]
        except Exception:
            # metaWidthHeight may be absent from the layer description;
            # fall back to a 1x1 meta tile. Narrowed from a bare "except:"
            # which would also trap KeyboardInterrupt/SystemExit.
            self.metaWidth, self.metaHeight = 1, 1
    else:
        raise FailedRequestError(str(response) + content)
def get_session_id(username='[email protected]',
                   password='innovate@123',
                   url='https://login.salesforce.com/services/Soap/u/35.0'):
    """Log in to Salesforce via the SOAP partner API and return the session id.

    Returns the sessionId string on success, '' when the response contains
    no sessionId element, and (historical behaviour kept for backward
    compatibility) the caught exception object on failure -- callers must
    check the return type rather than rely on try/except.

    SECURITY: the default credentials are hard-coded; they are kept only
    for backward compatibility and should be supplied from configuration
    or environment variables instead of shipping in source.
    """
    try:
        headers = {'content-type': 'text/xml', 'soapaction': 'login'}
        # NOTE(review): credentials are substituted without XML escaping;
        # escape them if they can ever contain markup characters.
        body = """<?xml version="1.0" encoding="utf-8" ?>
<env:Envelope xmlns:xsd="http://www.w3.org/2001/XMLSchema"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xmlns:env="http://schemas.xmlsoap.org/soap/envelope/">
  <env:Body>
    <n1:login xmlns:n1="urn:partner.soap.sforce.com">
      <n1:username>%s</n1:username>
      <n1:password>%s</n1:password>
    </n1:login>
  </env:Body>
</env:Envelope>""" % (username, password)
        response = requests.post(url, data=body, headers=headers)
        from xml.etree.ElementTree import XML
        data = XML(response.text)
        session = ''
        for elem in data.iter():
            # Tags are namespaced as '{uri}sessionId'; splitting on '}'
            # isolates the local name. (The original also printed the
            # session id to stdout -- removed: it leaks a credential.)
            if 'sessionId' in elem.tag.split('}'):
                session = elem.text
        return session
    except Exception as exc:
        # Anti-pattern kept for interface compatibility: the exception
        # object is returned instead of being raised.
        return exc
def get_docx_text(path):
    """Extract the words of an MS Word .docx document as one unicode string.

    A .docx file is a zip archive of namespaced XML; the text is pulled out
    of word/document.xml, stripped of punctuation, and re-joined with single
    spaces. (Module core by Etienne,
    http://etienned.github.io/posts/extract-text-from-word-docx-simply/;
    adapted for docx parsing and ReSort by Denis Lozinsky.)
    """
    # Tags used by WordprocessingML: paragraphs and text runs.
    ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    para_tag = ns + 'p'
    text_tag = ns + 't'

    archive = zipfile.ZipFile(path)
    xml_content = archive.read('word/document.xml')
    archive.close()
    tree = XML(xml_content)

    paragraphs = []
    for paragraph in tree.iter(para_tag):
        chunks = [node.text for node in paragraph.iter(text_tag) if node.text]
        if chunks:
            paragraphs.append(''.join(chunks))

    # \w+ drops punctuation, leaving a plain space-separated word list.
    words = re.findall(r'\w+', ' '.join(paragraphs))
    return ' '.join(words)
def __init__(self, host='localhost', port=2812, username=None, password='', https=False):
    """Connect to a Monit daemon's XML status page and load its services.

    Raises Exception on connection failure, read failure, or invalid XML.
    """
    port = int(port or 2812)
    scheme = https and 'https' or 'http'
    self.baseurl = '%s://%s:%i' % (scheme, host, port)
    req = urllib2.Request(self.baseurl + '/_status?format=xml')
    if username:
        # encodestring appends a trailing newline; strip it before use.
        credentials = base64.encodestring('%s:%s' % (username, password))[:-1]
        req.add_header("Authorization", "Basic %s" % credentials)
    try:
        handle = urllib2.urlopen(req)
    except urllib2.URLError as e:
        raise Exception(e.reason)
    try:
        response = handle.read()
    except:
        raise Exception("Error while reading")
    try:
        from xml.etree.ElementTree import XML
        root = XML(response)
    except:
        raise Exception("Error while converting to XML")
    for serv_el in root.iter('service'):
        serv = MonitConn.Service(self, serv_el)
        self[serv.name] = serv
def processes(self):
    """Return the titles of all processes advertised by the WPS server."""
    url = "%s?Request=GetCapabilities&Service=WPS&AcceptVersions=1.0.0" % self.url
    headers, response = self.http.request(url, "GET")
    if headers.status != 200:
        raise Exception("Processes listing failed - %s, %s" % (headers, response))
    # Strip the ows: prefix so tag matching is namespace-agnostic.
    dom = XML(response.replace("ows:", ""))
    return [node.text for node in dom.iter() if "Title" in node.tag]
def processes(self):
    """List process titles from the server's WPS GetCapabilities document."""
    request_url = self.url + '?Request=GetCapabilities&Service=WPS&AcceptVersions=1.0.0'
    headers, response = self.http.request(request_url, 'GET')
    if headers.status != 200:
        raise Exception('Processes listing failed - %s, %s' % (headers, response))
    # Drop the ows: prefix before parsing so every Title tag matches.
    cleaned = response.replace('ows:', '')
    titles = []
    for element in XML(cleaned).iter():
        if 'Title' in element.tag:
            titles.append(element.text)
    return titles
def processes(self):
    """Fetch the WPS capabilities document and return all Title texts."""
    query = '?Request=GetCapabilities&Service=WPS&AcceptVersions=1.0.0'
    headers, response = self.http.request(self.url + query, 'GET')
    if headers.status != 200:
        raise Exception('Processes listing failed - %s, %s' % (headers, response))
    # Remove the ows: prefix so iteration can match on the bare tag name.
    dom = XML(response.replace('ows:', ''))
    return list(p.text for p in dom.iter() if 'Title' in p.tag)
def get_feed_urls(opml):
    """Yield the xmlUrl of every RSS/Atom outline in an OPML document.

    Outlines whose type is not rss/rss1/atom, or which carry no xmlUrl
    attribute, are skipped. Invalid XML raises ParseError from XML().
    """
    LOG.debug("parsing OPML: %s", opml)
    xml = XML(opml)
    # XML() raises on bad input rather than returning None, and iter()
    # returns an always-truthy iterator, so the original
    # "if xml is not None" / "if outlines" guards were dead code.
    for outline in xml.iter('outline'):
        if outline.get('type') in ('rss', 'rss1', 'atom'):
            url = outline.get('xmlUrl')
            if url:
                yield url
def update(self):
    """Refresh Monit daemon and service status from the XML status page."""
    from xml.etree.ElementTree import XML
    status_url = self.baseurl + '/_status?format=xml'
    response = requests.get(status_url, auth=self.auth)
    root = XML(response.text)
    for element in root.iter('service'):
        service = Monit.Service(self, element)
        self[service.name] = service
def docx2text(path):
    """Return the paragraph text of a .docx file, paragraphs separated by
    blank lines."""
    archive = zipfile.ZipFile(path)
    xml_content = archive.read('word/document.xml')
    archive.close()
    tree = XML(xml_content)
    # Join each paragraph's non-empty runs; drop paragraphs with no text.
    pieces = (
        ''.join(node.text for node in para.iter(TEXT) if node.text)
        for para in tree.iter(PARA)
    )
    return '\n\n'.join(p for p in pieces if p)
def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the non-empty
    paragraph texts as a list of unicode strings.
    """
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)
    result = []
    for para in tree.iter(PARA):
        runs = [node.text for node in para.iter(TEXT) if node.text]
        if runs:
            result.append(''.join(runs))
    return result
def _sge_queued_or_running_jobs(user=None, encoding='utf-8'):
    """Get queued or running jobs from SGE queue system."""
    command = ["qstat", "-xml"]
    if user is not None:
        command += ["-u", user]
    try:
        with open(os.devnull, 'w') as devnull:
            output = subprocess.check_output(command, stderr=devnull)
    except (OSError, subprocess.CalledProcessError):
        # OSError: qstat is not installed. CalledProcessError: qstat
        # rejected the call -- e.g. no -xml support on clusters that only
        # provide a qstat proxy over SLURM.
        return []
    tree = XML(output, parser=XMLParser(encoding=encoding))
    return [job.text for job in tree.iter("JB_name")]
def _sge_queued_or_running_jobs(user=None, encoding="utf-8"):
    """Get queued or running jobs from SGE queue system."""
    command = ["qstat", "-xml"] if user is None else ["qstat", "-xml", "-u", user]
    try:
        with open(os.devnull, "w") as sink:
            raw = subprocess.check_output(command, stderr=sink)
        tree = XML(raw, parser=XMLParser(encoding=encoding))
        return [leaf.text for leaf in tree.iter("JB_name")]
    except (OSError, subprocess.CalledProcessError):
        # OSError: the program is not installed. CalledProcessError: the
        # call itself failed -- e.g. the -xml option is unavailable on
        # rocks-roll clusters whose qstat is a proxy over SLURM.
        return []
def update(self):
    """
    Update Monit daemon and services status.

    Re-polls (after a short pause) while any service is mid-transition:
    stopping (pendingaction set) or starting up (monitor == 2).
    """
    from xml.etree.ElementTree import XML
    response = requests.get(self.baseurl + '/_status?format=xml', auth=self.auth)
    root = XML(response.text)
    for serv_el in root.iter('service'):
        serv = Monit.Service(self, serv_el)
        self[serv.name] = serv
        if serv.pendingaction or serv.monitor == 2:
            time.sleep(1)
            return Monit.update(self)
def is_xml_verified(self, sld_xml_str):
    """Validate an SLD document: it must be well-formed XML and every
    OnlineResource link must match settings.VALID_SLD_LINKS.

    Returns True when the document passes, False otherwise; failure
    reasons are recorded via self.add_err_msg.
    """
    if not sld_xml_str:
        return False
    try:
        sldxml = XML(sld_xml_str)
    except ParseError:
        # Fixed Python-2-only "except ParseError, e" syntax; the bound
        # exception was unused anyway.
        self.add_err_msg('Your SLD file contains invalid XML')
        return False
    valid_url = re.compile(settings.VALID_SLD_LINKS)
    for elem in sldxml.iter(tag='{http://www.opengis.net/sld}OnlineResource'):
        if '{http://www.w3.org/1999/xlink}href' in elem.attrib:
            link = elem.attrib['{http://www.w3.org/1999/xlink}href']
            if valid_url.match(link) is None:
                err_msg = "External images in your SLD file are not permitted. Please contact us if you would like your SLD images hosted on %s" % (settings.SITENAME)
                self.add_err_msg(err_msg)
                return False
    # BUG FIX: the original fell off the end and returned None (falsy),
    # so even a valid SLD looked unverified to truthiness checks.
    return True
def pptx2text(path):
    """Concatenate the paragraph text of every slide in a .pptx file.

    Slides are read in order (slide1.xml, slide2.xml, ...) until one is
    missing from the archive; paragraphs are separated by blank lines.
    """
    archive = zipfile.ZipFile(path)
    chunks = []
    slide_no = 1
    while True:
        member = 'ppt/slides/slide%d.xml' % slide_no
        if member not in archive.namelist():
            break
        tree = XML(archive.read(member))
        for para in tree.iter(PARA):
            runs = [node.text for node in para.iter(TEXT) if node.text]
            if runs:
                chunks.append(''.join(runs))
        slide_no += 1
    archive.close()
    return '\n\n'.join(chunks)
def get_docx_text_in_xml(path):
    """Take the path of a docx file, return the document re-packed as a
    flat XML tree of styled blocks, serialized via ET.tostring.
    See the structure in readme.md.
    """
    # NOTE(review): 'block' is module-global state shared across calls; if
    # the very first paragraph takes the p_num >= 5 branch, 'block' may be
    # unset or stale from a previous call -- confirm against callers.
    global block
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    root = ET.Element(DOC_TAG)
    cur_tag = root  # NOTE(review): assigned but never used afterwards
    for paragraph in tree.iter(PARA):
        # Determine the paragraph's style priority; stays -1 when the
        # paragraph carries no style element.
        p_num = -1
        for style_tag in paragraph.iter(STYLE):
            style = list(style_tag.attrib.values())[0]
            # NOTE(review): .get(style) returns None for styles missing
            # from STYLES_PRIORITIZED, which makes the "p_num < 5"
            # comparison below raise on Python 3. 'style' also remains
            # unbound when a paragraph has no STYLE child yet has text.
            p_num = STYLES_PRIORITIZED.get(style)  # priority number
        # Collect the paragraph's non-empty text runs.
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if not texts:
            continue
        # Three cases relative to the previous paragraph's style: a more
        # important style (p_num < 5) starts a new top-level block; an
        # equal or less important one is nested under the current block.
        # NOTE(review): only the first text run (texts[0]) of each
        # paragraph is kept.
        if p_num < 5:
            block = ET.SubElement(root, BLOCK_TAG)
            block.set('style', style)
            block.text = str(texts[0])
        else:
            text_block = ET.SubElement(block, BLOCK_TAG)
            text_block.set('style', style)
            text_block.text = str(texts[0])
    # ET.dump(root)  # for debug only
    return ET.tostring(root)
def is_xml_verified(self, sld_xml_str):
    """Check an SLD document is well-formed XML whose OnlineResource hrefs
    all match settings.VALID_SLD_LINKS.

    Returns True on success and False on failure; failure reasons are
    recorded via self.add_err_msg.
    """
    if not sld_xml_str:
        return False
    try:
        sldxml = XML(sld_xml_str)
    except ParseError:
        # Fixed Python-2-only "except ParseError, e" syntax; the bound
        # exception was unused anyway.
        self.add_err_msg('Your SLD file contains invalid XML')
        return False
    valid_url = re.compile(settings.VALID_SLD_LINKS)
    for elem in sldxml.iter(
            tag='{http://www.opengis.net/sld}OnlineResource'):
        if '{http://www.w3.org/1999/xlink}href' in elem.attrib:
            link = elem.attrib['{http://www.w3.org/1999/xlink}href']
            if valid_url.match(link) is None:
                err_msg = "External images in your SLD file are not permitted. Please contact us if you would like your SLD images hosted on %s" % (
                    settings.SITENAME)
                self.add_err_msg(err_msg)
                return False
    # BUG FIX: the original returned None (falsy) for valid documents,
    # making every valid SLD look unverified to truthiness checks.
    return True
def read_docx(self):
    """Extract text from the headers and body of the .docx at self.path
    and hand it to self.extract.

    Collects the non-empty paragraph texts of word/header1-3.xml (when
    present) and word/document.xml, joined with newlines.
    """
    log.info("Reading: {}".format(self.initial_path))
    namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    para_tag = namespace + 'p'
    text_tag = namespace + 't'
    document = zipfile.ZipFile(self.path)
    paragraphs = []
    for segment in ("word/header1.xml", "word/header2.xml",
                    "word/header3.xml", "word/document.xml"):
        # namelist() is the public membership API; the original poked at
        # the internal ZipFile.NameToInfo mapping.
        if segment in document.namelist():
            tree = XML(document.read(segment))
            for paragraph in tree.iter(para_tag):
                texts = [n.text for n in paragraph.iter(text_tag) if n.text]
                if texts:
                    paragraphs.append(''.join(texts))
    document.close()
    # Renamed from the original's 'text', which shadowed the tag variable.
    full_text = '\n'.join(paragraphs)
    return self.extract(full_text)
def fields(self, dataset, year):
    """
    Returns the data set's available field names in a dictionary of
    the form:

        {
            "id1": "concept1: label1",
            "id2": "concept1: label2",
            "id3": "concept1: label3",
        }

    Raises CensusException when no definition URL exists for the
    dataset/year pair. Variables without a concept are omitted.
    """
    fields_url = DEFINITIONS[dataset].get(str(year))
    if not fields_url:
        raise CensusException('%s is not available for %s' % (dataset, year))
    doc = XML(requests.get(fields_url).text)
    xml_id = '{http://www.w3.org/XML/1998/namespace}id'
    var_tag = '{http://thedataweb.rm.census.gov/api/discovery/}var'
    result = {}
    for var in doc.iter(var_tag):
        concept = var.attrib.get('concept')
        if concept:
            result[var.attrib[xml_id]] = "{0}: {1}".format(concept, var.attrib['label'])
    return result
def update(self):
    """
    Update Monit daemon and services status.

    Parses the platform element into self['platform'], then each service.
    Re-polls (after a short pause) while any service is mid-transition:
    stopping (pendingaction set) or starting up (monitorState == 2).
    """
    from xml.etree.ElementTree import XML
    response = self.s.get(self.baseurl + '/_status?format=xml')
    response.raise_for_status()
    root = XML(response.text)
    # parse platform info
    self['platform'] = Monit.Platform(root.find('platform'))
    # parse services inside response
    for serv_el in root.iter('service'):
        serv = Monit.Service(self, serv_el)
        self[serv.name] = serv
        if serv.pendingaction or serv.monitorState == 2:
            time.sleep(1)
            return Monit.update(self)
logger.info('>>> Step 7. Making sure [%s] has a valid projection' % name) check_projection(name, gs_resource) # Step 7. Create the style and assign it to the created resource # FIXME: Put this in gsconfig.py logger.info('>>> Step 6. Creating style for [%s]' % name) publishing = cat.get_layer(name) if 'sld' in files: f = open(files['sld'], 'r') sld = f.read() f.close() try: sldxml = XML(sld) valid_url = re.compile(settings.VALID_SLD_LINKS) for elem in sldxml.iter(tag='{http://www.opengis.net/sld}OnlineResource'): if '{http://www.w3.org/1999/xlink}href' in elem.attrib: link = elem.attrib['{http://www.w3.org/1999/xlink}href'] if valid_url.match(link) is None: raise Exception(_("External images in your SLD file are not permitted. Please contact us if you would like your SLD images hosted on %s") % (settings.SITENAME)) except ParseError, e: msg =_('Your SLD file contains invalid XML') logger.warn("%s - %s" % (msg, str(e))) e.args = (msg,) try: stylename = name + "_".join([choice('qwertyuiopasdfghjklzxcvbnm0123456789') for i in range(4)]) cat.create_style(stylename, sld) #FIXME: Should we use the fully qualified typename? if (overwrite): alternate_styles = publishing._get_alternate_styles()
""" # Unzip and read an XML file usefully - actually .docx import zipfile from xml.etree.ElementTree import XML WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}' PARA = WORD_NAMESPACE + 'p' TEXT = WORD_NAMESPACE + 't' DOCX_IN = 'Frankenstein_25.docx' TXT_OUT = 'Frankenstein_25_converted_from_docx.txt' with zipfile.ZipFile(DOCX_IN) as f_zip: xml_content = f_zip.read('word/document.xml') tree = XML(xml_content) paragraphs = [] for paragraph in tree.iter(PARA): texts = [node.text for node in paragraph.iter(TEXT) if node.text] if texts: paragraphs.append(''.join(texts)) with open(TXT_OUT, 'w', encoding='utf-8', newline='\n') as f_out: f_out.write('\n\n'.join(paragraphs))
def __init__(self, text):
    """Parse the given XML text and initialise this object's attributes
    from the document's first element via self._parse_element.
    """
    tree = XML(text)
    # next() works on both Python 2 and 3; the original iterator.next()
    # call is Python-2 only. iter() yields the root element first, so rec
    # is an Element here (the guard is kept for behavioural parity).
    rec = next(tree.iter())
    if (rec is not None):
        self._uuid = None
        self.__dict__ = self._parse_element(rec)
logger.info('>>> Step 7. Making sure [%s] has a valid projection' % name) check_projection(name, gs_resource) # Step 7. Create the style and assign it to the created resource # FIXME: Put this in gsconfig.py logger.info('>>> Step 6. Creating style for [%s]' % name) publishing = cat.get_layer(name) if 'sld' in files: f = open(files['sld'], 'r') sld = f.read() f.close() try: sldxml = XML(sld) valid_url = re.compile(settings.VALID_SLD_LINKS) for elem in sldxml.iter(tag='{http://www.opengis.net/sld}OnlineResource'): if '{http://www.w3.org/1999/xlink}href' in elem.attrib: link = elem.attrib['{http://www.w3.org/1999/xlink}href'] if valid_url.match(link) is None: raise Exception(_("External images in your SLD file are not permitted. Please contact us if you would like your SLD images hosted on %s") % (settings.SITENAME)) except ParseError, e: msg =_('Your SLD file contains invalid XML') logger.warn("%s - %s" % (msg, str(e))) e.args = (msg,) else: sld = get_sld_for(publishing) if sld is not None: try: cat.create_style(name, sld)
def readfeedurl(feedurl, date=None):
    """
    Read feed url and concat feed items titles.

    Returns the newline-joined titles of every <item> whose pubDate falls
    on *date* (default: today, compared by ISO date string). Returns ''
    when the feed cannot be fetched or its XML cannot be parsed; items
    whose dates cannot be parsed are skipped.
    """
    date = date or datetime.date.today()
    # Get raw feed string from feed url
    try:
        r = requests.get(feedurl)
    except Exception:
        logger.error('Error reading feed url: %s' % feedurl)
        return ''
    # TODO: Check encoding... chardet only distinguishes utf-8 from the
    # rest; everything else is treated as latin-1.
    encoding = chardet.detect(r.content)['encoding']
    r.encoding = 'utf-8' if encoding == 'utf-8' else 'latin-1'
    # Parse raw feed string to xml
    try:
        tree = XML(r.text.strip())
    except ParseError:
        logger.error('Error reading feed: %s' % feedurl)
        return ''
    feedtext = ''
    # Read rss items
    for node in tree.iter('item'):
        node_date = node.find('pubDate').text
        # Translate localized day/month names so dateutil's parse() can
        # understand the pubDate string.
        pieces = node_date.split(" ")
        pieces = [DAYS_MAP.get(piece, piece) for piece in pieces]
        pieces = [MONTHS_MAP.get(piece, piece) for piece in pieces]
        node_date = " ".join(pieces)
        try:
            parsed_date = parse(node_date)
        except Exception:
            # Narrowed from a bare "except:" (which would also swallow
            # KeyboardInterrupt); unparseable dates are reported and skipped.
            print(node_date)
            continue
        # Only keep items published on the requested day.
        if str(parsed_date.date()) != str(date):
            continue
        # Get title text from the item node
        titletext = node.find('title').text.strip()
        feedtext += titletext + '\n'
    return feedtext