def extract_line(movie_selector: ElementTree) -> dict:
    """Extract one torrent entry from a kinozal.tv results table row.

    :param movie_selector: the <tr> element (lxml-style, supports .xpath)
    :return: dict with title, details/torrent links, seeds, size, id and —
             when the title carries season info — season/episode fields,
             or {} for rows without data cells.
    """
    # Rows lacking an 's'-class cell are header/filler rows — skip them.
    has_data_cells = movie_selector.xpath('.//td[@class="s"]/text()')
    if has_data_cells:
        movie = {}
        title_cell_xpath = './/td[@class="nam"]/a'
        movie['title'] = movie_selector.xpath(title_cell_xpath + '/text()')[0]
        movie['details_link'] = movie_selector.xpath(title_cell_xpath + '/@href')[0]
        movie['seeds_num'] = movie_selector.xpath('.//td[@class="sl_s"]/text()')[0]
        # Second 's' cell is the human-readable size; size_processor normalizes it.
        movie['size'] = size_processor(movie_selector.xpath('.//td[@class="s"]/text()')[1])
        details_link_parsed = urllib.parse.urlparse(movie['details_link'])
        # The kinozal id comes from the ?id= query parameter of the details link.
        movie['id'] = KINOZAL_ID_PREFIX + parse_qs(details_link_parsed.query)['id'][0]
        # Titles contain Russian season/episode ranges like "1-3 сезоны: 5-10 серии из 12".
        title_match = re.search(r'((\d+)-)?(\d+) сезон(ы)?: ((\d+)-)?(\d+) +сери(и|я) из (\d+)', movie['title'])
        # assert title_match, f'Cannot parse title {movie["title"]}'
        try:
            if title_match:
                first_season = title_match.group(2)
                movie['last_season'] = int(title_match.group(3))
                movie['last_episode'] = int(title_match.group(7))
                if not first_season:
                    # Single-season form "N сезон" — the range collapses to one season.
                    first_season = movie['last_season']
                movie['seasons'] = list(range(int(first_season), movie['last_season'] + 1))
            else:
                raise ValueError('Season data signature was not found')
        except ValueError as e:
            # just suppress the exception and do not return fields that we could not convert
            logging.error(
                f'Cannot extract season data from title: "{movie["title"]}": \n{e}'
            )
        # The download URL reuses the details page's query string (same id).
        download_link_parsed = urllib.parse.urlparse('http://dl.kinozal.tv/download.php')._replace(
            query=details_link_parsed.query
        )
        movie['torrent_link'] = urllib.parse.urlunparse(download_link_parsed)
        return movie
    else:
        return {}
def scrape(self, chamber, session):
    """Fetch the bill index for *session*/*chamber* and save every parsed bill."""
    # Terms span two years; find the start year of the term containing this session.
    for term in self.metadata["terms"]:
        if session in term["sessions"]:
            year = term["start_year"]
            break

    self.versions_dict = self._versions_dict(year)

    base_bill_url = "http://data.opi.mt.gov/bills/%d/BillHtml/" % year
    index_page = ElementTree(lxml.html.fromstring(self.urlopen(base_bill_url)))

    bill_urls = []
    for bill_anchor in index_page.findall("//a"):
        label = bill_anchor.text
        # See 2009 HB 645
        if label.find("govlineveto") != -1:
            continue
        # House bills start with H, Senate bills start with S
        if chamber == "lower" and label.startswith("H"):
            bill_urls.append("%s%s" % (base_bill_url, label))
        elif chamber == "upper" and label.startswith("S"):
            bill_urls.append("%s%s" % (base_bill_url, label))

    for bill_url in bill_urls:
        parsed = self.parse_bill(bill_url, session, chamber)
        if parsed:
            self.save_bill(parsed)
def loadProject_0_1(projectFile):
    """Load a Netzob project saved in the 0.1 XML format.

    Parses the file, registers the Netzob namespaces, then loads the optional
    configuration, vocabulary and grammar sections into a new Project.

    :param projectFile: path of the project XML file
    :return: the populated Project
    """
    # Parse the XML Document as 0.1 version
    tree = ElementTree()
    tree.parse(projectFile)
    xmlProject = tree.getroot()

    # Register the namespace
    etree.register_namespace('netzob', PROJECT_NAMESPACE)
    etree.register_namespace('netzob-common', COMMON_NAMESPACE)

    projectID = xmlProject.get('id')
    projectName = xmlProject.get('name', 'none')
    projectCreationDate = TypeConvertor.xsdDatetime2PythonDatetime(xmlProject.get('creation_date'))
    projectPath = xmlProject.get('path')
    project = Project(projectID, projectName, projectCreationDate, projectPath)

    # Parse the configuration
    # (use `is not None`: lxml/ElementTree elements are falsy when childless)
    if xmlProject.find("{" + PROJECT_NAMESPACE + "}configuration") is not None:
        projectConfiguration = ProjectConfiguration.loadProjectConfiguration(xmlProject.find("{" + PROJECT_NAMESPACE + "}configuration"), PROJECT_NAMESPACE, "0.1")
        project.setConfiguration(projectConfiguration)

    # Parse the vocabulary
    # BUG FIX: the grammar loader below references projectVocabulary even when
    # no vocabulary section exists; initialize it so that case cannot NameError.
    projectVocabulary = None
    if xmlProject.find("{" + PROJECT_NAMESPACE + "}vocabulary") is not None:
        projectVocabulary = Vocabulary.loadVocabulary(xmlProject.find("{" + PROJECT_NAMESPACE + "}vocabulary"), PROJECT_NAMESPACE, COMMON_NAMESPACE, "0.1", project)
        project.setVocabulary(projectVocabulary)

    # Parse the grammar
    if xmlProject.find("{" + PROJECT_NAMESPACE + "}grammar") is not None:
        projectGrammar = Grammar.loadGrammar(xmlProject.find("{" + PROJECT_NAMESPACE + "}grammar"), projectVocabulary, PROJECT_NAMESPACE, "0.1")
        if projectGrammar is not None:
            project.setGrammar(projectGrammar)

    return project
def generate_data_type_conf(supported_file_formats, data_types_destination):
    """Generate Galaxy's datatypes_conf.xml for the supported file formats.

    :param supported_file_formats: mapping of format name -> data type object
        (with galaxy_type, galaxy_extension and mimetype attributes)
    :param data_types_destination: path of the datatypes_conf.xml to write
    """
    data_types_node = Element("datatypes")
    registration_node = add_child_node(data_types_node, "registration")
    registration_node.attrib[
        "converters_path"] = "lib/galaxy/datatypes/converters"
    registration_node.attrib["display_path"] = "display_applications"

    for format_name in supported_file_formats:
        data_type = supported_file_formats[format_name]
        # add only if it's a data type that does not exist in Galaxy
        if data_type.galaxy_type is not None:
            data_type_node = add_child_node(registration_node, "datatype")
            # we know galaxy_extension is not None
            data_type_node.attrib["extension"] = data_type.galaxy_extension
            data_type_node.attrib["type"] = data_type.galaxy_type
            if data_type.mimetype is not None:
                data_type_node.attrib["mimetype"] = data_type.mimetype

    data_types_tree = ElementTree(data_types_node)
    # BUG FIX: close the destination file deterministically instead of
    # leaking the handle returned by open().
    with open(data_types_destination, 'w') as out:
        data_types_tree.write(out,
                              encoding="UTF-8",
                              xml_declaration=True,
                              pretty_print=True)
    logger.info(
        "Generated Galaxy datatypes_conf.xml in %s" % data_types_destination, 0)
def _append_element(self, field_name, data, node=self._root.tag): if(node==self._root.tag): self._root.append(ElementTree.Element(field,attrib={text:data})) else: for field in root.iter(node): field.append(ElementTree.Element(field,attrib={text:data})) self._tree.write('xml_renderer.xml')
def getNameOfProject(workspace, projectDirectory): projectFile = os.path.join(os.path.join(workspace.getPath(), projectDirectory), Project.CONFIGURATION_FILENAME) # verify we can open and read the file if projectFile == None: return None # is the projectFile is a file if not os.path.isfile(projectFile): logging.warn("The specified project's configuration file (" + str(projectFile) + ") is not valid : its not a file.") return None # is it readable if not os.access(projectFile, os.R_OK): logging.warn("The specified project's configuration file (" + str(projectFile) + ") is not readable.") return None # We validate the file given the schemas for xmlSchemaFile in Project.PROJECT_SCHEMAS.keys(): xmlSchemaPath = os.path.join(ResourcesConfiguration.getStaticResources(), xmlSchemaFile) # If we find a version which validates the XML, we parse with the associated function if Project.isSchemaValidateXML(xmlSchemaPath, projectFile): logging.debug("The file " + str(projectFile) + " validates the project configuration file.") tree = ElementTree() tree.parse(projectFile) xmlProject = tree.getroot() # Register the namespace etree.register_namespace('netzob', PROJECT_NAMESPACE) etree.register_namespace('netzob-common', COMMON_NAMESPACE) projectName = xmlProject.get('name', 'none') if projectName != None and projectName != 'none': return projectName else: logging.warn("The project declared in file (" + projectFile + ") is not valid") return None
def parse_bill(self, bill_url, session, chamber):
    """Parse one bill index page into a Bill object.

    Follows the 'status of' / 'Detailed Information (status)' anchor for the
    status page and the 'This bill in WP' anchor for version documents.
    When no status anchor exists, falls back to a LAWS search URL built from
    the bill type/number encoded in the page name.
    """
    bill = None
    bill_page = ElementTree(lxml.html.fromstring(self.urlopen(bill_url)))

    for anchor in bill_page.findall("//a"):
        if anchor.text_content().startswith("status of") or anchor.text_content().startswith(
            "Detailed Information (status)"
        ):
            status_url = anchor.attrib["href"].replace("\r", "").replace("\n", "")
            bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)
        elif anchor.text_content().startswith("This bill in WP"):
            index_url = anchor.attrib["href"]
            index_url = index_url[0 : index_url.rindex("/")]
            # this looks weird. See http://data.opi.mt.gov/bills/BillHtml/SB0002.htm for why
            index_url = index_url[index_url.rindex("http://") :]
            self.add_bill_versions(bill, index_url)

    if bill is None:
        # No bill was found. Maybe something like HB0790 in the 2005 session?
        # We can search for the bill metadata.
        page_name = bill_url.split("/")[-1].split(".")[0]
        bill_type = page_name[0:2]
        bill_number = page_name[2:]
        # BUG FIX: `metadata` was referenced as a bare (undefined) name;
        # it is an attribute of the scraper, cf. the sibling parse_bill variant.
        laws_year = self.metadata["session_details"][session]["years"][0] % 100

        status_url = self.search_url_template % (laws_year, bill_type, bill_number)
        bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)

    return bill
def main():
    """CLI proof of concept: query the Alexa Web Information Service (AWIS)
    UrlInfo action for each given site and print rank information.

    NOTE: Python 2 code (print statements).
    """
    argparser = configargparse.ArgumentParser(
        description="AWIS API Proof of Concept")
    argparser.add_argument('--key-id', required=True)
    argparser.add_argument('--secret-key', required=True)
    argparser.add_argument('--sites', required=True, nargs='+')
    args = argparser.parse_args()

    client = AwisApi(args.key_id, args.secret_key)
    tree = client.url_info(args.sites, "Rank", "LinksInCount", "Speed")
    print etree_tostring(tree)
    print "client ns_prefixes: ", client.NS_PREFIXES

    alexa_prefix = client.NS_PREFIXES['alexa']
    awis_prefix = client.NS_PREFIXES['awis']

    # The request must report success before the result elements are meaningful.
    elem = tree.find('//{%s}StatusCode' % alexa_prefix)
    assert elem.text == 'Success'

    for elem_result in tree.findall('//{%s}UrlInfoResult' % awis_prefix):
        # print etree_tostring(elem_result)
        print "elem_result tag: %s, text: %s" % (elem_result.tag, elem_result.text)
        # Wrap each result element so relative searches stay within it.
        tree_result = ElementTree(elem_result)
        elem_url = tree_result.find('//{%s}DataUrl' % awis_prefix)
        if elem_url is not None:
            print "elem_url tag: %s, text: %s" % (elem_url.tag, elem_url.text)
        elem_metric = tree_result.find('//{%s}Rank' % awis_prefix)
        if elem_metric is not None:
            print "elem_metric tag: %s, text: %s " % (elem_metric.tag, elem_metric.text)
def retrieveMessagesFromFiles(self):
    """Load every selected XML trace file, validate it against the common
    0.1 XSD, and populate self.messages plus the tree view with the parsed
    messages.
    """
    # We read each file and create one message for each file
    self.messages = []
    self.lineView.get_model().clear()
    for file in self.filesToBeImported:
        from netzob.Common.ResourcesConfiguration import ResourcesConfiguration
        xmlSchemaPath = os.path.join(ResourcesConfiguration.getStaticResources(), "xsds/0.1/common.xsd")
        # If we find a version which validates the XML, we parse with the associated function
        if not Workspace.isSchemaValidateXML(xmlSchemaPath, file):
            logging.error(_("The specified XML file {0} is not valid according to the XSD ({1}).").format(str(file), str(xmlSchemaPath)))
        else:
            logging.debug(_("XML file valid according to the XSD schema"))

            # Parse the XML Document as 0.1 version
            tree = ElementTree()
            tree.parse(file)
            xmlFile = tree.getroot()

            # Each <message> child becomes one AbstractMessage instance.
            for xmlMessage in xmlFile.findall("{" + Project.COMMON_NAMESPACE + "}message"):
                message = AbstractMessageFactory.loadFromXML(xmlMessage, Project.COMMON_NAMESPACE, "0.1")
                logging.debug(_("XML String data: {0}").format(message.getStringData()))
                self.messages.append(message)
                self.lineView.get_model().append(None, [str(message.getID()), message.getType(), message.getStringData()])
def write_xml(filepath: str, tree: etree.ElementTree) -> None:
    """Serialize *tree* to *filepath* as pretty-printed UTF-8 XML with a declaration."""
    with open(filepath, 'wb') as out:
        tree.write(out, encoding='UTF-8', xml_declaration=True, pretty_print=True)
def draw_frame_by_xml(src_pic, annot_file):
    """Copy *src_pic* to the target folder and draw every bounding box
    found in *annot_file* onto the copy."""
    # Guard clauses: bail out early when any required path is missing.
    if not os.path.exists(raw_image_dir + src_pic):
        print('No source image file')
        return
    if not os.path.exists(xml_file_dir + annot_file):
        print('No annotation file')
        return
    if not os.path.exists(tgt_image_dir):
        print('No target image folder')
        return

    tree = ElementTree()
    shutil.copyfile(raw_image_dir + src_pic, tgt_image_dir + src_pic)
    tree.parse(xml_file_dir + annot_file)
    root = tree.getroot()

    # Consume <object> nodes one at a time, drawing then removing each.
    while True:
        obj = root.find('object')
        if obj is None:
            break
        bb = obj.find('bndbox')
        coords = [bb.find(tag).text for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
        draw_rectangle(coords[0], coords[1], coords[2], coords[3], tgt_image_dir + src_pic)
        root.remove(obj)
def flip(self, dst):
    """Mirror every bounding box horizontally and write the tree to *dst*."""
    size = self.root.find('size')
    W = int(size.find('width').text)
    for obj in self.root.iter('object'):
        print(obj.find('name').text)
        box = obj.find('bndbox')
        xmin = int(box.find('xmin').text)
        ymin = int(box.find('ymin').text)
        xmax = int(box.find('xmax').text)
        ymax = int(box.find('ymax').text)
        # Horizontal flip: x' = W - x (min/max edges swap), y unchanged.
        new_xmin, new_xmax = W - xmax, W - xmin
        box.find('xmin').text = str(new_xmin)
        box.find('ymin').text = str(ymin)
        box.find('xmax').text = str(new_xmax)
        box.find('ymax').text = str(ymax)
    newTree = ElementTree(self.root)
    newTree.write(dst, pretty_print=True, xml_declaration=False)
def __init__(self, template: etree.ElementTree, global_decl: Decl):
    """Build a runtime wrapper around an XML <template> element.

    :param template: the <template> element (with location, transition and
        optional declaration children)
    :param global_decl: parsed global declaration whose values seed this
        template's local declaration scope
    """
    self.template = template
    # Wrap each <location>/<transition> child in its model object.
    self.locations = [
        Location(location, self) for location in template.findall("location")
    ]
    self.transitions = [
        Transition(transition, self) for transition in template.findall("transition")
    ]
    self.global_decl = global_decl
    # Local declarations start from a copy of the global values, then are
    # extended by the template's own <declaration> text, if present.
    self.declaration = Decl(values=global_decl.values)
    self.executedTransitions = []
    if template.find("declaration") is not None:
        self.declaration.parse(template.find("declaration").text)
    self.systemName = self.name
    self.templateName = self.name
    self.usefull = True
    self.active = True
    # Deep copy preserves a pristine template for later resets.
    self.StoredTemplate = copy.deepcopy(template)
    # Dispatch tables: device kind -> modify/revert handler, and
    # device kind -> variables that handler needs.
    self.staticFuncs = dict(TimeShiftable=self.modifyTS, TimeShiftableR=self.revertTS,
                            EV=self.modifyEV, EVR=self.revertEV,
                            Battery=self.modifyBattery, BatteryR=self.revertBattery)
    self.neededVars = dict(TimeShiftable=self.tsVars, EV=self.evVars, Battery=self.batteryVars)
    self.dynamic = True
    self.previousVarValues = None
    self.startTime = None  # used to sort EVs
def generate_tool_conf(parsed_ctds, tool_conf_destination, galaxy_tool_path, default_category):
    """Generate Galaxy's tool_conf.xml, grouping the generated tools by category.

    :param parsed_ctds: parsed CTD models with their suggested output files
    :param tool_conf_destination: path of the tool_conf.xml file to write
    :param galaxy_tool_path: prefix prepended to each tool's file attribute
    :param default_category: category used when a model declares none
    """
    # for each category, we keep a list of models corresponding to it
    categories_to_tools = dict()
    for parsed_ctd in parsed_ctds:
        category = strip(parsed_ctd.ctd_model.opt_attribs.get("category", ""))
        if not category.strip():
            category = default_category
        if category not in categories_to_tools:
            categories_to_tools[category] = []
        categories_to_tools[category].append(utils.get_filename(parsed_ctd.suggested_output_file))

    # at this point, we should have a map for all categories->tools
    toolbox_node = Element("toolbox")

    # normalize the tool path so it always ends with a single '/'
    if galaxy_tool_path is not None and not galaxy_tool_path.strip().endswith("/"):
        galaxy_tool_path = galaxy_tool_path.strip() + "/"
    if galaxy_tool_path is None:
        galaxy_tool_path = ""

    for category, file_names in categories_to_tools.iteritems():
        section_node = add_child_node(toolbox_node, "section")
        section_node.attrib["id"] = "section-id-" + "".join(category.split())
        section_node.attrib["name"] = category

        for filename in file_names:
            tool_node = add_child_node(section_node, "tool")
            tool_node.attrib["file"] = galaxy_tool_path + filename

    toolconf_tree = ElementTree(toolbox_node)
    # BUG FIX: close the destination file deterministically instead of
    # leaking the handle returned by open().
    with open(tool_conf_destination, 'w') as out:
        toolconf_tree.write(out, encoding="UTF-8", xml_declaration=True, pretty_print=True)
    logger.info("Generated Galaxy tool_conf.xml in %s" % tool_conf_destination, 0)
def _convert_internal(parsed_ctds, **kwargs):
    """Convert each parsed CTD model into a Galaxy tool wrapper file.

    Models listed in kwargs['skip_tools'] are skipped; when
    kwargs['required_tools'] is set, only those models are converted.
    """
    # parse all input files into models using CTDopts (via utils)
    # the output is a tuple containing the model, output destination, origin file
    for parsed_ctd in parsed_ctds:
        model = parsed_ctd.ctd_model
        origin_file = parsed_ctd.input_file
        output_file = parsed_ctd.suggested_output_file

        if kwargs["skip_tools"] is not None and model.name in kwargs["skip_tools"]:
            logger.info("Skipping tool %s" % model.name, 0)
            continue
        elif kwargs["required_tools"] is not None and model.name not in kwargs["required_tools"]:
            logger.info("Tool %s is not required, skipping it" % model.name, 0)
            continue
        else:
            logger.info("Converting %s (source %s)" % (model.name, utils.get_filename(origin_file)), 0)
            tool = create_tool(model)
            write_header(tool, model)
            create_description(tool, model)
            expand_macros(tool, model, **kwargs)
            create_command(tool, model, **kwargs)
            create_inputs(tool, model, **kwargs)
            create_outputs(tool, model, **kwargs)
            create_help(tool, model)

            # wrap our tool element into a tree to be able to serialize it
            tree = ElementTree(tool)
            logger.info("Writing to %s" % utils.get_filename(output_file), 1)
            # BUG FIX: close the output file deterministically instead of
            # leaking the handle returned by open().
            with open(output_file, 'w') as out:
                tree.write(out, encoding="UTF-8", xml_declaration=True, pretty_print=True)
def get_info(path):
    """Return (name, file:// URL) pairs for the entries of the feed at *path*,
    in reverse document order (newest first)."""
    tree = ET(file=path)
    base_dir = os.path.dirname(path)
    # hack, it might not always be [1]. it just is on here
    container = tree.getroot()[1]
    entries = []
    for node in container:
        entries.append((node.get('name'), 'file://' + os.path.join(base_dir, node.get('link'))))
    entries.reverse()
    return entries
def scrape(self, chamber, year):
    """Scrape and save all bills for *chamber* in the legislature of *year*.

    :raises NoDataForPeriod: for years before 1999 (no data online)
    """
    year = int(year)
    session = self.getSession(year)
    #2 year terms starting on odd year, so if even number, use the previous odd year
    if year < 1999:
        raise NoDataForPeriod(year)
    if year % 2 == 0:
        year -= 1

    # 1999 bills live at an unversioned URL; later sessions are year-prefixed.
    if year == 1999:
        base_bill_url = 'http://data.opi.mt.gov/bills/BillHtml/'
    else:
        base_bill_url = 'http://data.opi.mt.gov/bills/%d/BillHtml/' % year
    index_page = ElementTree(
        lxml.html.fromstring(self.urlopen(base_bill_url)))

    bill_urls = []
    for bill_anchor in index_page.findall('//a'):
        # See 2009 HB 645
        if bill_anchor.text.find("govlineveto") == -1:
            # House bills start with H, Senate bills start with S
            if chamber == 'lower' and bill_anchor.text.startswith('H'):
                bill_urls.append("%s%s" % (base_bill_url, bill_anchor.text))
            elif chamber == 'upper' and bill_anchor.text.startswith('S'):
                bill_urls.append("%s%s" % (base_bill_url, bill_anchor.text))

    for bill_url in bill_urls:
        bill = self.parse_bill(bill_url, session, chamber)
        self.save_bill(bill)
def parse_activity(
        activity: etree.ElementTree
) -> Tuple[List[backend.events.AcademicalEvent], str, str, str]:
    """
    Parses an element from a request into a list of events and some activity information.

    :param activity: the activity element
    :type activity: etree.ElementTree
    :return: the events, the name, the id and the code of this activity
    :rtype: Tuple[List[backend.events.AcademicalEvent], str, str, str]
    """
    activity_id = activity.attrib['name']
    activity_type = activity.attrib['type']
    activity_name = activity.attrib['code']
    event_type = backend.events.extract_type(activity_type, activity_id)
    event_codes = activity.xpath(
        './/eventParticipant[@category="category5"]/@name')
    events = activity.xpath('.//event')
    events_list = list()

    if len(event_codes) == 0:
        activity_code = backend.events.extract_code(activity_id)
    else:
        # Use the most frequent participant code as the activity code.
        activity_code = Counter(event_codes).most_common()[0][0]
    # BUG FIX: the original used `activity_code is ''` — an identity check
    # whose result for strings is implementation-dependent; compare by value.
    if activity_code == '':
        activity_code = 'Other'

    for event in events:
        events_list.append(
            parse_event(event, event_type, activity_name, activity_id, activity_code))

    return events_list, activity_name, activity_id, activity_code
def scrape(self, chamber, year):
    """Scrape all bills for *chamber* in the legislature seated in *year*.

    :raises NoDataForPeriod: for years before 1999
    """
    year = int(year)
    session = self.getSession(year)
    # 2 year terms starting on odd year, so if even number, use the previous odd year
    if year < 1999:
        raise NoDataForPeriod(year)
    if year % 2 == 0:
        year -= 1

    # 1999 bills live at an unversioned URL; later sessions are year-prefixed.
    if year == 1999:
        base_bill_url = "http://data.opi.mt.gov/bills/BillHtml/"
    else:
        base_bill_url = "http://data.opi.mt.gov/bills/%d/BillHtml/" % year
    index_page = ElementTree(lxml.html.fromstring(self.urlopen(base_bill_url)))

    bill_urls = []
    for anchor in index_page.findall("//a"):
        label = anchor.text
        # See 2009 HB 645
        if label.find("govlineveto") != -1:
            continue
        # House bills start with H, Senate bills start with S
        if chamber == "lower" and label.startswith("H"):
            bill_urls.append("%s%s" % (base_bill_url, label))
        elif chamber == "upper" and label.startswith("S"):
            bill_urls.append("%s%s" % (base_bill_url, label))

    for bill_url in bill_urls:
        self.save_bill(self.parse_bill(bill_url, session, chamber))
def parse_bill(self, bill_url, session, chamber):
    """Parse one bill index page into a Bill object.

    Follows the 'status of' / 'Detailed Information (status)' anchor for the
    status page and the 'This bill in WP' anchor for version documents.
    When no status anchor exists, falls back to a LAWS search URL built from
    the bill type/number encoded in the page name.
    """
    bill = None
    bill_page = ElementTree(lxml.html.fromstring(self.urlopen(bill_url)))

    for anchor in bill_page.findall('//a'):
        if (anchor.text_content().startswith('status of') or anchor.text_content().startswith(
                'Detailed Information (status)')):
            status_url = anchor.attrib['href'].replace("\r", "").replace(
                "\n", "")
            bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)
        elif anchor.text_content().startswith('This bill in WP'):
            index_url = anchor.attrib['href']
            index_url = index_url[0:index_url.rindex('/')]
            # this looks weird. See http://data.opi.mt.gov/bills/BillHtml/SB0002.htm for why
            index_url = index_url[index_url.rindex("http://"):]
            self.add_bill_versions(bill, index_url)

    if bill is None:
        # No bill was found. Maybe something like HB0790 in the 2005 session?
        # We can search for the bill metadata.
        page_name = bill_url.split("/")[-1].split(".")[0]
        bill_type = page_name[0:2]
        bill_number = page_name[2:]
        # Two-digit year expected by the LAWS search URL.
        laws_year = self.metadata['session_details'][session]['years'][
            0] % 100

        status_url = self.search_url_template % (laws_year, bill_type, bill_number)
        bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)

    return bill
def getdescendants(request, code):
    """Return the DeCS tree descendants of *code* as a JSON HttpResponse.

    Queries the DeCS web service once per supported language and merges the
    per-language labels for each descendant tree_id.
    NOTE: Python 2 code (urllib.urlencode / urlopen).
    """
    params = {}
    results = {}
    language = request.LANGUAGE_CODE.lower()
    if language == 'pt-br':
        language = 'pt'

    for lang in DECS_LANGS:
        params[lang] = urllib.urlencode({
            'tree_id': code or '',
            'lang': lang,
        })
        resource = urllib.urlopen(settings.DECS_SERVICE, params[lang])
        tree = ElementTree()
        tree.parse(resource)
        descendants = tree.findall('decsws_response/tree/descendants/term_list[@lang="%s"]/term' % lang)
        for d in descendants:
            # Accumulate a '"lang":"Label"' JSON fragment per language for each tree_id.
            if d.attrib['tree_id'] in results:
                results[ d.attrib['tree_id'] ] += ',"%s":"%s"' % (lang,d.text.capitalize())
            else:
                results[ d.attrib['tree_id'] ] = '"%s":"%s"' % (lang,d.text.capitalize())

    # NOTE(review): items() yields (tree_id, label-fragments) but the loop
    # binds them as (desc, id); the template is therefore filled with
    # (id, desc) == (label-fragments, tree_id). Confusing naming, but the
    # two swaps cancel out.
    json = '[%s]' % ','.join((JSON_MULTILINGUAL_TERM % (id,desc) for desc,id in results.items()))
    json_response = json_loads(json)
    json_response.sort(key=lambda x: x['fields']['description'][language])
    return HttpResponse(json_dumps(json_response), mimetype='application/json')
def exportProjectAction(self, widget, data):
    """Menu callback: export the current project as an XML file.

    Opens a save dialog, asks for confirmation before overwriting an
    existing file, then serializes the project's XML configuration.
    """
    chooser = gtk.FileChooserDialog(title=_("Export as (XML)"), action=gtk.FILE_CHOOSER_ACTION_SAVE, buttons=(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL, gtk.STOCK_OPEN, gtk.RESPONSE_OK))
    res = chooser.run()
    if res == gtk.RESPONSE_OK:
        fileName = chooser.get_filename()
        chooser.destroy()

        doCreateFile = False
        isFile = os.path.isfile(fileName)
        if not isFile:
            doCreateFile = True
        else:
            # Ask before overwriting an existing file.
            md = gtk.MessageDialog(None, gtk.DIALOG_DESTROY_WITH_PARENT, gtk.MESSAGE_QUESTION, gtk.BUTTONS_OK_CANCEL, _("Are you sure to override the file '{0}'?").format(fileName))
            resp = md.run()
            md.destroy()
            if resp == gtk.RESPONSE_OK:
                doCreateFile = True

        if doCreateFile:
            root = self.netzob.getCurrentProject().generateXMLConfigFile()
            tree = ElementTree(root)
            tree.write(fileName)
            NetzobInfoMessage(_("Project correctly exported to '{0}'").format(fileName))
def save(self, filePath, setAllReferences=True):
    """Serialize this object to *filePath* as indented UTF-8 XML.

    When *setAllReferences* is true, cross-references are refreshed before
    serializing.
    """
    if setAllReferences:
        self.setAllReferences()
    document = XMLElementTree(self.toXML())
    indent(document, space=" ")
    document.write(filePath, xml_declaration=True, encoding="utf-8", pretty_print=True)
def loadProject_0_1(projectFile):
    """Load a Netzob project saved in the 0.1 XML format.

    Parses the project file, registers the Netzob XML namespaces, then loads
    the optional configuration, vocabulary, grammar and simulator sections
    into a new Project instance.

    :param projectFile: path of the project XML file
    :return: the populated Project
    """
    # Parse the XML Document as 0.1 version
    tree = ElementTree()
    tree.parse(projectFile)
    xmlProject = tree.getroot()

    # Register the namespace
    etree.register_namespace("netzob", PROJECT_NAMESPACE)
    etree.register_namespace("netzob-common", COMMON_NAMESPACE)

    projectID = str(xmlProject.get("id"))
    projectName = xmlProject.get("name", "none")
    projectCreationDate = TypeConvertor.xsdDatetime2PythonDatetime(xmlProject.get("creation_date"))
    projectPath = xmlProject.get("path")
    project = Project(projectID, projectName, projectCreationDate, projectPath)

    description = xmlProject.get("description")
    project.setDescription(description)

    # Parse the configuration
    if xmlProject.find("{" + PROJECT_NAMESPACE + "}configuration") is not None:
        projectConfiguration = ProjectConfiguration.loadProjectConfiguration(
            xmlProject.find("{" + PROJECT_NAMESPACE + "}configuration"), PROJECT_NAMESPACE, "0.1"
        )
        project.setConfiguration(projectConfiguration)

    # Parse the vocabulary
    # NOTE(review): projectVocabulary is only bound when a vocabulary section
    # exists; a file with a grammar but no vocabulary would NameError below —
    # TODO confirm whether that combination can occur in practice.
    if xmlProject.find("{" + PROJECT_NAMESPACE + "}vocabulary") is not None:
        projectVocabulary = Vocabulary.loadVocabulary(
            xmlProject.find("{" + PROJECT_NAMESPACE + "}vocabulary"),
            PROJECT_NAMESPACE,
            COMMON_NAMESPACE,
            "0.1",
            project,
        )
        project.setVocabulary(projectVocabulary)

    # Parse the grammar
    if xmlProject.find("{" + PROJECT_NAMESPACE + "}grammar") is not None:
        projectGrammar = Grammar.loadGrammar(
            xmlProject.find("{" + PROJECT_NAMESPACE + "}grammar"), projectVocabulary, PROJECT_NAMESPACE, "0.1"
        )
        if projectGrammar is not None:
            project.setGrammar(projectGrammar)

    # Parse the simulator
    if xmlProject.find("{" + PROJECT_NAMESPACE + "}simulator") is not None:
        projectSimulator = Simulator.loadSimulator(
            xmlProject.find("{" + PROJECT_NAMESPACE + "}simulator"),
            PROJECT_NAMESPACE,
            "0.1",
            project.getGrammar().getAutomata(),
            project.getVocabulary(),
        )
        if projectSimulator is not None:
            project.setSimulator(projectSimulator)

    return project
def scrape(self, chamber, session):
    """Scrape and save every bill of *chamber* for *session*.

    Resolves the session to its term's start year, fetches the bill index
    page, and parses each House ('H...') or Senate ('S...') bill link.
    """
    # Terms span two years; find the start year of the matching term.
    for term in self.metadata['terms']:
        if session in term['sessions']:
            year = term['start_year']
            break

    self.versions_dict = self._versions_dict(year)

    base_bill_url = 'http://data.opi.mt.gov/bills/%d/BillHtml/' % year
    index_page = ElementTree(
        lxml.html.fromstring(self.urlopen(base_bill_url)))

    bill_urls = []
    for bill_anchor in index_page.findall('//a'):
        # See 2009 HB 645
        if bill_anchor.text.find("govlineveto") == -1:
            # House bills start with H, Senate bills start with S
            if chamber == 'lower' and bill_anchor.text.startswith('H'):
                bill_urls.append("%s%s" % (base_bill_url, bill_anchor.text))
            elif chamber == 'upper' and bill_anchor.text.startswith('S'):
                bill_urls.append("%s%s" % (base_bill_url, bill_anchor.text))

    for bill_url in bill_urls:
        bill = self.parse_bill(bill_url, session, chamber)
        if bill:
            self.save_bill(bill)
def scrape(self, chamber, session):
    """Fetch the leg.mt.gov bill index for *session*/*chamber* and save each parsed bill."""
    # Terms span two years; find the start year of the term containing this session.
    for term in self.metadata['terms']:
        if session in term['sessions']:
            year = term['start_year']
            break

    self.versions_dict = self._versions_dict(year)

    base_bill_url = 'http://leg.mt.gov/bills/%d/BillHtml/' % year
    doc = ElementTree(lxml.html.fromstring(self.get(base_bill_url).text))

    bill_urls = []
    for anchor in doc.findall('//a'):
        label = anchor.text
        # See 2009 HB 645
        if label.find("govlineveto") != -1:
            continue
        # House bills start with H, Senate bills start with S
        if chamber == 'lower' and label.startswith('H'):
            bill_urls.append("%s%s" % (base_bill_url, label))
        elif chamber == 'upper' and label.startswith('S'):
            bill_urls.append("%s%s" % (base_bill_url, label))

    for bill_url in bill_urls:
        parsed = self.parse_bill(bill_url, session, chamber)
        if parsed:
            self.save_bill(parsed)
def __str__ (self) :
    """Render the 'HUHU' header elements as pretty-printed XML text."""
    header_tree = ElementTree()
    header_tree._setroot(Element('Header'))
    header_root = header_tree.getroot()
    for child in self.header('HUHU'):
        header_root.append(child)
    return tostring(header_tree, pretty_print=True, encoding='unicode')
def saveConfigFile(self, overrideTraces=[]):
    """This functions allows to save the current (and only) instance of the Workspace.

    You can supply a list of traces that should be written on-disk through the
    `overrideTraces` variable. This allows to override specific traces that
    where modified.

    :param overrideTraces: a list of trace identifiers that should be written
        on-disk, even if they already exists.
    """
    # NOTE(review): mutable default argument ([]); harmless here since it is
    # only read, but a None sentinel would be safer.
    workspaceFile = os.path.join(self.path, Workspace.CONFIGURATION_FILENAME)

    logging.info("Save the config file of the workspace {0} in {1}".format(self.getName(), workspaceFile))

    # Register the namespace
    etree.register_namespace('netzob', WORKSPACE_NAMESPACE)
    etree.register_namespace('netzob-common', COMMON_NAMESPACE)

    # Dump the file
    root = etree.Element("{" + WORKSPACE_NAMESPACE + "}workspace")
    root.set("creation_date", TypeConvertor.pythonDatetime2XSDDatetime(self.getCreationDate()))
    root.set("name", str(self.getName()))

    xmlWorkspaceConfig = etree.SubElement(root, "{" + WORKSPACE_NAMESPACE + "}configuration")

    # NOTE(review): relTracePath is computed but never used — the absolute
    # traces path is what gets serialized below. TODO confirm intent.
    relTracePath = os.path.relpath(self.getPathOfTraces(), self.path)
    xmlTraces = etree.SubElement(xmlWorkspaceConfig, "{" + WORKSPACE_NAMESPACE + "}traces")
    xmlTraces.text = str(self.getPathOfTraces())

    xmlLogging = etree.SubElement(xmlWorkspaceConfig, "{" + WORKSPACE_NAMESPACE + "}logging")
    xmlLogging.text = str(self.getPathOfLogging())

    xmlPrototypes = etree.SubElement(xmlWorkspaceConfig, "{" + WORKSPACE_NAMESPACE + "}prototypes")
    xmlPrototypes.text = str(self.getPathOfPrototypes())

    xmlPrototypes = etree.SubElement(xmlWorkspaceConfig, "{" + WORKSPACE_NAMESPACE + "}enable_bug_reporting")
    xmlPrototypes.text = str(self.enableBugReporting).lower()

    xmlWorkspaceProjects = etree.SubElement(root, "{" + WORKSPACE_NAMESPACE + "}projects")
    for projectPath in self.getProjectsPath():
        xmlProject = etree.SubElement(xmlWorkspaceProjects, "{" + WORKSPACE_NAMESPACE + "}project")
        xmlProject.set("path", projectPath)

    xmlWorkspaceImported = etree.SubElement(root, "{" + WORKSPACE_NAMESPACE + "}traces")
    for importedTrace in self.getImportedTraces():
        # overrideTraces variable contains the list of
        # ImportedTraces that should be overriden. This is useful
        # in case of message removal for example.
        forceOverride = (importedTrace.id in overrideTraces)
        importedTrace.save(xmlWorkspaceImported, WORKSPACE_NAMESPACE, COMMON_NAMESPACE, os.path.join(self.path, self.getPathOfTraces()), forceOverride)

    xmlWorkspaceFunctions = etree.SubElement(root, "{" + WORKSPACE_NAMESPACE + "}functions")
    for function in self.getCustomFunctions():
        function.save(xmlWorkspaceFunctions, WORKSPACE_NAMESPACE)

    tree = ElementTree(root)
    tree.write(workspaceFile, pretty_print=True)
def save(self, filePath):
    """Write this object's XML representation to *filePath*, indented, with an XML declaration."""
    document = XMLElementTree(self.toXML())
    indent(document, space=" ")
    document.write(filePath, xml_declaration=True, encoding="utf-8", pretty_print=True)
def print_predictions(agency, stops, label=""):
    """Fetch NextBus predictions for *stops* and print them as HTML fragments.

    NOTE: Python 2 code (print statements, urllib.urlopen, list-returning filter).
    """
    title_index = build_title_index(stops)
    url = build_url(agency, stops)
    debug("NextBus predictions for %s: %s" % (agency, url))
    f = urllib.urlopen(url)
    e = ElementTree(file=f)
    predictions = e.findall("//predictions")
    # Keep only stops that actually have at least one prediction, then order
    # by the soonest arrival time.
    predictions = filter(lambda el: el.find(".//prediction") is not None, predictions)
    predictions.sort(key=lambda el: el.find(".//prediction").get("epochTime"))
    for n, p in enumerate(predictions):
        routeTag = p.get("routeTag")
        stopTag = p.get("stopTag")
        title = title_index.get((routeTag, stopTag), False)
        if title:
            title = "<em>%s</em>" % (title.replace("\n", "<br>"), )
        else:
            # Fall back to the route title, minus any "Saferide " prefix.
            title = p.get("routeTitle")
            title = re.sub(r'^Saferide ', '', title)
        title = label + title
        print "<h2>"+title+"</h2>"
        times = p.findall(".//prediction")
        print "<ol class='predictions'>"
        # First prediction, then up to two more.
        print '<li>%s</li>' % minutes(times.pop(0).get("minutes"))
        for t in times[0:2]:
            print '<li>%s</li>' % minutes(t.get("minutes"))
        print "</ol>"
def get_chapters(request):
    """Return the ICD-10 chapter list as a JSON HttpResponse.

    Queries the ICD-10 web service for the chapter list and builds one
    translated "<chapter> - <description>" entry per supported language.
    NOTE: Python 2 code (urllib.urlencode / urlopen).
    """
    params = {}
    results = {}
    language = "pt"  # language detection from the request is currently disabled
    # language = request.LANGUAGE_CODE.lower()
    # if language == 'pt-br':
    #     language = 'pt'

    params = urllib.urlencode({"LI": "CAPITULO"})
    resource = urllib.urlopen(settings.ICD10_SERVICE, params)
    tree = ElementTree()
    tree.parse(resource)
    terms = tree.findall("cid10ws_response")

    data = []
    for term in terms:
        description = {}
        chapter = term.findall("tree/self/term_list/term")[0]
        for lang in ICD10_LANGS:
            term_trans = term.findall('record_list/record/descriptor_list/descriptor[@lang="%s"]' % lang)[0]
            if term_trans.text:
                description[lang] = "%s - %s" % (chapter.attrib["chapter"], term_trans.text.strip().capitalize())
        data.append({"fields": {"description": description, "label": chapter.attrib["tree_id"]}})

    return HttpResponse(json.dumps(data), mimetype="application/json")
def dumpVOCAnnotations(output_folder, filename, size, names, bounding_boxes):
    """Write a Pascal-VOC style annotation XML for one image.

    :param output_folder: dataset root containing 'images' and 'annotations'
    :param filename: image base name (without extension)
    :param size: (height, width, depth) of the image
    :param names: class label per object
    :param bounding_boxes: (xmin, ymin, xmax, ymax) per object
    """
    root = Element('annotation')
    SubElement(root, 'folder').text = 'images'
    SubElement(root, 'filename').text = filename + '.jpg'
    SubElement(root, 'path').text = os.path.join(output_folder, 'images')
    size_node = SubElement(root, 'size')
    # size is (height, width, depth); VOC stores width first.
    for tag, value in (('width', size[1]), ('height', size[0]), ('depth', size[2])):
        SubElement(size_node, tag).text = str(value)
    SubElement(root, 'segmented').text = '0'

    for label, box in zip(names, bounding_boxes):
        obj_node = SubElement(root, 'object')
        SubElement(obj_node, 'name').text = label
        SubElement(obj_node, 'pose').text = 'Unspecified'
        SubElement(obj_node, 'truncated').text = '0'
        SubElement(obj_node, 'difficult').text = '0'
        bndbox_node = SubElement(obj_node, 'bndbox')
        for i, tag in enumerate(('xmin', 'ymin', 'xmax', 'ymax')):
            SubElement(bndbox_node, tag).text = str(box[i])

    tree = ElementTree(root)
    tree.write(os.path.join(output_folder, 'annotations', filename) + '.xml', pretty_print=True)
def scrape_pre_2003_legislators(self, chamber, year, session, suffix):
    """Scrape legislator rosters from the 1999/2001 session pages.

    These older pages hold the roster in a named HTML table whose header
    rows vary, so parsing only starts after the 'Name (Party)' header row.
    NOTE: Python 2 code (dict.has_key).
    """
    url = 'http://leg.mt.gov/css/Sessions/%d%s/legname.asp' % (session, suffix)
    legislator_page = ElementTree(lxml.html.fromstring(self.urlopen(url)))

    # Table name (and a row offset) differ per year/chamber.
    # NOTE(review): startRow is assigned but never used below.
    if year == 2001:
        if chamber == 'upper':
            tableName = '57th Legislatore Roster Senate (2001-2002)'
            startRow = 3
        else:
            tableName = '57th Legislator Roster (House)(2001-2002)'
            startRow = 5
    elif year == 1999:
        if chamber == 'upper':
            tableName = 'Members of the Senate'
            startRow = 3
        else:
            tableName = 'Members of the House'
            startRow = 5

    for table in legislator_page.xpath("//table"):
        if table.attrib.has_key('name') and table.attrib['name'] == tableName:
            parse_names = False
            for row in table.getchildren():
                if row.tag != 'tr':
                    continue
                celldata = row.getchildren()[0].text_content().strip()
                if parse_names and len(celldata) != 0:
                    # Cells look like "Last, First (P)"; P is the party letter.
                    name, party_letter = celldata.rsplit(' (', 1)
                    party_letter = party_letter[0]
                    nameParts = [namePart.strip() for namePart in name.split(',')]
                    assert len(nameParts) < 4
                    if len(nameParts) == 2:
                        last_name, first_name = nameParts
                    elif len(nameParts) == 3:
                        last_name = ' '.join(nameParts[0:2])
                        first_name = nameParts[2]
                    else:
                        # NOTE(review): this branch re-splits celldata but never
                        # assigns first_name/last_name, so a one-part name would
                        # NameError (or reuse stale values) below — TODO confirm
                        # whether this case can occur in the data.
                        name, party_letter = celldata.rsplit(' (', 1)

                    district = row.getchildren()[2].text_content().strip()
                    if party_letter == 'R':
                        party = 'Republican'
                    elif party_letter == 'D':
                        party = 'Democrat'
                    else:
                        party = party_letter

                    legislator = Legislator(session, chamber, district, '%s %s' % (first_name, last_name), \
                                            first_name, last_name, '', party)
                    legislator.add_source(url)
                    self.save_legislator(legislator)

                if celldata == "Name (Party)":
                    # The table headers seem to vary in size, but the last row
                    # always seems to start with 'Name (Party)' -- once we find
                    # that, start parsing legislator names
                    parse_names = True
def serialize_browse_layers(browse_layers, stream=None, pretty_print=False):
    """Serialize browse layer models into a <browseLayers> XML document.

    Writes the document to `stream` (a new StringIO when not given) with
    an XML declaration and UTF-8 encoding, and returns the stream.
    """
    if not stream:
        stream = StringIO()

    browse_layers_elem = Element(ns_cfg("browseLayers"), nsmap={"cfg": ns_cfg.uri})
    for browse_layer in browse_layers:
        bl_elem = SubElement(browse_layers_elem, ns_cfg("browseLayer"),
                             attrib={"browseLayerId": browse_layer.id})

        rgb = browse_layer.r_band, browse_layer.g_band, browse_layer.b_band
        # `len(filter(...))` was a Python-2-ism: under Python 3 filter()
        # returns an iterator and len() raises TypeError. all() expresses
        # the same "every band/bound is set" check on both versions.
        has_rgb = all(v is not None for v in rgb)
        ri = (browse_layer.radiometric_interval_min,
              browse_layer.radiometric_interval_max)
        has_ri = all(v is not None for v in ri)

        SubElement(bl_elem, ns_cfg("browseType")).text = browse_layer.browse_type
        SubElement(bl_elem, ns_cfg("title")).text = browse_layer.title
        if browse_layer.description is not None:
            SubElement(bl_elem, ns_cfg("description")).text = browse_layer.description
        SubElement(bl_elem, ns_cfg("grid")).text = browse_layer.grid
        SubElement(bl_elem, ns_cfg("browseAccessPolicy")).text = \
            browse_layer.browse_access_policy
        SubElement(bl_elem, ns_cfg("hostingBrowseServerName")).text = ""
        rel_ds_elem = SubElement(bl_elem, ns_cfg("relatedDatasetIds"))
        for rel_ds_id in browse_layer.related_dataset_ids:
            SubElement(rel_ds_elem, ns_cfg("datasetId")).text = rel_ds_id
        SubElement(bl_elem, ns_cfg("containsVerticalCurtains")).text = \
            "true" if browse_layer.contains_vertical_curtains else "false"
        if has_rgb:
            SubElement(bl_elem, ns_cfg("rgbBands")).text = ",".join(map(str, rgb))
        if has_ri:
            ri_elem = SubElement(bl_elem, ns_cfg("radiometricInterval"))
            SubElement(ri_elem, ns_cfg("min")).text = str(ri[0])
            SubElement(ri_elem, ns_cfg("max")).text = str(ri[1])
        SubElement(bl_elem, ns_cfg("highestMapLevel")).text = \
            str(browse_layer.highest_map_level)
        SubElement(bl_elem, ns_cfg("lowestMapLevel")).text = \
            str(browse_layer.lowest_map_level)
        SubElement(bl_elem, ns_cfg("timeDimensionDefault")).text = \
            str(browse_layer.timedimension_default)
        SubElement(bl_elem, ns_cfg("tileQueryLimit")).text = \
            str(browse_layer.tile_query_limit)

    # TODO: encoding
    et = ElementTree(browse_layers_elem)
    et.write(stream, pretty_print=pretty_print, encoding="utf-8",
             xml_declaration=True)
    return stream
def scrape_pre_58_legislators(self, chamber, term, suffix):
    """Scrape Montana legislator rosters for terms before the 58th.

    Parses the legname.asp roster page for term '56' or '57' and saves a
    Legislator (name, party, district) for each member of the chamber.

    Raises ValueError for any other term (previously an unsupported term
    left `tableName` unbound and crashed later with a NameError).
    """
    url = 'http://leg.mt.gov/css/Sessions/%s%s/legname.asp' % (term, suffix)
    legislator_page = ElementTree(lxml.html.fromstring(self.urlopen(url)))

    # The roster table is identified by its name attribute, which differs
    # per term and chamber.
    if term == '57':
        if chamber == 'upper':
            tableName = '57th Legislatore Roster Senate (2001-2002)'
        else:
            tableName = '57th Legislator Roster (House)(2001-2002)'
    elif term == '56':
        if chamber == 'upper':
            tableName = 'Members of the Senate'
        else:
            tableName = 'Members of the House'
    else:
        raise ValueError('No roster table known for term %r' % term)

    for table in legislator_page.xpath("//table"):
        # attrib.get() replaces the Python-2-only attrib.has_key().
        if table.attrib.get('name') != tableName:
            continue
        parse_names = False
        for row in table.getchildren():
            if row.tag != 'tr':
                continue
            celldata = row.getchildren()[0].text_content().strip()
            if parse_names and len(celldata) != 0:
                name, party_letter = celldata.rsplit(' (', 1)
                party_letter = party_letter[0]
                nameParts = [namePart.strip() for namePart in name.split(',')]
                assert len(nameParts) < 4
                if len(nameParts) == 2:
                    last_name, first_name = nameParts
                elif len(nameParts) == 3:
                    # e.g. "Last Suffix, First" -- join the first two parts.
                    last_name = ' '.join(nameParts[0:2])
                    first_name = nameParts[2]
                else:
                    # NOTE(review): a single-part name leaves first_name /
                    # last_name carrying values from the previous row --
                    # behavior preserved as-is; confirm against real data.
                    name, party_letter = celldata.rsplit(' (', 1)
                district = row.getchildren()[2].text_content().strip()
                if party_letter == 'R':
                    party = 'Republican'
                elif party_letter == 'D':
                    party = 'Democrat'
                else:
                    party = party_letter
                legislator = Legislator(term, chamber, district,
                                        '%s %s' % (first_name, last_name),
                                        first_name, last_name, '', party)
                legislator.add_source(url)
                self.save_legislator(legislator)
            if celldata == "Name (Party)":
                # Header sizes vary, but the last header row always starts
                # with 'Name (Party)' -- start parsing rows after it.
                parse_names = True
def parse_bill(self, bill_url, session, chamber):
    """Scrape a single bill page and return the populated bill object.

    Locates the bill's status page via its "status of" / "Detailed
    Information (status)" link, falls back to the LAWS search URL when
    no link yields a bill, then attaches the PDF version and records
    the status URL as a source. Returns None on skip or parse failure.
    """
    # Temporarily skip the differently-formatted house budget bill.
    if 'billhtml/hb0002.htm' in bill_url.lower():
        return
    bill = None
    try:
        doc = lxml.html.fromstring(self.get(bill_url).text)
    except XMLSyntaxError as e:
        self.logger.warning("Got %r while parsing %r" % (e, bill_url))
        return
    bill_page = ElementTree(doc)

    for anchor in bill_page.findall('//a'):
        if (anchor.text_content().startswith('status of') or anchor.text_content().startswith(
                'Detailed Information (status)')):
            # Strip stray CR/LF characters embedded in the href.
            status_url = anchor.attrib['href'].replace("\r", "").replace(
                "\n", "")
            bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)
    if bill is None:
        # No bill was found. Maybe something like HB0790 in the 2005 session?
        # We can search for the bill metadata.
        page_name = bill_url.split("/")[-1].split(".")[0]
        bill_type = page_name[0:2]
        bill_number = page_name[2:]
        # Two-digit year expected by the LAWS search URL.
        laws_year = self.metadata['session_details'][session]['years'][
            0] % 100

        status_url = self.search_url_template % (laws_year, bill_type, bill_number)
        bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)

    # Get versions on the detail page.
    versions = [a['action'] for a in bill['actions']]
    versions = [a for a in versions if 'Version Available' in a]
    if not versions:
        version_name = 'Introduced'
    else:
        version = versions.pop()
        if 'New Version' in version:
            version_name = 'Amended'
        elif 'Enrolled' in version:
            version_name = 'Enrolled'
        # NOTE(review): if the last 'Version Available' action is neither
        # 'New Version' nor 'Enrolled', version_name is left unbound and
        # add_version below raises UnboundLocalError -- confirm those are
        # the only action texts that occur.
        self.add_other_versions(bill)

    # Add pdf.
    url = set(bill_page.xpath('//a/@href[contains(., "BillPdf")]')).pop()
    bill.add_version(version_name, url, mimetype='application/pdf')

    # Add status url as a source.
    bill.add_source(status_url)

    return bill
def main(input_file, output_file):
    """Run process() over every <screen> element and each of its
    children, then write the (possibly modified) tree back out."""
    document_root = ElementTree().parse(input_file)
    for screen_node in document_root.findall('screen'):
        print("====", screen_node.get('name'))
        process(screen_node)
        for child in screen_node:
            process(child)
    ElementTree(document_root).write(output_file)
def __init__(self, path):
    """Parse the pom.xml at `path` (comments and CDATA stripped) and
    keep the resulting document; raise PomLoadingException on failure."""
    pom_parser = XMLParser(remove_comments=True, strip_cdata=True)
    self.__doc = ElementTree().parse(path, parser=pom_parser)
    if self.__doc is None:
        raise PomLoadingException(
            "Failed to load pom.xml. You have a problem")
def lxml_get_files(page_response):
    """Return the hrefs of non-directory entries ('pre a' links whose
    href does not end with '/') on a directory-index page."""
    root = ElementTree(file=io.BytesIO(page_response.content),
                       parser=HTMLParser()).getroot()
    hrefs = (anchor.get("href") for anchor in root.cssselect('pre a'))
    return [href for href in hrefs if not href.endswith("/")]
def dump_xml(inf, outf, pretty_print=True):
    """Convert `inf` into a <wild> XML document and write it, with an
    explicit XML declaration, to the file object `outf`."""
    wild_root = Element("wild")
    # XXX game version is hard-coded to "firered"
    wild_root.append(to_xml(inf, "firered"))
    outf.write("""<?xml version="1.0" encoding="utf-8"?>\n""")
    ElementTree(wild_root).write(outf, pretty_print=pretty_print)
def parse_catalog_ref(element: etree.ElementTree):
    """Extract the xlink href/title and ID from a catalogRef element.

    When the element carries no ID attribute, the title is used as the
    identifier instead.
    """
    xlink_uri = element.nsmap.get("xlink")
    href = element.get("{%s}href" % xlink_uri)
    title = element.get("{%s}title" % xlink_uri)
    # `ref_id` instead of `id`, which shadows the builtin.
    ref_id = element.get("ID")
    if ref_id is None:
        ref_id = title
    return DotDict(href=href, title=title, id=ref_id)
def _get_asserts(self, content: etree.ElementTree) -> List[Dict[str, Any]]:
    """Collect the Schematron assertion checks declared in the schema.

    Walks every xs:appinfo block of `content`, skipping checks whose
    context element is optional (minOccurs="0" ancestors, or inside an
    xs:choice) and absent from self.xml_content.

    Raises ContextError when a mandatory context is missing from the
    XML document. Returns a list of dicts with keys 'name', 'assert',
    'context' and 'error'.
    """
    assertions = content.findall('.//xs:appinfo', namespaces=content.nsmap)
    assert_list = []
    for assertion in assertions:
        for pattern in assertion:
            name = pattern.attrib.get('name', None)
            if not name:
                continue
            for rule in pattern:
                context = rule.attrib['context']

                # Skip checks whose parent element may be absent
                # (an ancestor declares minOccurs=0).
                occurs_elements = assertion.xpath(
                    f'ancestor::*[@minOccurs=0]')
                if len(occurs_elements):
                    continue

                # Check whether the context element is present in the
                # XML file at all.
                if len(self.xml_content.xpath(f'//{context}')) == 0:
                    # Context not found in the XML file.
                    # Skip optional checks declared inside xs:choice.
                    choice_elements = assertion.xpath(f'ancestor::xs:choice',
                                                      namespaces=content.nsmap)
                    if len(choice_elements):
                        # Optional check -- skip it.
                        continue
                    # Skip optional checks with minOccurs="0".
                    is_optional = True
                    min_occurs = content.xpath(f'//xs:element[@name="{context}"]/@minOccurs',
                                               namespaces=content.nsmap)
                    for occur_attrib in min_occurs:
                        # Mandatory check but its context was not found.
                        if occur_attrib != '0':
                            is_optional = False
                            break
                    # Mandatory check but its context was not found: error.
                    if not is_optional:
                        raise ContextError(context, self.filename)
                    # NOTE(review): when the context is missing but deemed
                    # optional (minOccurs all "0" or not declared), control
                    # falls through and the asserts below are still
                    # collected -- confirm this is intended.

                for sch_assert in rule:
                    for error_node in sch_assert:
                        error = self._get_error(error_node)
                        assert_list.append({
                            'name': name,
                            'assert': sch_assert.attrib['test'],
                            'context': context,
                            'error': error
                        })
    return assert_list
def parse_bill(self, bill_url, session, chamber):
    """Scrape a single bill page and return the populated bill object.

    Locates the bill's status page via its "status of" / "Detailed
    Information (status)" link, falls back to the LAWS search URL when
    no link yields a bill, then attaches HTML and PDF versions and
    records the status URL as a source. Returns None on skip or parse
    failure.
    """
    # Temporarily skip the differently-formatted house budget bill.
    if "/2011/billhtml/hb0002.htm" in bill_url.lower():
        return
    bill = None
    try:
        doc = lxml.html.fromstring(self.urlopen(bill_url))
    except XMLSyntaxError as e:
        self.logger.warning("Got %r while parsing %r" % (e, bill_url))
        return
    bill_page = ElementTree(doc)

    for anchor in bill_page.findall("//a"):
        if anchor.text_content().startswith("status of") or anchor.text_content().startswith(
            "Detailed Information (status)"
        ):
            # Strip stray CR/LF characters embedded in the href.
            status_url = anchor.attrib["href"].replace("\r", "").replace("\n", "")
            bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)
    if bill is None:
        # No bill was found. Maybe something like HB0790 in the 2005 session?
        # We can search for the bill metadata.
        page_name = bill_url.split("/")[-1].split(".")[0]
        bill_type = page_name[0:2]
        bill_number = page_name[2:]
        # Two-digit year expected by the LAWS search URL.
        laws_year = self.metadata["session_details"][session]["years"][0] % 100

        status_url = self.search_url_template % (laws_year, bill_type, bill_number)
        bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)

    # Get versions on the detail page.
    versions = [a["action"] for a in bill["actions"]]
    versions = [a for a in versions if "Version Available" in a]
    if not versions:
        version_name = "Introduced"
    else:
        version = versions.pop()
        if "New Version" in version:
            version_name = "Amended"
        elif "Enrolled" in version:
            version_name = "Enrolled"
        # NOTE(review): if the last 'Version Available' action is neither
        # 'New Version' nor 'Enrolled', version_name is left unbound and
        # add_version below raises UnboundLocalError -- confirm those are
        # the only action texts that occur.
        self.add_other_versions(bill)

    # Add html.
    bill.add_version(version_name, bill_url, mimetype="text/html")

    # Add pdf.
    url = set(bill_page.xpath('//a/@href[contains(., "BillPdf")]')).pop()
    bill.add_version(version_name, url, mimetype="application/pdf")

    # Add status url as a source.
    bill.add_source(status_url)

    return bill
def remove_circle(tree: etree.ElementTree):
    """Delete the first SVG <circle> whose id attribute is "circle"."""
    svg_root = tree.getroot()
    for candidate in tree.findall("{http://www.w3.org/2000/svg}circle"):
        if candidate.attrib.get('id') == "circle":
            svg_root.remove(candidate)
            break
def dict_to_xml(obj):
    """
    Return a xml representation of the given dictionary:
    1. keys of the dictionary become sublements.
    2. if a value is a list, then key is a set of sublements.
    3. keys starting with '@' became an attribute.

    The top-level dictionary must contain a single entry corresponding
    to the root element, e.g.::

        {'duck': {'name': 'Donald',
                  'species': {'@href': 'http://en.wikipedia.org/wiki/Pekin_duck'}}}

    becomes::

        <?xml version="1.0" encoding="UTF-8"?>
        <duck>
          <name>Donald</name>
          <species href="http://en.wikipedia.org/wiki/Pekin_duck" />
        </duck>

    Returns None for a falsy `obj`; otherwise a UTF-8 bytes document.
    """
    if not obj:
        return
    # next(iter(...)) instead of popitem(): popitem() mutated the
    # caller's dictionary, emptying it as a side effect.
    key, value = next(iter(obj.items()))
    root = etree.Element(key)
    element_for_value(value, root)
    return (b'<?xml version="1.0" encoding="UTF-8"?>' +
            etree.tostring(root, encoding='utf-8'))
def xml(data, xmlFileName):
    """Write a transcript dictionary to `xmlFileName` as transcript XML.

    `data` must provide 'id' (program id) and 'turn' (objects with
    .text, .startTime, .endTime and .speaker); an optional 'speaker'
    set seeds the known speaker names. One <segment> is emitted per
    turn and one <element type="word"> per whitespace token.
    """
    from lxml.etree import ElementTree, Element, SubElement, Comment, tostring
    from collections import OrderedDict

    xsi = 'http://www.w3.org/2001/XMLSchema-instance'
    noNamespaceSchemaLocation = "{%s}noNamespaceSchemaLocation" % xsi
    doc = Element('transcript', {noNamespaceSchemaLocation: 'transcript_new.xsd'})

    head = SubElement(doc, 'head')
    SubElement(head, 'recording')
    annotations = SubElement(head, 'annotations')
    annotation_id = 'transcript_manual'
    SubElement(annotations, 'annotation', {'id': annotation_id})
    speakers = SubElement(head, 'speakers')
    speakerSet = data['speaker'] if 'speaker' in data else set()
    body = SubElement(doc, 'body')
    segments = SubElement(body, 'segments', {'annotation_id': annotation_id})
    programId = data['id']

    wordCount = 0
    for i, e in enumerate(data['turn']):
        tokens = e.text.split()
        startTime = e.startTime
        endTime = e.endTime
        # NOTE(review): raises ZeroDivisionError for a turn with no
        # tokens -- confirm empty turns cannot occur upstream.
        averageWordDuration = (e.endTime - e.startTime) / len(tokens)
        speakerName = e.speaker if e.speaker else "{}_unknown_{}".format(
            programId, i)
        if speakerName not in speakerSet:
            SubElement(
                speakers, 'speaker',
                OrderedDict([('id', speakerName), ('name', speakerName)]))
            # Record the *name*, not the Element: previously the Element
            # object was added, so repeat speakers were never recognized
            # and produced duplicate <speaker> entries.
            speakerSet.add(speakerName)
        segment = SubElement(
            segments, 'segment',
            OrderedDict([('id', "{}_utt_{}".format(programId, i)),
                         ('starttime', str(startTime)),
                         ('endtime', str(endTime)),
                         ('AWD', "{:2f}".format(averageWordDuration)),
                         ('PMER', "0.0"), ('WMER', "0.0"),
                         ('who', speakerName)]))
        for word in tokens:
            element = SubElement(
                segment, 'element',
                OrderedDict([('id', "{}_w{}".format(programId, wordCount)),
                             ('type', 'word')]))
            element.text = word
            wordCount += 1

    tree = ElementTree(doc)
    tree.write(xmlFileName,
               encoding='utf-8',
               xml_declaration=True,
               pretty_print=True)
def load(pom_path):
    """Parse the POM file at `pom_path` (comments and CDATA stripped),
    raising PomLoadingException when it cannot be read or parsed."""
    pom_parser = XMLParser(remove_comments=True, strip_cdata=True)
    try:
        document = ElementTree().parse(pom_path, parser=pom_parser)
    except IOError:
        raise PomLoadingException("Cannot read file {f}".format(f=pom_path))
    if document is None:
        raise PomLoadingException("Failed to load {f}".format(f=pom_path))
    return document
def produce_settings_file(self, printer):
    """Render an Avahi .service file advertising `printer` for AirPrint.

    Fills XML_TEMPLATE with the printer's name, port, host and TXT
    records, then writes it to <prefix>[<source>-]<name>.service
    (inside self.directory when configured).
    """
    printer_name = printer['name']
    tree = ElementTree()
    # The template carries pretty-printing whitespace; strip it so the
    # serializer can re-indent cleanly.
    tree.parse(
        StringIO(
            XML_TEMPLATE.replace('\n', '').replace('\r', '').replace('\t', '')))

    name_node = tree.find('name')
    name_node.text = 'AirPrint %s @ %%h' % printer_name

    service_node = tree.find('service')
    port_node = service_node.find('port')
    port_node.text = '%d' % printer['port']

    host = printer['host']
    if host:
        if self.dnsdomain:
            # Swap the host's domain for the configured DNS domain.
            pair = host.rsplit('.', 1)
            if len(pair) > 1:
                host = '.'.join((pair[0], self.dnsdomain))
        service_node.append(self.new_node('host-name', host))

    txt = printer['txt']
    for key in txt:
        # adminurl records are emitted only when explicitly enabled.
        if self.adminurl or key != 'adminurl':
            service_node.append(
                self.new_txtrecord_node('%s=%s' % (key, txt[key])))

    # `in` replaces dict.has_key(), which was removed in Python 3.
    source = printer['SOURCE'] if 'SOURCE' in printer else ''

    fname = '%s%s%s.service' % (self.prefix,
                                '%s-' % source if len(source) > 0 else '',
                                printer_name)
    if self.directory:
        fname = os.path.join(self.directory, fname)

    # Context manager so the file is closed even if serialization fails
    # (previously a failed write leaked the handle).
    with open(fname, 'w') as f:
        if etree:
            # lxml available: it handles declaration and indentation.
            tree.write(f,
                       pretty_print=True,
                       xml_declaration=True,
                       encoding="UTF-8")
        else:
            # Fallback: serialize via minidom, injecting the Avahi DOCTYPE.
            xmlstr = tostring(tree.getroot())
            doc = parseString(xmlstr)
            dt = minidom.getDOMImplementation('').createDocumentType(
                'service-group', None, 'avahi-service.dtd')
            doc.insertBefore(dt, doc.documentElement)
            doc.writexml(f)

    if self.verbose:
        src = source if len(source) > 0 else 'unknown'
        sys.stderr.write('Created from %s: %s%s' % (src, fname, os.linesep))
def from_wordpress_xml_file(cls, filename):
    """Build a WordpressImporter from a WordPress export XML file.

    Extracts the original blog address, converts every valid post and
    page element, sorts posts by date, and returns the importer.
    """
    x = ElementTree(file=filename)
    new_importer = WordpressImporter()
    new_importer.get_original_blog_address(x)
    # iter() replaces getiterator(), which is deprecated in lxml and was
    # removed from the stdlib ElementTree in Python 3.9.
    new_importer.posts = [new_importer.convert_posts(post)
                          for post in x.iter()
                          if cls.is_valid_post(post)]
    new_importer.pages = [new_importer.convert_posts(page)
                          for page in x.iter()
                          if cls.is_valid_page(page)]
    cls.sort_posts_by_date(new_importer.posts)
    return new_importer
def parse_bill(self, bill_url, session, chamber):
    """Scrape a single bill page and return the populated bill object.

    Locates the bill's status page via its "status of" / "Detailed
    Information (status)" link, falls back to the LAWS search URL when
    no link yields a bill, then attaches HTML and PDF versions and
    records the status URL as a source. Returns None when skipped.
    """
    # Temporarily skip the differently-formatted house budget bill.
    if '/2011/billhtml/hb0002.htm' in bill_url.lower():
        return
    bill = None
    bill_page = ElementTree(lxml.html.fromstring(self.urlopen(bill_url)))

    for anchor in bill_page.findall('//a'):
        if (anchor.text_content().startswith('status of') or
                anchor.text_content().startswith('Detailed Information (status)')):
            # Strip stray CR/LF characters embedded in the href.
            status_url = anchor.attrib['href'].replace("\r", "").replace("\n", "")
            bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)
    if bill is None:
        # No bill was found. Maybe something like HB0790 in the 2005 session?
        # We can search for the bill metadata.
        page_name = bill_url.split("/")[-1].split(".")[0]
        bill_type = page_name[0:2]
        bill_number = page_name[2:]
        # Two-digit year expected by the LAWS search URL.
        laws_year = self.metadata['session_details'][session]['years'][0] % 100

        status_url = self.search_url_template % (laws_year, bill_type, bill_number)
        bill = self.parse_bill_status_page(status_url, bill_url, session, chamber)

    # Get versions on the detail page.
    versions = [a['action'] for a in bill['actions']]
    versions = [a for a in versions if 'Version Available' in a]
    if not versions:
        version_name = 'Introduced'
    else:
        version = versions.pop()
        if 'New Version' in version:
            version_name = 'Amended'
        elif 'Enrolled' in version:
            version_name = 'Enrolled'
        # NOTE(review): if the last 'Version Available' action is neither
        # 'New Version' nor 'Enrolled', version_name is left unbound and
        # add_version below raises UnboundLocalError -- confirm those are
        # the only action texts that occur.
        self.add_other_versions(bill)

    # Add html.
    bill.add_version(version_name, bill_url, mimetype='text/html')

    # Add pdf.
    url = set(bill_page.xpath('//a/@href[contains(., "BillPdf")]')).pop()
    bill.add_version(version_name, url, mimetype='application/pdf')

    # Add status url as a source.
    bill.add_source(status_url)

    return bill
def fetch_text_from_url(url):
    """Fetch a web page and return the concatenated text content of its
    h1-h4 headings and paragraphs, separated by blank lines."""
    request = urllib2.Request(url)
    # Change the User-Agent to avoid being blocked by Wikipedia;
    # downloading a couple of articles should not be abusive.
    request.add_header('User-Agent', 'pignlproc categorizer')
    html_content = urllib2.build_opener().open(request).read()
    parsed = ElementTree(lxml.html.document_fromstring(html_content))

    chunks = []
    for tag in ('h1', 'h2', 'h3', 'h4', 'p'):
        for node in parsed.findall('//' + tag):
            chunks.append(node.text_content())
    return "\n\n".join(chunks)
def exportProjectApplyButton_clicked_cb(self, button):
    """Display the dialog in order to export the current project
    when the user request it through the menu.

    Validates the chosen folder and filename, asks for confirmation
    before overwriting an existing file, writes the project's XML
    definition, and shows an error dialog on any failure.
    """
    logging.debug("Export project")
    try:
        selectedFolder = self.view.exportProjectFileChooserButton.get_current_folder()
        filename = self.view.exportProjectFilenameEntry.get_text()
        if selectedFolder is None:
            raise Exception(_("No directory selected"))
        elif filename is None or len(filename) == 0:
            raise Exception(_("No filename provided"))
        else:
            outputFilename = os.path.join(selectedFolder, filename)
            logging.debug("Output filename: {0}".format(outputFilename))
            # NOTE(review): `overwrite` is assigned but never read below.
            overwrite = True
            if os.path.exists(outputFilename):
                # Ask for confirmation before replacing an existing file.
                questionMsg = _("A file named \"{0}\" already exists. Do you want to replace it?").format(filename)
                dialog = Gtk.MessageDialog(self.view.exportProject,
                                           Gtk.DialogFlags.MODAL | Gtk.DialogFlags.DESTROY_WITH_PARENT,
                                           Gtk.MessageType.WARNING,
                                           Gtk.ButtonsType.NONE,
                                           questionMsg)
                dialog.format_secondary_text(_("The file already exists in \"{0}\". Replacing it will overwrite its contents.").format(selectedFolder))
                dialog.add_button(Gtk.STOCK_CANCEL, Gtk.ResponseType.CANCEL)
                dialog.add_button(_("Replace"), Gtk.ResponseType.YES)
                dialog.set_default_response(Gtk.ResponseType.YES)
                response = dialog.run()
                dialog.destroy()
                if response == Gtk.ResponseType.CANCEL:
                    self.view.destroy()
                    self.log.info("Export was cancelled")
                    return
            # Serialize the project definition and write it out.
            xmlDefinitionOfProject = self.project.generateXMLConfigFile()
            tree = ElementTree(xmlDefinitionOfProject)
            tree.write(outputFilename, pretty_print=True)
            self.view.destroy()
    # Python 2 except syntax -- this module targets Python 2.
    except Exception, e:
        self.view.showErrorMessage(_("An error occurred while exporting the project."))
        logging.warn("Error when exporting project: {0}".format(e))
def test_etree_from_file(self):
    """ElementTree(file=...) parses both namespaced and plain documents."""
    with open('sample.xml') as handle:
        tree = ElementTree(file=handle)
    root = tree.getroot()
    self.assertEqual('{http://example.tld}document', root.tag)
    self.assertEqual('x', root.prefix)
    self.assertTrue('x' in root.nsmap)

    with open('hello.xml') as handle:
        tree = ElementTree(file=handle)
    root = tree.getroot()
    self.assertEqual('hello', root.tag)
    self.assertEqual(None, root.prefix)
    self.assertEqual({}, root.nsmap)
def serialize_browse_layers(browse_layers, stream=None, pretty_print=False):
    """Serialize browse layer models into a <browseLayers> XML document.

    This variant also emits the contains_volumes flag. Writes the
    document to `stream` (a new StringIO when not given) with an XML
    declaration and UTF-8 encoding, and returns the stream.
    """
    if not stream:
        stream = StringIO()

    browse_layers_elem = Element(ns_cfg("browseLayers"), nsmap={"cfg": ns_cfg.uri})
    for browse_layer in browse_layers:
        bl_elem = SubElement(
            browse_layers_elem, ns_cfg("browseLayer"),
            attrib={"browseLayerId": browse_layer.id}
        )

        rgb = browse_layer.r_band, browse_layer.g_band, browse_layer.b_band
        # `len(filter(...))` was a Python-2-ism: under Python 3 filter()
        # returns an iterator and len() raises TypeError. all() expresses
        # the same "every band/bound is set" check on both versions.
        has_rgb = all(v is not None for v in rgb)
        ri = (browse_layer.radiometric_interval_min,
              browse_layer.radiometric_interval_max)
        has_ri = all(v is not None for v in ri)

        SubElement(bl_elem, ns_cfg("browseType")).text = browse_layer.browse_type
        SubElement(bl_elem, ns_cfg("title")).text = browse_layer.title
        if browse_layer.description is not None:
            SubElement(bl_elem, ns_cfg("description")).text = browse_layer.description
        SubElement(bl_elem, ns_cfg("grid")).text = browse_layer.grid
        SubElement(bl_elem, ns_cfg("browseAccessPolicy")).text = browse_layer.browse_access_policy
        SubElement(bl_elem, ns_cfg("hostingBrowseServerName")).text = ""
        rel_ds_elem = SubElement(bl_elem, ns_cfg("relatedDatasetIds"))
        for rel_ds_id in browse_layer.related_dataset_ids:
            SubElement(rel_ds_elem, ns_cfg("datasetId")).text = rel_ds_id
        SubElement(bl_elem, ns_cfg("containsVerticalCurtains")).text = \
            "true" if browse_layer.contains_vertical_curtains else "false"
        SubElement(bl_elem, ns_cfg("contains_volumes")).text = \
            "true" if browse_layer.contains_volumes else "false"
        if has_rgb:
            SubElement(bl_elem, ns_cfg("rgbBands")).text = ",".join(map(str, rgb))
        if has_ri:
            ri_elem = SubElement(bl_elem, ns_cfg("radiometricInterval"))
            SubElement(ri_elem, ns_cfg("min")).text = str(ri[0])
            SubElement(ri_elem, ns_cfg("max")).text = str(ri[1])
        SubElement(bl_elem, ns_cfg("highestMapLevel")).text = str(browse_layer.highest_map_level)
        SubElement(bl_elem, ns_cfg("lowestMapLevel")).text = str(browse_layer.lowest_map_level)
        SubElement(bl_elem, ns_cfg("timeDimensionDefault")).text = str(browse_layer.timedimension_default)
        SubElement(bl_elem, ns_cfg("tileQueryLimit")).text = str(browse_layer.tile_query_limit)

    # TODO: encoding
    et = ElementTree(browse_layers_elem)
    et.write(stream, pretty_print=pretty_print, encoding="utf-8",
             xml_declaration=True)
    return stream
def saveConfigFile(self, workspace):
    """Write this project's XML configuration file into the workspace,
    creating the project directory on first save."""
    projectPath = os.path.join(workspace.getPath(), self.getPath())
    projectFile = os.path.join(projectPath, Project.CONFIGURATION_FILENAME)
    logging.info("Save the config file of project {0} in {1}".format(self.getName(), projectFile))

    # Create the project directory if it does not exist yet.
    if not os.path.exists(projectPath):
        logging.info("Creation of the directory: {0}".format(projectPath))
        os.mkdir(projectPath)

    # Generate the XML configuration and persist it.
    ElementTree(self.generateXMLConfigFile()).write(projectFile, pretty_print=True)
def __init__(self, name):
    """Parse the validation-status XML file `name`: store the label
    names/descriptions and index current-generation objects by URI."""
    self.name = name
    tree = ElementTree(file=name)

    label_pairs = tuple((elt.tag.strip(), elt.text.strip())
                        for elt in tree.find("labels"))
    self.labels = tuple(tag for tag, _ in label_pairs)
    self.descrs = dict(label_pairs)
    self.date = tree.getroot().get("date")

    for elt in tree.findall("validation_status"):
        status = elt.get("status")
        uri = elt.text.strip()
        # Ignore transfer bookkeeping entries and non-current generations.
        if status.startswith("rsync_transfer_") or elt.get("generation") != "current":
            continue
        if uri not in self:
            self[uri] = Object(self, uri)
        self[uri].add(status)
def loadWorkspace_0_1(workspacePath, workspaceFile):
    """Load a version 0.1 workspace definition from `workspaceFile`.

    Parses the workspace XML, reads the configured paths (traces,
    logging, prototypes), restores previously imported traces and
    re-references the workspace's projects. Returns the Workspace.
    """
    # Parse the XML document as a 0.1 version.
    tree = ElementTree()
    tree.parse(workspaceFile)
    xmlWorkspace = tree.getroot()

    wsName = xmlWorkspace.get('name', 'none')
    wsCreationDate = TypeConvertor.xsdDatetime2PythonDatetime(xmlWorkspace.get('creation_date'))

    # Parse the configuration to retrieve the main paths. The repeated
    # identical find() calls are hoisted into locals, and `!= None`
    # comparisons are replaced with identity checks.
    xmlWorkspaceConfig = xmlWorkspace.find("{" + WORKSPACE_NAMESPACE + "}configuration")
    pathOfTraces = xmlWorkspaceConfig.find("{" + WORKSPACE_NAMESPACE + "}traces").text

    pathOfLogging = None
    xmlLogging = xmlWorkspaceConfig.find("{" + WORKSPACE_NAMESPACE + "}logging")
    if xmlLogging is not None and xmlLogging.text is not None and len(xmlLogging.text) > 0:
        pathOfLogging = xmlLogging.text

    pathOfPrototypes = None
    xmlPrototypes = xmlWorkspaceConfig.find("{" + WORKSPACE_NAMESPACE + "}prototypes")
    if xmlPrototypes is not None and xmlPrototypes.text is not None and len(xmlPrototypes.text) > 0:
        pathOfPrototypes = xmlPrototypes.text

    lastProject = None
    xmlProjects = xmlWorkspace.find("{" + WORKSPACE_NAMESPACE + "}projects")
    if xmlProjects is not None:
        if xmlProjects.get("last", "none") != "none":
            lastProject = xmlProjects.get("last", "none")

    # Instantiation of the workspace.
    workspace = Workspace(wsName, wsCreationDate, workspacePath, pathOfTraces, pathOfLogging, pathOfPrototypes)

    # Load the already imported traces.
    xmlTraces = xmlWorkspace.find("{" + WORKSPACE_NAMESPACE + "}traces")
    if xmlTraces is not None:
        for xmlTrace in xmlTraces.findall("{" + WORKSPACE_NAMESPACE + "}trace"):
            trace = ImportedTrace.loadTrace(xmlTrace, WORKSPACE_NAMESPACE, COMMON_NAMESPACE, "0.1", workspace.getPathOfTraces())
            if trace is not None:
                workspace.addImportedTrace(trace)

    # Reference the projects.
    if xmlProjects is not None:
        for xmlProject in xmlWorkspace.findall("{" + WORKSPACE_NAMESPACE + "}projects/{" + WORKSPACE_NAMESPACE + "}project"):
            project_path = xmlProject.get("path")
            workspace.referenceProject(project_path)
            if project_path == lastProject and lastProject is not None:
                workspace.referenceLastProject(lastProject)

    return workspace
def add_bill_versions(self, bill, index_url):
    """Attach HTML bill versions found on the bill-index page.

    This method won't pick up bill versions where the bill is published
    exclusively in PDF. See 2009 HB 645 for a sample.
    """
    index_page = ElementTree(lxml.html.fromstring(self.urlopen(index_url)))
    tokens = bill["bill_id"].split(" ")
    # Raw string: "\_" was an invalid escape sequence (DeprecationWarning
    # on modern Python) and is equivalent to a plain underscore.
    bill_regex = re.compile(r"%s0*%s_" % (tokens[0], tokens[1]))
    for anchor in index_page.findall("//a"):
        file_name = anchor.text_content()
        if bill_regex.match(file_name) is None:
            continue
        # e.g. "HB0002_x.htm" -> the version token sits between '_' and '.'
        version = file_name[file_name.find("_") + 1 : file_name.find(".")]
        # 'x' marks the final version; anything else is an interim version.
        version_title = "Final Version" if version == "x" else "Version %s" % version
        version_url = index_url[0 : index_url.find("bills") - 1] + anchor.attrib["href"]
        bill.add_version(version_title, version_url)