def __init__(self):
    self.__conf = Configure()
    self.__associations = Associations()
    self.__sites = Sites()
    xReader = XMLReader()
    xParser = XMLParser()

    # Search configuration
    confTree = xReader.getTree('xml/conf.xml')
    if confTree is None:
        exit()
    searchParams = xParser.getSearchParams(confTree)
    searchSites = xParser.getSearchSites(confTree)
    pagesToSearch = xParser.getPagesToSearch(confTree)
    self.masterInspectionPath = xParser.getMIXML(confTree)
    self.__conf.setParams(searchSites, searchParams, pagesToSearch)

    # Keyword and avoid lists (normal and filtered variants)
    keywordTree = xReader.getTree('xml/keywords.xml')
    fKeywordTree = xReader.getTree('xml/f_keywords.xml')
    if keywordTree is None or fKeywordTree is None:
        exit()
    keywords = xParser.getKeywords(keywordTree)
    fKeywords = xParser.getKeywords(fKeywordTree)
    avoids = xParser.getAvoids(keywordTree)
    fAvoids = xParser.getAvoids(fKeywordTree)
    self.__associations.setParams(keywords, avoids, fKeywords, fAvoids)

    # Known good/bad site lists
    sitesTree = xReader.getTree('xml/sites.xml')
    if sitesTree is None:
        exit()
    goodSites, badSites = xParser.getSites(sitesTree)
    self.__sites.setParams(goodSites, badSites)
def getInspectionsStr(self, xml):
    xReader = XMLReader()
    xParser = XMLParser()
    tree = xReader.getTree(xml)
    XMLInspections = []
    if tree is None:
        print(xml, "Failed to read.")
        return XMLInspections  # always return a list, even on failure
    link, score, url, fil, ID = xParser.getInspectionDataWithId(tree)
    for i in range(len(link)):
        XMLInspections.append(Inspection(link[i], score[i], url[i], fil[i], ID[i]))
    return XMLInspections
def getInspections(self, xmls):
    xReader = XMLReader()
    xParser = XMLParser()
    XMLInspections = []
    for ind, xml in enumerate(xmls):
        tree = xReader.getTree(xml)
        if tree is None:
            print(ind, xml, "Failed to read.")
            continue
        link, score, url, fil = xParser.getInspectionData(tree)
        for i in range(len(link)):
            XMLInspections.append(Inspection(link[i], score[i], url[i], fil[i]))
    return XMLInspections
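# Usage sketch for the loaders above ("InspectionStore" is a hypothetical name
# for the class that holds them; the XML paths are illustrative):
#
#   store = InspectionStore()
#   for insp in store.getInspections(['xml/run1.xml', 'xml/run2.xml']):
#       print(insp.url, insp.score)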
def __init__(self, vote, voteId, skip=False):
    if skip:
        return
    self.vote = vote
    self.voteId = voteId
    xReader = XMLReader()
    xParser = XMLParser()
    confTree = xReader.getTree('xml/conf.xml')
    if confTree is None:
        print('Abort. Failed to read xml/conf.xml')
        exit()
    self.masterInspectionPath = xParser.getMIXML(confTree)
def updateSitesXMl(self):
    xReader = XMLReader()
    xParser = XMLParser()
    xWriter = XMLWriter()
    tree = xReader.getTree('xml/sites.xml')
    gdSites, bdSites = xParser.getSites(tree)
    # Find the inspection that matches the voted ID
    data = None
    for obj in self.XMLInspections:
        if obj.ID == self.voteId:
            data = obj
            break
    if data is None:
        return  # no matching inspection; nothing to record
    if self.vote == "up":
        gdSites.append(data.url)
    else:
        bdSites.append(data.url)
    xWriter.writeSitesXML(gdSites, bdSites, 'xml/sites.xml')
def storeWords(self):
    self.wl = WordList()
    xReader = XMLReader()
    xParser = XMLParser()
    # Load previously stored words, if any
    if xReader.checkIfExistsQuiet('xml/words.xml'):
        tree = xReader.getTree('xml/words.xml')
        wordAvg, avgRatio = xParser.getGeneralFromWords(tree)
        self.wl = xParser.getWords(tree)
    # Count this vote as useful (up) or useless (down)
    usf = 1 if self.vote == "up" else 0
    usl = 0 if self.vote == "up" else 1
    for obj in self.XMLInspections:
        if obj.ID != self.voteId:
            continue
        pl = PageLoader(obj.fil)
        if not pl.isReadable():
            print('Abort. File not readable:', obj.fil)
            exit()
        pl.read()
        # Keep only purely alphanumeric words
        patt = "^[a-zA-Z0-9]*$"
        pl.linkWords = self.removeListElesNotPatterned(patt, pl.linkWords)
        pl.titleWords = self.removeListElesNotPatterned(patt, pl.titleWords)
        pl.headerWords = self.removeListElesNotPatterned(patt, pl.headerWords)
        pl.specialWords = self.removeListElesNotPatterned(patt, pl.specialWords)
        pl.normalWords = self.removeListElesNotPatterned(patt, pl.normalWords)
        for group in (pl.linkWords, pl.titleWords, pl.headerWords,
                      pl.specialWords, pl.normalWords):
            for word in group:
                self.wl.append(word, usf, usl)
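# Usage sketch for the vote-handling class above ("Voter" is a hypothetical
# name; the inspection ID is illustrative). self.XMLInspections is assumed to
# be populated from the master inspection file, e.g. via getInspectionsStr:
#
#   voter = Voter(vote="up", voteId="42")
#   voter.XMLInspections = voter.getInspectionsStr(voter.masterInspectionPath)
#   voter.updateSitesXMl()   # record the voted URL in xml/sites.xml
#   voter.storeWords()       # fold the voted page's words into xml/words.xml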
class Parser:
    def __init__(self):
        self.__parser = XMLParser(global_define.XML_NODE_NAME,
                                  global_define.XML_TAG_NAME_LIST)

    def parse(self, file_path):
        xml_dicts = None
        try:
            xml_dicts = self.__parser.parse(file_path)
        except Exception as e:  # Python 3 syntax; was Python 2 `except Exception, e`
            logger.error("%s file parse failed. [%s]" % (file_path, e))
        return xml_dicts
def time_process(data_file):
    curr_time = dt.datetime.now()
    fobj = XMLParser(data_file, curr_time)
    lim = fobj.find_oldest_time()
    # Step backwards through time until the oldest entry in the file
    while curr_time > lim:
        curr_time -= TIME_INCR
        print('running time analysis for ' + str(curr_time.date()))
        fobj.update_time(curr_time)
        d = fobj.parse_to_dict()
        if d:
            net = NetworkParser(d)
            print("Analyzing File " + data_file + ' at time ' + str(curr_time.date()))
            na = NetworkAnalysis(net.G, os.path.basename(data_file),
                                 output_path, curr_time.date())
            basic = na.d3dump(public_out_path, str(curr_time.date()))
            public_data_output = public_data + na.fileName + "/"
            if generate_data:
                # write out decentralized results
                na.write_permanent_data_json(public_data_output,
                                             str(curr_time.date()), basic)
    print("Completed Analyzing: " + data_file)
def get_parser(filename):
    parsers = []
    parsers.append(PlaintextParser(filename))
    try:
        parsers.append(LineParser(filename))
    except ValueError:
        pass
    parsers.append(XMLParser(filename))
    parsers.append(CtmParser(filename))
    for parser in parsers:
        if parser.wants_this_file():
            return parser
    return None
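# Usage sketch: the first parser whose wants_this_file() returns True wins.
# The filename is illustrative.
#
#   parser = get_parser('session01.xml')
#   if parser is None:
#       raise SystemExit('no parser accepts this file')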
def main(arg):
    try:
        if len(arg) != 1:
            print("Must be only one parameter - path to xml-file.")
            return 1
        xml_file = arg[0]
        if not os.path.isfile(xml_file):
            print("Invalid file - {}.".format(xml_file))
            return 1
        with open(xml_file, 'r') as f:
            buffer = f.read()
        Main.print_xml_tree(XMLParser(buffer).get_iterator(),
                            Main.FOUR_SPACES)
        return 0  # explicit success exit code
    except Exception as exp:
        print("An error occurred: " + str(exp))
        return -1
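# Entry-point sketch for the CLI above, assuming main() is importable at
# module level (adjust to Main.main if it lives on the Main class):
if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv[1:]))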
def process_file(data_file):
    curr_time = get_time()
    # Parse into network
    d = XMLParser(data_file, get_time()).parse_to_dict()
    net = NetworkParser(d)
    # Graph analysis
    output("Analyzing File " + data_file)
    na = NetworkAnalysis(net.G, os.path.basename(data_file), output_path)
    na.outputBasicStats()
    na.outputNodesAndEdges()
    # na.nodeRemoval()
    basic = na.d3dump(public_out_path, str(curr_time))
    # Run decentralized search
    if decentralized_search_settings["run_decentralized_search"]:
        hierarchyG = net.G.copy()
        category_hierarchy = CategoryBasedHierarchicalModel(
            hierarchyG,
            similarity_matrix_type=category_hierarchical_model_settings[
                "similarity_matrix_type"],
            max_branching_factor_root=category_hierarchical_model_settings[
                "max_branching_factor_root"])
        category_hierarchy.build_hierarchical_model()
        decentralized_search_model = HierarchicalDecentralizedSearch(
            hierarchyG, category_hierarchy.hierarchy, na,
            detailed_print=decentralized_search_settings["detailed_print"],
            hierarchy_nodes_only=decentralized_search_settings[
                "hierarchy_nodes_only"],
            apply_weighted_score=decentralized_search_settings[
                "apply_weighted_score"])
        n_found, n_missing, av_path_len, av_unique_nodes, path_lengths_deciles = \
            decentralized_search_model.run_decentralized_search(
                1000, decentralized_search_settings["widen_search"],
                decentralized_search_settings["plots"])
        basic.update({
            "decentralized_num_paths_found": n_found,
            "decentralized_num_paths_missing": n_missing,
            "decentralized_average_decentralized_path_length": av_path_len,
            "decentralized_average_num_unique_nodes": av_unique_nodes,
            "hierarchy_num_nodes":
                (len(category_hierarchy.hierarchy.nodes()) -
                 len(category_hierarchy.ranked_categories)),
            "hierarchy_num_cat_nodes": len(category_hierarchy.ranked_categories),
            "hierarchy_num_levels": category_hierarchy.num_hierarchy_levels
        })
        basic["hierarchy_ratio_cat_nodes"] = (
            basic["hierarchy_num_cat_nodes"] / basic["hierarchy_num_nodes"])
        path_lengths_deciles_dict = {}
        for i in range(len(path_lengths_deciles)):
            path_lengths_deciles_dict[
                "path_length_" + str((i + 1) * 10) + "_percentile"] = \
                path_lengths_deciles[i]
        basic.update(path_lengths_deciles_dict)
        random_search_model = RandomSearch(net.G, na)
        n_found, n_missing, av_path_len, av_unique_nodes = \
            random_search_model.run_search(
                1000, decentralized_search_settings["widen_search"],
                decentralized_search_settings["plots"])
        basic.update({
            "random_num_paths_found": n_found,
            "random_num_paths_missing": n_missing,
            "random_average_decentralized_path_length": av_path_len,
            "random_average_num_unique_nodes": av_unique_nodes
        })
    if generate_data:
        # write out decentralized results
        na.write_permanent_data_json(public_data, basic)
    # na.generateDrawing()
    output("Completed Analyzing: " + data_file)
def time_process(data_file):
    curr_time = dt.datetime.now()
    fobj = XMLParser(data_file, curr_time)
    lim = fobj.find_oldest_time()
    # Step backwards through time until the oldest entry in the file
    while curr_time > lim:
        curr_time -= TIME_INCR
        print('running time analysis for ' + str(curr_time))
        fobj.update_time(curr_time)
        d = fobj.parse_to_dict()
        if d:
            net = NetworkParser(d)
            output("Analyzing File " + data_file + ' at time ' + str(curr_time))
            na = NetworkAnalysis(net.G, os.path.basename(data_file),
                                 output_path, curr_time)
            basic = na.d3dump(public_out_path, str(curr_time))
            # Run decentralized search
            try:
                if decentralized_search_settings["run_decentralized_search"]:
                    hierarchyG = net.G.copy()
                    category_hierarchy = CategoryBasedHierarchicalModel(
                        hierarchyG,
                        similarity_matrix_type=category_hierarchical_model_settings[
                            "similarity_matrix_type"],
                        max_branching_factor_root=category_hierarchical_model_settings[
                            "max_branching_factor_root"])
                    category_hierarchy.build_hierarchical_model()
                    decentralized_search_model = HierarchicalDecentralizedSearch(
                        hierarchyG, category_hierarchy.hierarchy, na,
                        detailed_print=decentralized_search_settings[
                            "detailed_print"],
                        hierarchy_nodes_only=decentralized_search_settings[
                            "hierarchy_nodes_only"],
                        apply_weighted_score=decentralized_search_settings[
                            "apply_weighted_score"])
                    n_found, n_missing, av_path_len, av_unique_nodes, \
                        path_lengths_deciles = \
                        decentralized_search_model.run_decentralized_search(
                            1000, decentralized_search_settings["widen_search"],
                            decentralized_search_settings["plots"])
                    basic.update({
                        "decentralized_num_paths_found": n_found,
                        "decentralized_num_paths_missing": n_missing,
                        "decentralized_average_decentralized_path_length":
                            av_path_len,
                        "decentralized_average_num_unique_nodes": av_unique_nodes,
                        "hierarchy_num_nodes":
                            (len(category_hierarchy.hierarchy.nodes()) -
                             len(category_hierarchy.ranked_categories)),
                        "hierarchy_num_levels":
                            category_hierarchy.num_hierarchy_levels
                    })
                    path_lengths_deciles_dict = {}
                    for i in range(len(path_lengths_deciles)):
                        path_lengths_deciles_dict[
                            "path_length_" + str((i + 1) * 10) + "_percentile"] = \
                            path_lengths_deciles[i]
                    basic.update(path_lengths_deciles_dict)
                    random_search_model = RandomSearch(net.G, na)
                    n_found, n_missing, av_path_len, av_unique_nodes = \
                        random_search_model.run_search(
                            1000, decentralized_search_settings["widen_search"],
                            decentralized_search_settings["plots"])
                    basic.update({
                        "random_num_paths_found": n_found,
                        "random_num_paths_missing": n_missing,
                        "random_average_decentralized_path_length": av_path_len,
                        "random_average_num_unique_nodes": av_unique_nodes
                    })
            except Exception:
                # Swallow per-snapshot failures so the time sweep continues
                pass
            if generate_data:
                # write out decentralized results
                na.write_permanent_data_json(public_data, basic,
                                             str(curr_time.date()))
    output("Completed Analyzing: " + data_file)
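# Assumed shape of the module-level settings read by process_file/time_process
# above. The keys are exactly those looked up in the code; the values are
# illustrative only.
decentralized_search_settings = {
    "run_decentralized_search": True,
    "detailed_print": False,
    "hierarchy_nodes_only": False,
    "apply_weighted_score": True,
    "widen_search": True,
    "plots": False,
}
category_hierarchical_model_settings = {
    "similarity_matrix_type": "cosine",  # illustrative value
    "max_branching_factor_root": 10,     # illustrative value
}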
from xml_parser import XML, XMLParser
import openpyxl

root_url = "/var/www/Ozone/Ozone/static/xml/"
filenames = ["mbt", "kbt"]
ext = ".xml"  # renamed from `format`, which shadows the builtin

for filename in filenames:
    parser = XMLParser(root_url + filename + ext)
    offers = parser.ozon_offers_list()
    titles = parser.get_all_keys()
    categories = parser.get_all_categories()
    # Order columns by how often each key appears in the first 100 offers
    titles.sort(key=lambda x: str(offers[0:100]).count(x), reverse=True)

    wb = openpyxl.Workbook()
    ws = wb.create_sheet("offers")
    # Header row: category in column 1, then one column per key
    for col_idx, key in enumerate(titles, start=2):
        ws.cell(1, col_idx, key)
    row = 2
    for offer in offers:
        ws.cell(row, 1, str(categories[int(offer["categoryId"])]))
        col = 2
        # The original snippet is truncated here; a plausible completion
        # (assumption) writes each offer's values in the sorted key order.
        for key in titles:
            ws.cell(row, col, str(offer.get(key, "")))
            col += 1
        row += 1
    wb.save(root_url + filename + ".xlsx")  # assumed output path