# -*- coding: utf-8 -*-
# Imports assumed by the methods below; the original module header is not
# part of this excerpt.
import logging
import os
from urlparse import urlparse
from xml.dom.minidom import Document, Node

import feedparser

log = logging.getLogger(__name__)


def createXmlandWrite(self, dataToStore, filename):
    '''Serialize the harvested records into a pretty-printed XML file.'''
    if not dataToStore:
        return
    doc = Document()
    results = doc.createElement("results")
    doc.appendChild(results)
    for record in dataToStore:
        item = doc.createElement("item")
        for k, d in record.items():
            tag = doc.createElement(k)
            if k == "author":
                # the author value gets an extra <name> wrapper element
                name_tag = doc.createElement("name")
                data = doc.createTextNode(d)
                name_tag.appendChild(data)
                tag.appendChild(name_tag)
            else:
                data = doc.createTextNode(toUnicode(d))
                tag.appendChild(data)
            item.appendChild(tag)
        results.appendChild(item)
    pretty_xml = doc.toprettyxml(indent=" ", encoding="UTF-8")
    writeToFile(pretty_xml, "scf_" + filename,
                "/mnt/minerva1/nlp/projects/spinn3r/solr_data/xjerab13",
                #"/mnt/minerva1/nlp/projects/blogs_download2/sail_data_out",
                #"e:\\tmp\\tmp\\conv",
                extension=".xml", timestamp=True)
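# The methods in this excerpt call a few project helpers that are defined
# elsewhere (toUnicode, writeToFile, getDataFrom). A minimal sketch of
# toUnicode and writeToFile follows, assuming the signatures implied by the
# call sites above and below; the project's real helpers may differ.
import time


def toUnicode(value):
    '''Best-effort conversion of an arbitrary value to a unicode string.'''
    if isinstance(value, unicode):
        return value
    if isinstance(value, str):
        # assume UTF-8 input, replacing undecodable bytes
        return value.decode("utf-8", "replace")
    return unicode(value)


def writeToFile(data, prefix, directory, extension=".xml", timestamp=False):
    '''Write data to directory/prefix[<timestamp>]<extension> (sketch).'''
    name = prefix
    if timestamp:
        # append a sortable timestamp to keep successive dumps apart
        name += time.strftime("%Y-%m-%d_%H-%M-%S")
    path = os.path.join(directory, name + extension)
    out = open(path, "wb")
    try:
        out.write(data)
    finally:
        out.close()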
def process(self, url):
    '''Download the feed, process it and create the XML output.'''
    content = getDataFrom(url, self.username, self.password)
    self.was_error = False
    self.doc = Document()
    # parse the feed
    feed = feedparser.parse(content)
    # zero entries means no new job appeared
    if not feed.entries:
        log.error("No entries found in %s, are you sure it is an RSS or Atom feed?", url)
        return None
    # add static data to the feed
    self.update_feed(feed, url)
    # create the root tag
    t_results = self.doc.createElement("results")
    self.doc.appendChild(t_results)
    item_cnt = 0
    timestamp = None
    if self.cache:
        timestamp = self.cache.loadFromCache(url)
    new_stamp = timestamp
    # walk over the parsed feed, mine the data and build the final xml
    for _c_ in range(len(feed.entries)):
        t_item = self.doc.createElement("item")
        try:
            # check the item's publication date, save only newer items
            if new_stamp < feed.entries[_c_].date_parsed:
                new_stamp = feed.entries[_c_].date_parsed
            if timestamp >= feed.entries[_c_].date_parsed:
                continue
        except AttributeError:
            # entry has no parsed date; keep it
            pass
        # walk over the xml tags and their values
        for tag, value in self.item_list.items():
            path, func, req = value
            data = None
            if path is not None:
                try:
                    f = feed
                    for key in path:
                        # _C_ acts as a counter, i.e. the current entry index
                        if key == "_C_":
                            f = f[_c_]
                        else:
                            f = f[key]
                    data = f
                except (KeyError, IndexError, AttributeError):
                    # path not present in this entry; leave data as None
                    pass
            # data postprocessing
            if func is not None:
                data = func(data)
            if req and data is None:
                log.error("No data found for tag %s", tag)
                self.was_error = True
            if data is None:
                data = ""
            child = self.doc.createElement(tag)
            if isinstance(data, basestring):
                data = self.doc.createTextNode(toUnicode(data))
            if isinstance(data, Node):
                child.appendChild(data)
            elif isinstance(data, dict):
                # a dict maps to attributes on the element
                for attr_name, attr_value in data.items():
                    child.setAttribute(attr_name, attr_value)
            t_item.appendChild(child)
        t_results.appendChild(t_item)
        item_cnt += 1
    if item_cnt > 0:
        # derive the output name from the feed's second-level domain
        self.writeWrapper(urlparse(url).netloc.split(".")[-2])
        if self.cache:
            self.cache.storeToCache(url, new_stamp)
        if self.makeBackup:
            writeToFile(content, "RSS_Sail_",
                        os.path.join("rss_backup", "sail"),
                        ".xml", timestamp=True)
    else:
        log.debug("Nothing new in feed - %s", url)
    return
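# process() above relies on getDataFrom to fetch the feed body. A minimal
# sketch assuming plain HTTP with optional basic auth; the real project
# helper may handle retries, encodings or other schemes.
import base64
import urllib2


def getDataFrom(url, username, password):
    '''Download url and return the response body (sketch).'''
    request = urllib2.Request(url)
    if username:
        # attach an HTTP basic-auth header when credentials are given
        token = base64.b64encode("%s:%s" % (username, password))
        request.add_header("Authorization", "Basic " + token)
    return urllib2.urlopen(request).read()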
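# For illustration, an item_list mapping as consumed by process(): each output
# tag maps to a (path, func, req) triple, where "path" is a key sequence into
# the parsed feed ("_C_" standing for the current entry index), "func" is an
# optional postprocessing callable and "req" marks mandatory fields. The
# concrete tags below are hypothetical, not the project's real configuration.
item_list = {
    "title": (["entries", "_C_", "title"], None, True),
    "link": (["entries", "_C_", "link"], None, True),
    "author": (["entries", "_C_", "author"], lambda v: v or "unknown", False),
}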