def text_creator(self, link):
    '''Download the XML behind the link, back it up and return its plain text.'''
    xml_file = getDataFrom(urljoin(self.handle_url, urlparse(link).path),
                           self.username, self.password)
    writeToFile(xml_file, link.split("/")[-1],
                os.path.join("rss_backup", "sail", "xmls"), ".xml", timestamp=True)
    tree = ElementTree.fromstring(xml_file)
    text = ""
    # concatenate the text of every <word> node, normalising whitespace
    for node in tree.getiterator('word'):
        if node.text is not None:
            text += " ".join(node.text.split()) + " "
    text = text.replace(" .", ".")
    text = " ".join(text.split())
    return text
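# A minimal, hedged sketch of the transcript format text_creator appears to consume
# (the sample document below is hypothetical, not taken from the real service):
# a tree of <word> elements whose texts are joined, with the space before "."
# collapsed and whitespace normalised.
def _example_words_to_text():
    from xml.etree import ElementTree
    sample = "<doc><word>Hello</word><word>world</word><word>.</word></doc>"
    tree = ElementTree.fromstring(sample)
    text = " ".join(node.text for node in tree.getiterator('word') if node.text)
    return " ".join(text.replace(" .", ".").split())   # -> 'Hello world.'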
def update_feed(self, feed, url):
    '''Split every new feed entry into one item per <strong> block on its page.'''
    super(RSS_XmlGeneratorDrfruehwein, self).update_feed(feed, url)
    new_entries = []
    timestamp = None
    if self.cache:
        timestamp = self.cache.loadFromCache(self.handle_url)
    for e in feed["entries"]:
        # skip entries already seen in a previous run
        if timestamp >= e.date_parsed:
            continue
        sleep(1)
        page = getDataFrom(e["link"], self.username, self.password)
        if page is None:
            continue
        soup = BeautifulSoup(page)
        div = soup.find(id='page')
        strongs = div.findAll("strong")
        # every <strong> starts a new (title, text) pair
        content = []
        for c in strongs:
            title = c.next
            text = c.next.next.next
            content.append([title, text])
        c = 0
        guid = e.id
        for title, text in content:
            new_item = copy.deepcopy(e)
            new_item["id"] = str(c) + ":" + guid
            new_item["title"] = title
            new_item["description"] = text
            new_entries.append(new_item)
            c += 1
    feed["entries"] = new_entries
    return
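# Hedged illustration of the markup update_feed assumes (the sample HTML below is
# hypothetical, not taken from the real site): every <strong> inside <div id="page">
# carries a title, and the matching body text sits three .next steps away in
# document order (title string -> separator tag -> text string).
def _example_strong_split():
    from BeautifulSoup import BeautifulSoup
    sample = '<div id="page"><strong>Headline</strong><br />Body text</div>'
    strong = BeautifulSoup(sample).find("strong")
    return strong.next, strong.next.next.next   # (u'Headline', u'Body text')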
def processSection(self, bsection):
    '''Process all pages in a forum section.'''
    actualPage = 1
    sectionLenght = None
    url = urljoin(self._url, bsection)
    while True:
        try:
            urll = url + "/" + str(actualPage)
            page = getDataFrom(urll, None, None)
            if page is None and sectionLenght is None:
                log.debug("none data, return")
                return
            elif page is None and sectionLenght is not None:
                # transient failure in the middle of a section: retry the same page
                log.debug("none data, continue")
                continue
            pageParser = PageParser(page)
            if not sectionLenght:
                # read the number of pages in this section from the first page
                sectionLenght = pageParser.getSectionLenght()
                log.debug("sectionLenght is %s", str(sectionLenght))
            itemList = pageParser.getListOfItems()
            self.processData(itemList)
            # add stationary data
            self.addItemToFinalList("source", "http://board.netdoktor.de/")
            self.addItemToFinalList("section", "netdoktor")
            self.addItemToFinalList("lang", "de")
            # save the collected items as XML
            self.createXmlandWrite(name=bsection.replace("/", "_"))
        except Exception as e:
            log.critical("%s", e)
            log.exception("Some exception in process section")
        if actualPage >= sectionLenght:
            return
        actualPage += 1
def updateContent(self, url):
    '''If new entries are found in the article, process them and queue them for saving.'''
    page = getDataFrom(url, None, None)
    if page is None:
        return False
    pageParser = PageParser(page)
    notCached = pageParser.getEntriesList()
    # keep only entries not seen before and remember them in the cache
    eToStore = self.pageCache.cacheAndReturnNew(url, notCached)
    for entry in eToStore:
        entry["link"] = url
    log.debug("ADD %d new entries", len(eToStore))
    self.dataToStore.extend(eToStore)
    return True
def process(self, url):
    '''Download a feed, process it and create the output XML.'''
    content = getDataFrom(url, self.username, self.password)
    self.was_error = False
    self.doc = Document()
    # parse the feed
    feed = feedparser.parse(content)
    # zero entries means there is nothing to do
    if len(feed.entries) <= 0:
        log.error("No entries found in %s, are you sure it is an RSS or Atom feed?", url)
        return None
    # add static data to the feed
    self.update_feed(feed, url)
    # create the root tag
    t_results = self.doc.createElement("results")
    self.doc.appendChild(t_results)
    item_cnt = 0
    timestamp = None
    if self.cache:
        timestamp = self.cache.loadFromCache(url)
    new_stamp = timestamp
    # walk over the parsed feed, mine the data and create the final xml
    for _c_ in range(len(feed.entries)):
        t_item = self.doc.createElement("item")
        try:
            # check the item publication date, save only newer items
            if new_stamp < feed.entries[_c_].date_parsed:
                new_stamp = feed.entries[_c_].date_parsed
            if timestamp >= feed.entries[_c_].date_parsed:
                continue
        except:
            # entries without a parsed date are always processed
            pass
        # walk over the configured xml tags and their value paths
        for tag, value in self.item_list.items():
            path, func, req = value
            data = None
            if path is not None:
                try:
                    f = feed
                    for key in path:
                        # _C_ is a placeholder for the entry counter
                        if key == "_C_":
                            f = f[_c_]
                        else:
                            f = f[key]
                    data = f
                except:
                    pass
            # data postprocessing
            if func is not None:
                data = func(data)
            if req and data is None:
                log.error("No data found for tag %s", tag)
                self.was_error = True
            if data is None:
                data = ""
            child = self.doc.createElement(tag)
            if isinstance(data, basestring):
                data = self.doc.createTextNode(toUnicode(data))
            if isinstance(data, Node):
                child.appendChild(data)
            elif isinstance(data, dict):
                # a dict is written out as attributes of the tag
                for attr_name, attr_value in data.items():
                    child.setAttribute(attr_name, attr_value)
            t_item.appendChild(child)
        t_results.appendChild(t_item)
        item_cnt += 1
    if item_cnt > 0:
        self.writeWrapper(urlparse(url).netloc.split(".")[-2])
        if self.cache:
            self.cache.storeToCache(url, new_stamp)
        if self.makeBackup:
            writeToFile(content, "RSS_Sail_",
                        os.path.join("rss_backup", "sail"), ".xml", timestamp=True)
    else:
        log.debug("Nothing new in feed - %s", url)
    return
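# Hedged sketch (the values below are hypothetical, not the real configuration):
# process() expects self.item_list to map an output tag name to a (path, func, req)
# tuple, where path is a key sequence into the parsed feed with "_C_" standing for
# the current entry index, func is an optional post-processing callable, and req
# marks the data as mandatory.
EXAMPLE_ITEM_LIST = {
    "title":       (["entries", "_C_", "title"], None, True),
    "description": (["entries", "_C_", "summary"], None, False),
    "published":   (["entries", "_C_", "published"], None, False),
}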