def analyze(self, feed_item, mapper):
    raw_feed = getDataFrom(feed_item.url, feed_item.logon, feed_item.password)
    self.full_feed = feedparser.parse(raw_feed)
    self.feed = self.full_feed.feed
    self.item = self.full_feed.entries[0]
    for key in mapper.funcs.keys():
        item = mapper.funcs[key]
        if item.text:
            item.text.found = True
            if item.text.functype == "buildin" and item.text.args != "":
                # Built-in funcs address a field as "<source>.<key>", e.g.
                # "item.title"; only mark it found if the key really exists.
                item.text.found = False
                args = item.text.args.split(".")
                if len(args) > 1:
                    val = getattr(self, args[0])
                    if args[1] in val:
                        item.text.found = True
        for atr in item.attr:
            atr.found = True
            if atr.functype == "buildin" and atr.args != "":
                atr.found = False
                args = atr.args.split(".")
                if len(args) > 1:
                    val = getattr(self, args[0])
                    if args[1] in val:
                        atr.found = True
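# A sketch of the mapper shape analyze() expects, inferred from the attribute
# accesses above (the real classes are defined elsewhere in this codebase, so
# the names here are illustrative only):
#
#   mapper.funcs = {
#       "title": <func>,          # <func>.text -> field or None
#   }                             # <func>.attr -> list of fields
#
# where each field carries .functype, .args and a writable .found flag, and
# "buildin" args of the form "item.title" / "feed.language" are resolved
# against the self.item / self.feed parsed above.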
def worker(self, job):
    # Download the feed, archive it under "download", and mark the job idle.
    job.status = "downloading"
    feed = getDataFrom(job.url, job.logon, job.password)
    writeToFile(feed, job.name, "download", extension="xml", timestamp=True)
    job.last_update = time.time()
    job.status = "idle"
    job.save()
def get_item_text_text(self):
    # Fetch the per-item XML referenced by the feed entry and flatten the
    # <word> elements it contains into one normalized text string.
    link = self.getItemFromFeedItem("item.xmllink")
    xml_file = getDataFrom(
        urlparse.urljoin(self.dbfeed.url, urlparse.urlparse(link).path),
        self.dbfeed.logon, self.dbfeed.password)
    #writeToFile(xml_file, link.split("/")[-1], os.path.join("rss_backup","sail","xmls"), ".xml", timestamp=True)
    tree = ElementTree.fromstring(xml_file)
    words = []
    for node in tree.getiterator('word'):
        if node.text is not None:
            words.extend(node.text.split())
    text = " ".join(words)
    # Punctuation arrives as separate <word> nodes; re-attach periods.
    text = text.replace(" .", ".")
    return text
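# The XML fetched above is assumed to carry the item text as a flat stream of
# <word> elements, punctuation included as its own word (an illustrative
# sketch; the real schema is whatever the feed's backend emits):
#
#   <item><word>Hello</word><word>world</word><word>.</word></item>
#
# which the loop above rebuilds as "Hello world."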
def worker(self, job):
    try:
        job.status = "downloading"
        feed = getDataFrom(job.url, job.logon, job.password)
        if job.backup:
            writeToFile(feed, job.name, backup_folder, extension="xml", timestamp=True)
        # Load the parser module generated for this feed and run it.
        modulename = CodeGenerator.getClassName(job)
        m = __import__(modulename)
        output = m.Downloader().parseFeed(job, feed)
        if output:
            writeToFile(output, job.name, download_folder, extension="xml", timestamp=True)
        job.status = "idle"
    except Exception, e:
        cherrypy.log("error when parsing feed id %s name %s" % (str(job.id), job.name))
        cherrypy.log(str(e))
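# The generated module's contract, as the worker above uses it (a sketch of
# assumptions, not the generated code itself): CodeGenerator.getClassName(job)
# must name an importable module exposing
#
#   class Downloader(object):
#       def parseFeed(self, job, feed):
#           # return the transformed feed, or an empty value to skip writing
#           return feed
#
# Anything falsy returned by parseFeed suppresses the output file but still
# marks the job idle.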
def update_feed(self, raw_feed):
    new_entries = []
    timestamp = self.dbfeed.cache_data
    for e in raw_feed["entries"]:
        # Skip entries not updated since the cached timestamp.
        if timestamp >= time.mktime(e.updated_parsed):
            continue
        time.sleep(1)  # throttle requests to the remote server
        page = getDataFrom(e["link"], self.dbfeed.logon, self.dbfeed.password)
        if page is None:
            continue
        # Each <strong> in the page div is a section title; its text follows
        # two nodes later (see the sketch below this function).
        soup = BeautifulSoup(page)
        div = soup.find(id='page')
        content = []
        for c in div.findAll("strong"):
            title = c.next
            text = c.next.next.next
            content.append([title, text])
        guid = e.id
        for idx, (title, text) in enumerate(content):
            # Split the entry into one new entry per section.
            new_item = copy.deepcopy(e)
            new_item["id"] = str(idx) + ":" + guid
            new_item["title"] = title
            new_item["description"] = text
            new_entries.append(new_item)
    raw_feed["entries"] = new_entries
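# The traversal above assumes the scraped page looks roughly like this
# (illustrative only, inferred from the .next hops, not a captured page):
#
#   <div id="page">
#     <strong>First title</strong><br/>First section text...
#     <strong>Second title</strong><br/>Second section text...
#   </div>
#
# For each <strong> tag c: c.next is the title string inside the tag, and
# c.next.next.next skips past the <br/> to the section text that follows.
# Each title/text pair then becomes its own entry with id "<n>:<guid>".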