Example #1
 def analyze(self, feed_item, mapper):
     # Fetch and parse the feed, keeping the feed header and the first
     # entry around for the attribute lookups below.
     raw_feed = getDataFrom(feed_item.url, feed_item.logon, feed_item.password)
     self.full_feed = feedparser.parse(raw_feed)
     self.feed = self.full_feed.feed
     self.item = self.full_feed.entries[0]

     for key in mapper.funcs:
         item = mapper.funcs[key]

         if item.text:
             item.text.found = True
             # A "buildin" mapping with dotted args such as "feed.title" or
             # "item.link" is only marked found if the attribute is really
             # present in the parsed feed.
             if item.text.functype == "buildin" and item.text.args != "":
                 item.text.found = False
                 args = item.text.args.split(".")
                 if len(args) > 1:
                     val = getattr(self, args[0])
                     if args[1] in val:
                         item.text.found = True

         # Run the same existence check for every attribute mapping.
         for atr in item.attr:
             atr.found = True
             if atr.functype == "buildin" and atr.args != "":
                 atr.found = False
                 args = atr.args.split(".")
                 if len(args) > 1:
                     val = getattr(self, args[0])
                     if args[1] in val:
                         atr.found = True
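Every example on this page goes through the project's getDataFrom helper, whose body is not shown. A minimal Python 2 sketch of what it plausibly does, assuming a plain HTTP GET with optional basic authentication; only the name and the (url, logon, password) signature come from the examples, the rest is guesswork:

 import base64
 import urllib2

 def getDataFrom(url, logon, password):
     # Hypothetical reconstruction: fetch `url`, sending basic-auth
     # credentials when a logon is given; return the body as a string,
     # or None on failure (callers in these examples check for None).
     request = urllib2.Request(url)
     if logon:
         token = base64.b64encode("%s:%s" % (logon, password))
         request.add_header("Authorization", "Basic " + token)
     try:
         return urllib2.urlopen(request).read()
     except urllib2.URLError:
         return None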
Example #2
 def worker(self, job):
     # Download the feed, archive the raw XML under "download/", and
     # stamp the job record before returning it to the idle pool.
     job.status = "downloading"
     feed = getDataFrom(job.url, job.logon, job.password)
     writeToFile(feed, job.name, "download", extension="xml", timestamp=True)
     job.last_update = time.time()
     job.status = "idle"
     job.save()
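writeToFile is the other recurring project helper. A sketch of a compatible implementation, inferred purely from the call sites above; the folder creation, timestamp suffix, and tolerant extension handling are all assumptions:

 import os
 import time

 def writeToFile(data, name, folder, extension="xml", timestamp=False):
     # Hypothetical reconstruction matching the calls in these examples:
     # write `data` to "<folder>/<name>[_<unixtime>].<extension>".
     if not os.path.isdir(folder):
         os.makedirs(folder)
     if timestamp:
         name = "%s_%d" % (name, int(time.time()))
     path = os.path.join(folder, "%s.%s" % (name, extension.lstrip(".")))
     with open(path, "w") as f:
         f.write(data)
     return path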
Example #3
 def get_item_text_text(self):
     # Resolve the entry's xmllink against the feed's host, download the
     # XML document, and join its <word> nodes into one normalized string.
     link = self.getItemFromFeedItem("item.xmllink")
     xml_file = getDataFrom(
         urlparse.urljoin(self.dbfeed.url, urlparse.urlparse(link).path),
         self.dbfeed.logon, self.dbfeed.password)
     tree = ElementTree.fromstring(xml_file)
     text = ""
     for node in tree.iter('word'):
         if node.text is not None:
             text += " ".join(node.text.split()) + " "
     # Glue free-standing periods back onto the preceding word, then
     # collapse any remaining runs of whitespace.
     text = text.replace(" .", ".")
     text = " ".join(text.split())
     return text
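To make the word-joining concrete, here is the same ElementTree pass over a made-up document; the <word> markup is inferred from the code above, not from any documented schema:

 from xml.etree import ElementTree

 sample = "<doc><word>Hello</word><word>world</word><word>.</word></doc>"
 tree = ElementTree.fromstring(sample)

 text = ""
 for node in tree.iter('word'):
     if node.text is not None:
         text += " ".join(node.text.split()) + " "
 text = text.replace(" .", ".")   # "world ." -> "world."
 text = " ".join(text.split())    # drop the trailing space
 print(text)                      # -> "Hello world."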
     
Example #5
 def worker(self, job):
     try:
         job.status = "downloading"
         feed = getDataFrom(job.url, job.logon, job.password)
         if job.backup:
             writeToFile(feed, job.name, backup_folder, extension='xml', timestamp=True)
         # Every feed has a generated parser module; import it by name
         # and let its Downloader turn the raw feed into output XML.
         modulename = CodeGenerator.getClassName(job)
         m = __import__(modulename)
         output = m.Downloader().parseFeed(job, feed)
         if output:
             writeToFile(output, job.name, download_folder, extension="xml", timestamp=True)
         job.status = "idle"
     except Exception as e:
         cherrypy.log("error when parsing feed id %s name %s" % (str(job.id), job.name))
         cherrypy.log(str(e))
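One caveat worth knowing about the __import__(modulename) call: it returns the module itself only when the name contains no dots, which presumably holds for the generated names here. With a dotted path it returns the top-level package instead, which is why importlib.import_module is the usual alternative; the stdlib module below is used purely for illustration:

 import importlib

 # __import__ with a dotted name returns the TOP-LEVEL package...
 top = __import__("xml.etree.ElementTree")
 print(top.__name__)   # -> "xml"

 # ...while importlib.import_module returns the named module itself.
 mod = importlib.import_module("xml.etree.ElementTree")
 print(mod.__name__)   # -> "xml.etree.ElementTree"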
Example #6
 def update_feed(self, raw_feed):
     new_entries = []
     timestamp = self.dbfeed.cache_data

     for e in raw_feed["entries"]:
         # Skip entries that are older than the cached timestamp.
         if timestamp >= time.mktime(e.updated_parsed):
             continue
         # Throttle the per-entry page fetches.
         time.sleep(1)
         page = getDataFrom(e["link"], self.dbfeed.logon, self.dbfeed.password)
         if page is None:
             continue
         # The linked page holds several <strong>title</strong> ... text
         # sections inside the "page" div; collect (title, text) pairs.
         soup = BeautifulSoup(page)
         div = soup.find(id='page')
         strongs = div.findAll("strong")
         content = []
         for c in strongs:
             title = c.next
             text = c.next.next.next
             content.append([title, text])

         # Fan the single feed entry out into one entry per section,
         # deriving a unique id from the original guid.
         c = 0
         guid = e.id
         for item in content:
             title, text = item
             new_item = copy.deepcopy(e)
             new_item["id"] = str(c) + ":" + guid
             new_item["title"] = title
             new_item["description"] = text
             new_entries.append(new_item)
             c += 1
             if "<br/>" in text:
                 # Debug guard: abort the whole update if a raw <br/>
                 # leaks into the extracted text.
                 print("found!!!")
                 return

     raw_feed["entries"] = new_entries