Example #1
    def text_creator(self, link):
        '''Download the linked XML, back it up, and return the text of its <word> elements as one string.'''
        xml_file = getDataFrom(urljoin(self.handle_url, urlparse(link).path),
                               self.username, self.password)
        writeToFile(xml_file, link.split("/")[-1],
                    os.path.join("rss_backup", "sail", "xmls"), ".xml", timestamp=True)
        tree = ElementTree.fromstring(xml_file)
        # collect the text of every <word> element
        words = []
        for node in tree.iter('word'):
            if node.text is not None:
                words.extend(node.text.split())
        text = " ".join(words)
        # glue sentence-final periods back onto the preceding word
        text = text.replace(" .", ".")
        return text
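
A minimal, self-contained sketch of the <word>-flattening step above; the sample document and the standalone helper are illustrative only, not part of the original module:

from xml.etree import ElementTree

def words_to_text(xml_string):
    # gather the text of every <word> element and re-attach trailing periods
    tree = ElementTree.fromstring(xml_string)
    words = []
    for node in tree.iter('word'):
        if node.text is not None:
            words.extend(node.text.split())
    return " ".join(words).replace(" .", ".")

sample = "<doc><word>Hello</word><word>again</word><word>.</word></doc>"
print(words_to_text(sample))  # -> "Hello again."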
Example #2
    def update_feed(self, feed, url):
        '''Replace each feed entry with one new entry per <strong> section scraped from the linked page.'''
        super(RSS_XmlGeneratorDrfruehwein, self).update_feed(feed, url)
        new_entries = []

        timestamp = None
        if self.cache:
            timestamp = self.cache.loadFromCache(self.handle_url)

        for e in feed["entries"]:
            # skip entries already seen in a previous run
            if timestamp is not None and timestamp >= e.date_parsed:
                continue
            sleep(1)
            page = getDataFrom(e["link"], self.username, self.password)
            if page is None:
                continue
            soup = BeautifulSoup(page)
            div = soup.find(id='page')
            strongs = div.findAll("strong")

            # each <strong> holds a section title; the section text sits two nodes after it
            content = []
            for strong in strongs:
                title = strong.next
                text = strong.next.next.next
                content.append([title, text])

            # split the original entry into one new entry per (title, text) pair
            guid = e.id
            for counter, (title, text) in enumerate(content):
                new_item = copy.deepcopy(e)
                new_item["id"] = str(counter) + ":" + guid
                new_item["title"] = title
                new_item["description"] = text
                new_entries.append(new_item)

        feed["entries"] = new_entries
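
The triple .next navigation is easier to see on a toy page. The markup below is a guess at the shape of the scraped article (a <strong> title, a <br/>, then the body text); it is not taken from the real site:

from bs4 import BeautifulSoup

html = ('<div id="page">'
        '<strong>Headache</strong><br/>Drink plenty of water.'
        '<strong>Fever</strong><br/>See a doctor.'
        '</div>')

soup = BeautifulSoup(html, "html.parser")
for strong in soup.find(id="page").findAll("strong"):
    title = strong.next            # the string inside <strong>
    text = strong.next.next.next   # skip the <br/> to reach the following text
    print(title + " -> " + text)   # e.g. "Headache -> Drink plenty of water."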
        
Example #3
    def processSection(self, bsection):
        '''Process all pages in a forum section, page by page.'''
        actualPage = 1
        sectionLenght = None
        url = urljoin(self._url, bsection)
        while True:
            try:
                urll = url + "/" + str(actualPage)
                page = getDataFrom(urll, None, None)

                if page is None and sectionLenght is None:
                    # could not even fetch the first page; give up on this section
                    log.debug("none data, return")
                    return
                elif page is None:
                    # a later page failed to download; retry the same page
                    log.debug("none data, continue")
                    continue

                pageParser = PageParser(page)
                if not sectionLenght:
                    # read the number of pages in this section from the first page
                    sectionLenght = pageParser.getSectionLenght()
                    log.debug("sectionLenght is %s", sectionLenght)

                itemList = pageParser.getListOfItems()
                self.processData(itemList)
                # add static data
                self.addItemToFinalList("source", "http://board.netdoktor.de/")
                self.addItemToFinalList("section", "netdoktor")
                self.addItemToFinalList("lang", "de")
                # save the collected items
                self.createXmlandWrite(name=bsection.replace("/", "_"))
            except Exception as e:
                log.critical("%s", e)
                log.exception("Some exception in process section")

            # stop when the last known page has been processed
            # (also stops if the section length could never be determined)
            if sectionLenght is None or actualPage >= sectionLenght:
                return
            actualPage += 1
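
The control flow of the loop (retry a failed page, stop at the last known page) can be lifted out and run standalone; the fetch and parse_length callables below are placeholders, not the project's real helpers:

def process_section(base_url, fetch, parse_length):
    # walk base_url/1, base_url/2, ... until the section length is reached
    page_no, length = 1, None
    while True:
        page = fetch(base_url + "/" + str(page_no))
        if page is None:
            if length is None:
                return        # the very first page failed: give up
            continue          # a later page failed: retry it
        if length is None:
            length = parse_length(page)
        # ... process the page here ...
        if length is None or page_no >= length:
            return
        page_no += 1

pages = {"/forum/1": "page one", "/forum/2": "page two"}
process_section("/forum", pages.get, lambda page: 2)  # visits pages 1 and 2, then stops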
Example #4
    def updateContent(self, url):
        '''Download the page; if it contains entries not seen before, add them to the list of data to store.'''
        page = getDataFrom(url, None, None)
        if page is None:
            return False

        pageParser = PageParser(page)
        notCached = pageParser.getEntriesList()
        # keep only the entries that are not in the cache yet
        eToStore = self.pageCache.cacheAndReturnNew(url, notCached)

        for entry in eToStore:
            entry["link"] = url
        log.debug("ADD %d new entries", len(eToStore))

        self.dataToStore.extend(eToStore)
        return True
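
cacheAndReturnNew itself is not shown in this example; the sketch below is only a guess at its contract (return the entries not cached yet and remember them), with the PageCache class, the "id" key, and the dict-of-sets storage all made up for illustration:

class PageCache(object):
    def __init__(self):
        self._seen = {}  # url -> set of entry ids already stored

    def cacheAndReturnNew(self, url, entries):
        # keep and remember only the entries whose id has not been seen for this url
        seen = self._seen.setdefault(url, set())
        new = [e for e in entries if e["id"] not in seen]
        seen.update(e["id"] for e in new)
        return new

cache = PageCache()
cache.cacheAndReturnNew("u", [{"id": 1}, {"id": 2}])  # both returned as new
cache.cacheAndReturnNew("u", [{"id": 2}, {"id": 3}])  # only id 3 returned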
Example #5
    def process(self, url):
        '''Download the feed, process it, and create the output XML.'''
        content = getDataFrom(url, self.username, self.password)
        self.was_error = False
        self.doc = Document()
        # parse the feed
        feed = feedparser.parse(content)
        # zero entries means there is nothing to do
        if len(feed.entries) <= 0:
            log.error("No entries found in %s, are you sure it is an RSS or Atom feed?", url)
            return None

        # add static data to the feed
        self.update_feed(feed, url)

        # create the root tag
        t_results = self.doc.createElement("results")
        self.doc.appendChild(t_results)
        item_cnt = 0

        timestamp = None
        if self.cache:
            timestamp = self.cache.loadFromCache(url)
        new_stamp = timestamp

        # walk over the parsed feed, mine the data and build the final xml
        for _c_ in range(len(feed.entries)):
            t_item = self.doc.createElement("item")
            try:
                # check the item's publication date; save only items newer than the cached stamp
                if new_stamp is None or new_stamp < feed.entries[_c_].date_parsed:
                    new_stamp = feed.entries[_c_].date_parsed
                if timestamp is not None and timestamp >= feed.entries[_c_].date_parsed:
                    continue
            except Exception:
                # the entry may have no parsed date; process it anyway
                pass

            # walk over the configured xml tags and their value descriptions
            for tag, value in self.item_list.items():
                path, func, req = value
                data = None

                if path is not None:
                    try:
                        f = feed
                        for key in path:
                            # "_C_" is a placeholder for the current entry index
                            if key == "_C_":
                                f = f[_c_]
                            else:
                                f = f[key]
                        data = f
                    except Exception:
                        pass

                # data postprocessing
                if func is not None:
                    data = func(data)

                if req and data is None:
                    log.error("No data found for required tag %s", tag)
                    self.was_error = True
                if data is None:
                    data = ""

                child = self.doc.createElement(tag)

                if isinstance(data, basestring):
                    data = self.doc.createTextNode(toUnicode(data))

                if isinstance(data, Node):
                    child.appendChild(data)
                elif isinstance(data, dict):
                    # a dict is written as attributes of the tag
                    for attr_name, attr_value in data.items():
                        child.setAttribute(attr_name, attr_value)

                t_item.appendChild(child)

            t_results.appendChild(t_item)
            item_cnt += 1

        if item_cnt > 0:
            self.writeWrapper(urlparse(url).netloc.split(".")[-2])
            if self.cache:
                self.cache.storeToCache(url, new_stamp)
            if self.makeBackup:
                writeToFile(content, "RSS_Sail_",
                            os.path.join("rss_backup", "sail"), ".xml", timestamp=True)
        else:
            log.debug("Nothing new in feed - %s", url)
            return
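
The (path, func, req) tuples in item_list drive the data extraction above. How a path walks the parsed feed, with "_C_" standing for the current entry index, can be tried on plain dicts; the feed structure and the "title" mapping below are invented for illustration:

def resolve_path(feed, path, index):
    # follow the keys in path, substituting the entry index for the "_C_" placeholder
    node = feed
    for key in path:
        node = node[index] if key == "_C_" else node[key]
    return node

feed = {"entries": [{"title": "first post"}, {"title": "second post"}]}
item_list = {"title": (["entries", "_C_", "title"], None, True)}

for tag, (path, func, req) in item_list.items():
    for i in range(len(feed["entries"])):
        print(tag + ": " + resolve_path(feed, path, i))  # title: first post / title: second post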