예제 #1
0
    def createXmlandWrite(
            self, dataToStore, filename,
            outputDir="/mnt/minerva1/nlp/projects/spinn3r/solr_data/xjerab13"):
        '''Serialize records into a pretty-printed XML document and write it.

        Builds a <results> root holding one <item> per record.  Every key of
        a record becomes a child element named after the key, with the value
        as its text; the value of the "author" key is additionally wrapped
        in a nested <name> element.  The UTF-8 encoded document is handed to
        writeToFile() under the name "scf_" + filename, with the ".xml"
        extension and timestamp=True.

        dataToStore -- iterable of records exposing .items(); when it is
                       empty or None nothing is built or written.
        filename    -- base name of the output; "scf_" is prepended.
        outputDir   -- directory passed to writeToFile().  Defaults to the
                       path that used to be hard-coded in this method, so
                       existing callers are unaffected.

        Returns None.
        '''
        if not dataToStore:
            return

        doc = Document()
        results = doc.createElement("results")
        doc.appendChild(results)

        for record in dataToStore:
            item = doc.createElement("item")
            for key, value in record.items():
                tag = doc.createElement(key)
                if key == "author":
                    # NOTE(review): unlike every other value, the author text
                    # is not passed through toUnicode() -- confirm that this
                    # is intentional.
                    name = doc.createElement("name")
                    name.appendChild(doc.createTextNode(value))
                    tag.appendChild(name)
                else:
                    tag.appendChild(doc.createTextNode(toUnicode(value)))
                item.appendChild(tag)
            results.appendChild(item)

        # With an explicit encoding toprettyxml() returns encoded bytes.
        xml_bytes = doc.toprettyxml(indent="    ", encoding="UTF-8")

        writeToFile(xml_bytes,
                    "scf_" + filename,
                    outputDir,
                    extension=".xml",
                    timestamp=True)
예제 #2
0
    def createXmlandWrite(
            self, dataToStore, filename,
            outputDir="/mnt/minerva1/nlp/projects/spinn3r/solr_data/xjerab13"):
        '''Serialize records into a pretty-printed XML document and write it.

        Builds a <results> root holding one <item> per record.  Every key of
        a record becomes a child element named after the key, with the value
        as its text; the value of the "author" key is additionally wrapped
        in a nested <name> element.  The UTF-8 encoded document is handed to
        writeToFile() under the name "scf_" + filename, with the ".xml"
        extension and timestamp=True.

        dataToStore -- iterable of records exposing .items(); when it is
                       empty or None nothing is built or written.
        filename    -- base name of the output; "scf_" is prepended.
        outputDir   -- directory passed to writeToFile().  Defaults to the
                       path that used to be hard-coded in this method, so
                       existing callers are unaffected.

        Returns None.
        '''
        if not dataToStore:
            return

        doc = Document()
        results = doc.createElement("results")
        doc.appendChild(results)

        for record in dataToStore:
            item = doc.createElement("item")
            for key, value in record.items():
                tag = doc.createElement(key)
                if key == "author":
                    # NOTE(review): unlike every other value, the author text
                    # is not passed through toUnicode() -- confirm that this
                    # is intentional.
                    name = doc.createElement("name")
                    name.appendChild(doc.createTextNode(value))
                    tag.appendChild(name)
                else:
                    tag.appendChild(doc.createTextNode(toUnicode(value)))
                item.appendChild(tag)
            results.appendChild(item)

        # With an explicit encoding toprettyxml() returns encoded bytes.
        xml_bytes = doc.toprettyxml(indent="    ", encoding="UTF-8")

        writeToFile(xml_bytes,
                    "scf_" + filename,
                    outputDir,
                    extension=".xml",
                    timestamp=True)
예제 #3
0
    def process(self, url):
        '''Download a feed, convert its new entries to XML and write them out.

        The content fetched by getDataFrom() is parsed with feedparser.
        Every entry whose publication date is strictly newer than the
        timestamp loaded from the cache (all entries when there is no cache
        or no stored timestamp) becomes an <item> under a <results> root in
        self.doc.  The children of an item are described by self.item_list,
        which maps a tag name to a (path, func, req) triple:

        path -- sequence of keys followed from the parsed feed down to the
                value; the special key "_C_" stands for the index of the
                entry being processed.  None means no lookup is done.
        func -- optional callable applied to the looked-up value; it is
                also called with None when the lookup failed.
        req  -- when true, a missing value is logged as an error and
                self.was_error is set.

        A string value becomes the text of the tag, a DOM Node is appended
        as its child and a dict is turned into attributes; a value of any
        other type leaves the tag empty.

        When at least one entry was processed, self.writeWrapper() is called
        with the second-to-last dot-separated label of the URL's network
        location (the whole location when it contains no dot), the newest
        publication date seen is stored in the cache if one is configured,
        and the raw download is backed up if self.makeBackup is set.

        Always returns None.
        '''
        content = getDataFrom(url, self.username, self.password)
        self.was_error = False
        self.doc = Document()

        # parsing feed
        feed = feedparser.parse(content)
        # zero entries: nothing to do, most likely not a feed at all
        if not feed.entries:
            log.error(
                "No Entries found in %s, you sure it is RSS or atom feed?",
                url)
            return None

        # add static data to feed
        self.update_feed(feed, url)

        # create root tag
        t_results = self.doc.createElement("results")
        self.doc.appendChild(t_results)
        item_cnt = 0

        timestamp = None
        if self.cache:
            timestamp = self.cache.loadFromCache(url)
        new_stamp = timestamp

        # walk over the parsed feed, mine the data and build the final xml
        for index, entry in enumerate(feed.entries):
            try:
                # Check the publication date and keep only newer entries.
                # The None checks are explicit so the logic does not depend
                # on how the interpreter orders None against other values.
                published = entry.date_parsed
                if new_stamp is None or new_stamp < published:
                    new_stamp = published
                if timestamp is not None and timestamp >= published:
                    continue
            except Exception:
                # Best effort: an entry whose date is missing or cannot be
                # compared is processed as if it were new.
                log.debug("Cannot compare publication date of entry %d in %s",
                          index, url)

            t_item = self.doc.createElement("item")

            # walk over the xml tags and the description of their values
            for tag, (path, func, req) in self.item_list.items():
                data = None

                if path is not None:
                    try:
                        node = feed
                        for key in path:
                            # "_C_" is a placeholder for the entry index
                            node = node[index] if key == "_C_" else node[key]
                        data = node
                    except Exception:
                        # value not present; reported below when required
                        data = None

                # data postprocessing
                if func is not None:
                    data = func(data)

                if data is None:
                    if req:
                        # the message is Czech for "data not found for tag"
                        log.error("Nenalezena data pro tag %s", tag)
                        self.was_error = True
                    data = ""

                child = self.doc.createElement(tag)

                if isinstance(data, basestring):
                    data = self.doc.createTextNode(toUnicode(data))

                if isinstance(data, Node):
                    child.appendChild(data)
                elif isinstance(data, dict):
                    for attr_name, attr_value in data.items():
                        child.setAttribute(attr_name, attr_value)

                t_item.appendChild(child)

            # Attach the item once per entry instead of re-attaching it after
            # every tag.  An item that received no tag stays detached, which
            # is what happened before as well.
            if t_item.hasChildNodes():
                t_results.appendChild(t_item)
            item_cnt += 1

        if item_cnt <= 0:
            log.debug("Nothing new in feed - %s ", url)
            return

        # Name the output after the second-to-last host label; fall back to
        # the whole network location when it has no dot, where indexing
        # [-2] would raise IndexError after all the work has been done.
        host_labels = urlparse(url).netloc.split(".")
        if len(host_labels) >= 2:
            source_name = host_labels[-2]
        else:
            source_name = host_labels[-1]
        self.writeWrapper(source_name)

        if self.cache:
            self.cache.storeToCache(url, new_stamp)
        if self.makeBackup:
            writeToFile(content,
                        "RSS_Sail_",
                        os.path.join("rss_backup", "sail"),
                        ".xml",
                        timestamp=True)
예제 #4
0
    def process(self, url):
        '''Download a feed, convert its new entries to XML and write them out.

        The content fetched by getDataFrom() is parsed with feedparser.
        Every entry whose publication date is strictly newer than the
        timestamp loaded from the cache (all entries when there is no cache
        or no stored timestamp) becomes an <item> under a <results> root in
        self.doc.  The children of an item are described by self.item_list,
        which maps a tag name to a (path, func, req) triple:

        path -- sequence of keys followed from the parsed feed down to the
                value; the special key "_C_" stands for the index of the
                entry being processed.  None means no lookup is done.
        func -- optional callable applied to the looked-up value; it is
                also called with None when the lookup failed.
        req  -- when true, a missing value is logged as an error and
                self.was_error is set.

        A string value becomes the text of the tag, a DOM Node is appended
        as its child and a dict is turned into attributes; a value of any
        other type leaves the tag empty.

        When at least one entry was processed, self.writeWrapper() is called
        with the second-to-last dot-separated label of the URL's network
        location (the whole location when it contains no dot), the newest
        publication date seen is stored in the cache if one is configured,
        and the raw download is backed up if self.makeBackup is set.

        Always returns None.
        '''
        content = getDataFrom(url, self.username, self.password)
        self.was_error = False
        self.doc = Document()

        # parsing feed
        feed = feedparser.parse(content)
        # zero entries: nothing to do, most likely not a feed at all
        if not feed.entries:
            log.error(
                "No Entries found in %s, you sure it is RSS or atom feed?",
                url)
            return None

        # add static data to feed
        self.update_feed(feed, url)

        # create root tag
        t_results = self.doc.createElement("results")
        self.doc.appendChild(t_results)
        item_cnt = 0

        timestamp = None
        if self.cache:
            timestamp = self.cache.loadFromCache(url)
        new_stamp = timestamp

        # walk over the parsed feed, mine the data and build the final xml
        for index, entry in enumerate(feed.entries):
            try:
                # Check the publication date and keep only newer entries.
                # The None checks are explicit so the logic does not depend
                # on how the interpreter orders None against other values.
                published = entry.date_parsed
                if new_stamp is None or new_stamp < published:
                    new_stamp = published
                if timestamp is not None and timestamp >= published:
                    continue
            except Exception:
                # Best effort: an entry whose date is missing or cannot be
                # compared is processed as if it were new.
                log.debug("Cannot compare publication date of entry %d in %s",
                          index, url)

            t_item = self.doc.createElement("item")

            # walk over the xml tags and the description of their values
            for tag, (path, func, req) in self.item_list.items():
                data = None

                if path is not None:
                    try:
                        node = feed
                        for key in path:
                            # "_C_" is a placeholder for the entry index
                            node = node[index] if key == "_C_" else node[key]
                        data = node
                    except Exception:
                        # value not present; reported below when required
                        data = None

                # data postprocessing
                if func is not None:
                    data = func(data)

                if data is None:
                    if req:
                        # the message is Czech for "data not found for tag"
                        log.error("Nenalezena data pro tag %s", tag)
                        self.was_error = True
                    data = ""

                child = self.doc.createElement(tag)

                if isinstance(data, basestring):
                    data = self.doc.createTextNode(toUnicode(data))

                if isinstance(data, Node):
                    child.appendChild(data)
                elif isinstance(data, dict):
                    for attr_name, attr_value in data.items():
                        child.setAttribute(attr_name, attr_value)

                t_item.appendChild(child)

            # Attach the item once per entry instead of re-attaching it after
            # every tag.  An item that received no tag stays detached, which
            # is what happened before as well.
            if t_item.hasChildNodes():
                t_results.appendChild(t_item)
            item_cnt += 1

        if item_cnt <= 0:
            log.debug("Nothing new in feed - %s ", url)
            return

        # Name the output after the second-to-last host label; fall back to
        # the whole network location when it has no dot, where indexing
        # [-2] would raise IndexError after all the work has been done.
        host_labels = urlparse(url).netloc.split(".")
        if len(host_labels) >= 2:
            source_name = host_labels[-2]
        else:
            source_name = host_labels[-1]
        self.writeWrapper(source_name)

        if self.cache:
            self.cache.storeToCache(url, new_stamp)
        if self.makeBackup:
            writeToFile(content,
                        "RSS_Sail_",
                        os.path.join("rss_backup", "sail"),
                        ".xml",
                        timestamp=True)