Ejemplo n.º 1
0
class DataProcessor:
    """Aggregate per-host keyword data and log one record per matching URL.

    On construction the processor opens ``<outputDirectory>/<hostName>.db``
    for writing and emits the opening ``<PROFILE>``/``<URL_LIST>`` header.
    Each call to :meth:`process` parses one page, merges what the parser
    found into the host-wide tables and, when the page is worth recording,
    appends a ``<URL_INFO>`` element to that file.  :meth:`getStatus`
    renders the host-wide summary as a string.

    NOTE(review): values (host name, URLs, keywords) are written into the
    XML-like output verbatim, without escaping -- confirm that downstream
    readers tolerate characters such as ``&`` or ``<``.
    """

    def __init__(self, hostName, directoryDetector):
        """Set up the accumulators and start the per-host output file.

        Args:
            hostName: Host name; used for the output file name and headers.
            directoryDetector: Object exposing ``inputDirectory`` and
                ``outputDirectory`` string attributes.
        """
        self.directoryDetector = directoryDetector
        self.hostName = hostName
        # Host-wide tables that the per-page parser results are merged into.
        self.keywordFrePhone = Hash2D()
        self.keywordFreTag = Hash2D()
        self.parser = Parser()

        self.totalPages = 0
        self.totalDataFoundPages = 0
        self.totalPhones = 0

        # NOTE(review): this handle is not closed anywhere in the visible
        # code; presumably a caller or a later method closes it -- confirm.
        dbPath = os.path.join(self.directoryDetector.outputDirectory,
                              self.hostName + ".db")
        self.file = open(dbPath, "w")
        self.file.write("<PROFILE>\n\t<HOST>" + self.hostName
                        + "</HOST>\n\t<URL_LIST>\n")
        self.file.flush()

        self.isPrintLog = True

    def printLog(self, logMsg):
        """Print ``logMsg`` to stdout when ``self.isPrintLog`` is true."""
        if self.isPrintLog:
            # Parenthesised form: same output as the Python 2 print
            # statement for a single argument, and valid on Python 3.
            print(logMsg)

    def getStatus(self, dataFoundURLStByte, dataFoundURLEnByte):
        """Return the host-wide summary as a ``<PROFILE>`` text block.

        Args:
            dataFoundURLStByte: Value reported in ``<START_BYTE>``.
            dataFoundURLEnByte: Value reported in ``<END_BYTE>``.

        Returns:
            The summary string, terminated by ``"</PROFILE>\\n"``.

        Side effects:
            Sets ``self.totalPhones`` to the combined size of the two
            host-wide tables.
        """
        strKeyPhone, strFrePhone = self.keywordFrePhone.get()
        strKeyTag, strFreTag = self.keywordFreTag.get()

        # Assign rather than accumulate: the tables are already cumulative,
        # so adding their sizes on every call would double-count as soon as
        # the status is requested more than once.
        self.totalPhones = (self.keywordFrePhone.size()
                            + self.keywordFreTag.size())

        parts = [
            "<PROFILE>",
            "\n\t<URL>", self.hostName, "</URL>",
            "\n\t<INPUT_DIRECTORY>", self.directoryDetector.inputDirectory,
            "</INPUT_DIRECTORY>",
            "\n\t<OUTPUT_DIRECTORY>", self.directoryDetector.outputDirectory,
            "</OUTPUT_DIRECTORY>",
            "\n\t<TOTAL_PAGES>", str(self.totalPages), "</TOTAL_PAGES>",
            "\n\t<TOTAL_PAGES_DATA_FOUND>", str(self.totalDataFoundPages),
            "</TOTAL_PAGES_DATA_FOUND>",
            "\n\t<START_BYTE>", str(dataFoundURLStByte), "</START_BYTE>",
            "\n\t<END_BYTE>", str(dataFoundURLEnByte), "</END_BYTE>",
            "\n\t<KEYWORD_INFO>",
            "\n\t\t<KEYWORDS>", strKeyPhone, "</KEYWORDS>",
            "\n\t\t<FREQUENCIES>", strFrePhone, "</FREQUENCIES>",
            "\n\t</KEYWORD_INFO>",
            "\n\t<KEYWORDS_IN_TAG>",
            "\n\t\t<TAG_KEYWORDS>", strKeyTag, "</TAG_KEYWORDS>",
            "\n\t\t<FREQUENCIES_TAG_KEY>", strFreTag,
            "</FREQUENCIES_TAG_KEY>",
            "\n\t</KEYWORDS_IN_TAG>",
            "\n\t<PHONE>", str(self.totalPhones), "</PHONE>",
            "\n</PROFILE>\n",
        ]
        return "".join(parts)

    def process(self, thisHostName, thisURL, thisContent, urlInfo):
        """Parse one page and append a ``<URL_INFO>`` record if it has data.

        Args:
            thisHostName: Unused here; kept for interface compatibility.
            thisURL: URL of the page, written into the record.
            thisContent: Page content handed to the parser.
            urlInfo: Object with ``isValid``, ``contentNo``, ``urlStartByte``,
                ``urlEndByte``, ``contentStartByte``, ``contentEndByte`` and
                ``crawlTime`` attributes.

        Returns:
            None.  Pages flagged ``isValid == "0"`` are ignored entirely;
            pages without any parsed entry are counted in ``totalPages``
            but produce no record.
        """
        if urlInfo.isValid == "0":
            return

        self.totalPages += 1

        self.parser.clear()
        contentLength = urlInfo.contentEndByte - urlInfo.contentStartByte + 1
        self.parser.parseData(thisContent, contentLength)

        pageKeywordTable = self.parser.keywordFrePhone
        pageTagTable = self.parser.keywordFreTag

        totalPhones = pageKeywordTable.size() + pageTagTable.size()
        if totalPhones == 0:
            return

        # Fold the page results into the host-wide tables (tag table first,
        # as before).  NOTE(review): merge() presumably returns the number
        # of entries that were new to the host-wide table -- confirm
        # against Hash2D.
        totalPhoneAfterMerge = self.keywordFreTag.merge(pageTagTable)
        totalPhoneAfterMerge += self.keywordFrePhone.merge(pageKeywordTable)

        # A page whose single entry contributed nothing on merge is skipped.
        if totalPhoneAfterMerge == 0 and totalPhones == 1:
            return

        strKeyPhone, strFrePhone = pageKeywordTable.get()
        strKeyTag, strFreTag = pageTagTable.get()

        strByteInfo = " ".join(str(value) for value in (
            urlInfo.contentNo,
            urlInfo.urlStartByte,
            urlInfo.urlEndByte,
            urlInfo.contentStartByte,
            urlInfo.contentEndByte,
            urlInfo.isValid,
        ))

        self.totalDataFoundPages += 1

        parts = [
            "\n\t\t<URL_INFO>",
            "\n\t\t\t<URL>", thisURL, "</URL>",
        ]
        # The crawl time is optional; an empty or missing value omits the tag.
        if urlInfo.crawlTime:
            parts += ["\n\t\t\t<CRAWL_TIME>", urlInfo.crawlTime,
                      "</CRAWL_TIME>"]
        parts += [
            "\n\t\t\t<KEYWORD_INFO>",
            "\n\t\t\t\t<KEYWORDS>", strKeyPhone, "</KEYWORDS>",
            "\n\t\t\t\t<FREQUENCIES>", strFrePhone, "</FREQUENCIES>",
            "\n\t\t\t</KEYWORD_INFO>",
            "\n\t\t\t<KEYWORD_IN_TAG>",
            "\n\t\t\t\t<KEYWORDS>", strKeyTag, "</KEYWORDS>",
            "\n\t\t\t\t<FREQUENCIES>", strFreTag, "</FREQUENCIES>",
            "\n\t\t\t</KEYWORD_IN_TAG>",
            "\n\t\t\t<BYTE_INFO>", strByteInfo, "</BYTE_INFO>",
            "\n\t\t\t<PHONE>", str(totalPhones), "</PHONE>",
            "\n\t\t</URL_INFO>\n",
        ]

        # Flush after every record so the file stays current on disk.
        self.file.write("".join(parts))
        self.file.flush()