예제 #1
0
    def __init__(self, documentName="currentWebsite", urlString=None,
                 indicators=('http://', 'www', '.com', '.co.uk')):
        """Initialise the base extractor and, optionally, scrape a website.

        Args:
            documentName: Forwarded to ``tfe.__init__``.
            urlString: When not None, handed to ``scrapeWebsiteFromURL``
                once the instance has been set up.
            indicators: Forwarded unchanged to ``tfe.__init__``.
        """
        tfe.__init__(self, documentName, indicators)

        # Placeholders until a scrape succeeds.
        self.foundWebsite = False
        self.website = None
        self.htmlResponse = None
        self.numOfURLs = None

        # self.documentName is expected to have been set by tfe.__init__
        # (not visible here -- confirm against the base class).
        scraper = WebsiteScraper(documentName=self.documentName,
                                 startScrapyScan=False)
        self.scraper = scraper

        if urlString is None:
            return
        self.scrapeWebsiteFromURL(urlString)
예제 #2
0
 def __init__(self, documentName="currentWebsite", urlString=None,
              indicators=('http://', 'www', '.com', '.co.uk')):
     """Set up the base extractor, the scraper and the empty scrape state.

     Args:
         documentName: Passed on to ``tfe.__init__``.
         urlString: If not None, scraped immediately through
             ``scrapeWebsiteFromURL``.
         indicators: Passed on unchanged to ``tfe.__init__``.
     """
     tfe.__init__(self, documentName, indicators)

     # Nothing has been scraped yet.
     self.website = None
     # self.documentName presumably comes from tfe.__init__ -- verify there.
     self.scraper = WebsiteScraper(
         documentName=self.documentName,
         startScrapyScan=False,
     )
     self.foundWebsite = False
     self.numOfURLs = self.htmlResponse = None

     shouldScrapeNow = urlString is not None
     if shouldScrapeNow:
         self.scrapeWebsiteFromURL(urlString)
예제 #3
0
class GeneralFeatureExtractor(tfe):  #Text and HTML (via Scrapy)
    """Extends ``tfe`` with features computed from a scraped website.

    Every feature method takes a ``textString`` argument that it does not
    use (presumably to match the signature the base class expects --
    confirm against ``tfe``) and returns 0 while no website has been found.
    """

    def __init__(self, documentName="currentWebsite", urlString=None,
                 indicators=('http://', 'www', '.com', '.co.uk')):
        """Initialise the base extractor and optionally scrape ``urlString``.

        Args:
            documentName: Forwarded to ``tfe.__init__``.
            urlString: When not None, passed to ``scrapeWebsiteFromURL``.
            indicators: Forwarded unchanged to ``tfe.__init__``.
        """
        tfe.__init__(self, documentName, indicators)

        self.website = None
        # self.documentName is expected to be set by tfe.__init__
        # (not visible here -- confirm against the base class).
        self.scraper = WebsiteScraper(documentName=self.documentName,
                                      startScrapyScan=False)
        self.foundWebsite = False
        self.numOfURLs = None
        self.htmlResponse = None

        if urlString is not None:
            self.scrapeWebsiteFromURL(urlString)

    def scrapeWebsiteFromURL(self, urlString, documentName=None):
        """Crawl the URLs extracted from ``urlString`` and cache the result.

        Args:
            urlString: Handed to ``htmlParser.getURLsWithDomains``.
            documentName: When not None, replaces ``self.documentName``
                before the crawl starts.
        """
        # self.htmlParser is not assigned in this class; presumably it is
        # provided by tfe -- verify against the base class.
        domainList, urlList = self.htmlParser.getURLsWithDomains(urlString)

        if documentName is not None:
            self.documentName = documentName
        self.website = self.scraper.startCrawler(domainList, urlList,
                                                 self.documentName)

        # Reset the derived state first so that a failed re-scrape cannot
        # leave values from an earlier, successful one behind.
        self.foundWebsite = self.website is not None
        self.numOfURLs = None
        self.htmlResponse = None

        if self.foundWebsite:
            self.numOfURLs = float(
                len(self.htmlParser.getWebsiteURLs(self.website)))
            self.htmlResponse = self.htmlParser.getResponseAttribute(
                self.website, 'all')

    def lengthOfWebsiteBodyText(self, textString):
        """Return the 'body' attribute length divided by the full response length.

        Returns 0 when no website was found or the cached response is empty.
        """
        if not self.foundWebsite:
            return 0
        responseLength = len(self.htmlResponse)
        if responseLength == 0:
            # An empty response would otherwise raise ZeroDivisionError.
            return 0
        bodyLength = len(
            self.htmlParser.getResponseAttribute(self.website, 'body'))
        return float(bodyLength) / responseLength

    def numOfURLsInWebsite(self, textString):
        """Return the website's URL count divided by a fixed cap of 30.

        Returns 0 when no website was found.
        """
        if not self.foundWebsite:
            return 0
        maxURLCount = 30  #Unsure of how many URLs exist in the document, so not perfect
        return self.numOfURLs / maxURLCount

    def proportionOfUniqueTagsInWebsite(self, textString):
        """Return the number of distinct tags divided by a fixed cap of 100.

        Returns 0 when no website was found.
        """
        if not self.foundWebsite:
            return 0
        maxTagCount = 100  #Unsure of how many HTML tags exist, so not perfect
        uniqueTagCount = len(self.htmlParser.getTagCounter(self.website))
        return float(uniqueTagCount) / maxTagCount

    def tagDiversity(self, textString):
        """Return the number of distinct tags divided by the total tag count.

        Returns 0 when no website was found or the website has no tags.
        """
        if not self.foundWebsite:
            return 0
        numOfTags = len(self.htmlParser.getTagsFromItem(self.website))
        if numOfTags == 0:
            # A tag-less page would otherwise raise ZeroDivisionError.
            return 0
        uniqueTagCount = len(self.htmlParser.getTagCounter(self.website))
        return float(uniqueTagCount) / numOfTags

    def containsFormTag(self, textString):
        """Return 1 if the parser finds a form tag in the cached response, else 0."""
        if self.foundWebsite:
            if len(self.htmlParser.getFormTags(self.htmlResponse)) > 0:
                return 1
        return 0

    def containsJavascript(self, textString):
        """Return 1 if the website's tags contain a JavaScript marker, else 0.

        A marker is either both 'function' and '(){', or an opening or
        closing script tag.
        """
        if not self.foundWebsite:
            return 0
        tags = self.htmlParser.getTagsFromItem(self.website)
        # Each marker needs its own membership test: the expression
        # (('function' and '(){') or '<script>' or '</script>') evaluates to
        # the single string '(){', so only that one marker was ever checked.
        # NOTE(review): whether `in` is a substring or an element test
        # depends on what getTagsFromItem returns -- confirm.
        hasFunctionSyntax = 'function' in tags and '(){' in tags
        hasScriptTag = '<script>' in tags or '</script>' in tags
        if hasFunctionSyntax or hasScriptTag:
            return 1
        return 0

    def getNumberOfDomains(self, textString):
        """Return the parser's domain count for the response divided by the URL count.

        Returns 0 when no website was found or the website has no URLs.
        """
        if not self.foundWebsite or self.numOfURLs == 0:
            return 0
        domainCount = self.htmlParser.getNumberOfDomains(self.htmlResponse)
        return float(domainCount) / self.numOfURLs

    def getNumberOfSubDomains(self, textString):
        """Return the parser's sub-domain count for the response divided by the URL count.

        Returns 0 when no website was found or the website has no URLs.
        """
        if not self.foundWebsite or self.numOfURLs == 0:
            return 0
        subDomainCount = self.htmlParser.getNumberOfSubDomains(
            self.htmlResponse)
        return float(subDomainCount) / self.numOfURLs
예제 #4
0
class GeneralFeatureExtractor(tfe): #Text and HTML (via Scrapy)
    """Feature extractor built on ``tfe`` that adds scraped-website features.

    Each feature method accepts a ``textString`` argument it never reads
    (presumably required by the base-class calling convention -- confirm
    against ``tfe``) and returns 0 until a website has been found.
    """

    def __init__(self, documentName="currentWebsite", urlString=None,
                 indicators=('http://', 'www', '.com', '.co.uk')):
        """Initialise the base extractor and optionally scrape ``urlString``.

        Args:
            documentName: Forwarded to ``tfe.__init__``.
            urlString: When not None, passed to ``scrapeWebsiteFromURL``.
            indicators: Forwarded unchanged to ``tfe.__init__``.
        """
        tfe.__init__(self, documentName, indicators)

        self.website = None
        # self.documentName is expected to be set by tfe.__init__
        # (not visible here -- confirm against the base class).
        self.scraper = WebsiteScraper(documentName=self.documentName,
                                      startScrapyScan=False)
        self.foundWebsite = False
        self.numOfURLs = None
        self.htmlResponse = None

        if urlString is not None:
            self.scrapeWebsiteFromURL(urlString)

    def scrapeWebsiteFromURL(self, urlString, documentName=None):
        """Crawl the URLs extracted from ``urlString`` and cache the result.

        Args:
            urlString: Handed to ``htmlParser.getURLsWithDomains``.
            documentName: When not None, replaces ``self.documentName``
                before the crawl starts.
        """
        # self.htmlParser is not assigned in this class; presumably it is
        # provided by tfe -- verify against the base class.
        domainList, urlList = self.htmlParser.getURLsWithDomains(urlString)

        if documentName is not None:
            self.documentName = documentName
        self.website = self.scraper.startCrawler(domainList, urlList,
                                                 self.documentName)

        # Clear the derived state before refilling it, so an unsuccessful
        # re-scrape does not keep results from a previous successful one.
        self.foundWebsite = self.website is not None
        self.numOfURLs = None
        self.htmlResponse = None

        if self.foundWebsite:
            self.numOfURLs = float(
                len(self.htmlParser.getWebsiteURLs(self.website)))
            self.htmlResponse = self.htmlParser.getResponseAttribute(
                self.website, 'all')

    def lengthOfWebsiteBodyText(self, textString):
        """Return the 'body' attribute length divided by the full response length.

        Returns 0 when no website was found or the cached response is empty.
        """
        if not self.foundWebsite:
            return 0
        responseLength = len(self.htmlResponse)
        if responseLength == 0:
            # Avoid ZeroDivisionError on an empty response.
            return 0
        bodyLength = len(
            self.htmlParser.getResponseAttribute(self.website, 'body'))
        return float(bodyLength) / responseLength

    def numOfURLsInWebsite(self, textString):
        """Return the website's URL count divided by a fixed cap of 30.

        Returns 0 when no website was found.
        """
        if not self.foundWebsite:
            return 0
        maxURLCount = 30 #Unsure of how many URLs exist in the document, so not perfect
        return self.numOfURLs / maxURLCount

    def proportionOfUniqueTagsInWebsite(self, textString):
        """Return the number of distinct tags divided by a fixed cap of 100.

        Returns 0 when no website was found.
        """
        if not self.foundWebsite:
            return 0
        maxTagCount = 100 #Unsure of how many HTML tags exist, so not perfect
        uniqueTagCount = len(self.htmlParser.getTagCounter(self.website))
        return float(uniqueTagCount) / maxTagCount

    def tagDiversity(self, textString):
        """Return the number of distinct tags divided by the total tag count.

        Returns 0 when no website was found or the website has no tags.
        """
        if not self.foundWebsite:
            return 0
        numOfTags = len(self.htmlParser.getTagsFromItem(self.website))
        if numOfTags == 0:
            # Avoid ZeroDivisionError on a page without tags.
            return 0
        uniqueTagCount = len(self.htmlParser.getTagCounter(self.website))
        return float(uniqueTagCount) / numOfTags

    def containsFormTag(self, textString):
        """Return 1 if the parser finds a form tag in the cached response, else 0."""
        if self.foundWebsite:
            if len(self.htmlParser.getFormTags(self.htmlResponse)) > 0:
                return 1
        return 0

    def containsJavascript(self, textString):
        """Return 1 if the website's tags contain a JavaScript marker, else 0.

        A marker is either both 'function' and '(){', or an opening or
        closing script tag.
        """
        if not self.foundWebsite:
            return 0
        tags = self.htmlParser.getTagsFromItem(self.website)
        # The markers must be tested one by one: the combined expression
        # (('function' and '(){') or '<script>' or '</script>') collapses to
        # the single string '(){', so the other markers were never checked.
        # NOTE(review): whether `in` is a substring or an element test
        # depends on what getTagsFromItem returns -- confirm.
        hasFunctionSyntax = 'function' in tags and '(){' in tags
        hasScriptTag = '<script>' in tags or '</script>' in tags
        if hasFunctionSyntax or hasScriptTag:
            return 1
        return 0

    def getNumberOfDomains(self, textString):
        """Return the parser's domain count for the response divided by the URL count.

        Returns 0 when no website was found or the website has no URLs.
        """
        if not self.foundWebsite or self.numOfURLs == 0:
            return 0
        domainCount = self.htmlParser.getNumberOfDomains(self.htmlResponse)
        return float(domainCount) / self.numOfURLs

    def getNumberOfSubDomains(self, textString):
        """Return the parser's sub-domain count for the response divided by the URL count.

        Returns 0 when no website was found or the website has no URLs.
        """
        if not self.foundWebsite or self.numOfURLs == 0:
            return 0
        subDomainCount = self.htmlParser.getNumberOfSubDomains(
            self.htmlResponse)
        return float(subDomainCount) / self.numOfURLs