Ejemplo n.º 1
0
        def getFeatureSet(self, documentName, documentCategory, params=None, documentClass=-1):
                """Build and return the FeatureSet for one document.

                A new FeatureSet is created from documentName, documentCategory and
                documentClass and stored on self.featureSet. If both
                self.functionToCall and self.paramList are set, the vector is the
                result of self.functionToCall(*self.paramList) and no feature method
                is run. Otherwise every bound method whose name neither starts with
                '_' nor appears in the exclusion tuple below is called, and its
                return value is added as a feature under the method's name.

                params is forwarded to each feature method: a list or tuple is
                unpacked as positional arguments, any other non-None value is passed
                as the single argument, and None means no arguments are passed.
                """
                #Public method names excluded from the feature loop
                excludedNames = ('getFeatureSet', 'setFunctionArgTuple', 'scrapeWebsiteFromURL')

                boundMethods = inspect.getmembers(self, predicate=inspect.ismethod)
                self.featureSet = FeatureSet(documentName, documentCategory, documentClass)

                #A configured function/parameter pair replaces the feature methods entirely
                if not (self.functionToCall is None or self.paramList is None):
                        self.featureSet.setVector(self.functionToCall(*self.paramList))
                        return self.featureSet

                #String params are handed to the text parser unless already tagged
                if (not self.tagged) and isinstance(params, basestring):
                        self.textParser.tagText("temp", params)
                        self.tagged = True

                #Normalise params into a sequence that can be unpacked into every call
                if params is None:
                        callArgs = ()
                elif isinstance(params, (list, tuple)):
                        callArgs = params
                else:
                        callArgs = [params]

                for methodName, _ in boundMethods:
                        if methodName[0] == '_' or methodName in excludedNames:
                                continue
                        self.featureSet.addFeature(methodName, getattr(self, methodName)(*callArgs))

                self.tagged = False
                return self.featureSet
Ejemplo n.º 2
0
    def getFeatureSet(self,
                      documentName,
                      documentCategory,
                      params=None,
                      documentClass=-1):
        """Build and return the FeatureSet for one document.

        A new FeatureSet is created from documentName, documentCategory and
        documentClass and stored on self.featureSet. If both
        self.functionToCall and self.paramList are set, the vector is the
        result of self.functionToCall(*self.paramList) and no feature method
        is run. Otherwise every bound method whose name neither starts with
        '_' nor appears in the exclusion tuple below is called, and its
        return value is added as a feature under the method's name.

        params is forwarded to each feature method: a list or tuple is
        unpacked as positional arguments, any other non-None value is passed
        as the single argument, and None means no arguments are passed.
        """
        #Public method names excluded from the feature loop
        excludedNames = ('getFeatureSet', 'setFunctionArgTuple',
                         'scrapeWebsiteFromURL')

        boundMethods = inspect.getmembers(self, predicate=inspect.ismethod)
        self.featureSet = FeatureSet(documentName, documentCategory,
                                     documentClass)

        #A configured function/parameter pair replaces the feature methods entirely
        if not (self.functionToCall is None or self.paramList is None):
            self.featureSet.setVector(self.functionToCall(*self.paramList))
            return self.featureSet

        #String params are handed to the text parser unless already tagged
        if (not self.tagged) and isinstance(params, basestring):
            self.textParser.tagText("temp", params)
            self.tagged = True

        #Normalise params into a sequence that can be unpacked into every call
        if params is None:
            callArgs = ()
        elif isinstance(params, (list, tuple)):
            callArgs = params
        else:
            callArgs = [params]

        for methodName, _ in boundMethods:
            if methodName[0] == '_' or methodName in excludedNames:
                continue
            self.featureSet.addFeature(methodName,
                                       getattr(self, methodName)(*callArgs))

        self.tagged = False
        return self.featureSet
Ejemplo n.º 3
0
class BaseExtractor():
    def __init__(self,
                 documentName,
                 indicators=None,
                 functionToCall=None,
                 paramList=None):
        """Set up the extractor's state and its text/HTML parsers.

        documentName, functionToCall and paramList are stored unchanged.
        indicators is kept only when it is a list or tuple; any other value
        (including None) is replaced by an empty list. The text parser is
        given the "Parsers" directory under the current working directory.
        """
        self.featureSet = None
        self.documentName = documentName
        self.tagged = False

        #Parser location is resolved relative to the process working directory
        self.textParser = TextParser(os.getcwd() + "/Parsers")
        self.htmlParser = HTMLParser()

        self.functionToCall = functionToCall
        self.paramList = paramList

        self.indicators = indicators if isinstance(indicators,
                                                   (list, tuple)) else []

    def getFeatureSet(self,
                      documentName,
                      documentCategory,
                      params=None,
                      documentClass=-1):
        """Build and return the FeatureSet for one document.

        A new FeatureSet is created from documentName, documentCategory and
        documentClass and stored on self.featureSet. If both
        self.functionToCall and self.paramList are set, the vector is the
        result of self.functionToCall(*self.paramList) and no feature method
        is run. Otherwise every bound method whose name neither starts with
        '_' nor appears in the exclusion tuple below is called, and its
        return value is added as a feature under the method's name.

        params is forwarded to each feature method: a list or tuple is
        unpacked as positional arguments, any other non-None value is passed
        as the single argument, and None means no arguments are passed.
        """
        #Public method names excluded from the feature loop
        excludedNames = ('getFeatureSet', 'setFunctionArgTuple',
                         'scrapeWebsiteFromURL')

        boundMethods = inspect.getmembers(self, predicate=inspect.ismethod)
        self.featureSet = FeatureSet(documentName, documentCategory,
                                     documentClass)

        #A configured function/parameter pair replaces the feature methods entirely
        if not (self.functionToCall is None or self.paramList is None):
            self.featureSet.setVector(self.functionToCall(*self.paramList))
            return self.featureSet

        #String params are handed to the text parser unless already tagged
        if (not self.tagged) and isinstance(params, basestring):
            self.textParser.tagText("temp", params)
            self.tagged = True

        #Normalise params into a sequence that can be unpacked into every call
        if params is None:
            callArgs = ()
        elif isinstance(params, (list, tuple)):
            callArgs = params
        else:
            callArgs = [params]

        for methodName, _ in boundMethods:
            if methodName[0] == '_' or methodName in excludedNames:
                continue
            self.featureSet.addFeature(methodName,
                                       getattr(self, methodName)(*callArgs))

        self.tagged = False
        return self.featureSet

    def setFunctionArgTuple(self, functionArgTuple):
        """Install a (function, params) pair on this extractor.

        Element 0 of functionArgTuple is passed to self._setFunctionToCall
        and element 1 to self._setFunctionParams, in that order. Passing
        None leaves the extractor untouched.
        """
        if functionArgTuple is None:
            return
        self._setFunctionToCall(functionArgTuple[0])
        self._setFunctionParams(functionArgTuple[1])

    ###Utility Functions###

    #-1 means an error has occurred - e.g. wrong parameter type passed into function

    def _charCountInString(self, textString, char):
        if isinstance(textString, basestring) and \
        isinstance(char, basestring) and len(char) == 1:
            return len(textString.split(char)) - 1
        else:
            return -1

    #If charCount == 0 return value = 1
    #If charCount is far greater than 0, return value approaches 0
    #0.1 constant chosen, to reduce the effect of a chosen character being introduced

    #(in future, the constant should be based on 1/(Average Amount Of A Character In All Documents))

    #Done since there could be many of these special characters, over the span of a single document,
    #but not too many (max = approx. 25, for emails/websites [MUST RESEARCH TO DETERMINE IF VALID]),
    #making resulting values in range over the interval [0, 1] be spread out more evenly

    def _lackOfCharInString(self, textString, char):
        count = self._charCountInString(textString, char)
        if count != -1:
            try:
                return 1 / float(1 + (0.1 * count))
            except Exception, e:
                raise e
        else:
Ejemplo n.º 4
0
class BaseExtractor():
        def __init__(self, documentName, indicators=None, functionToCall=None, paramList=None):
                """Set up the extractor's state and its text/HTML parsers.

                documentName, functionToCall and paramList are stored unchanged.
                indicators is kept only when it is a list or tuple; any other value
                (including None) is replaced by an empty list. The text parser is
                given the "Parsers" directory under the current working directory.
                """
                self.featureSet = None
                self.documentName = documentName
                self.tagged = False

                #Parser location is resolved relative to the process working directory
                self.textParser = TextParser(os.getcwd()+"/Parsers")
                self.htmlParser = HTMLParser()

                self.functionToCall = functionToCall
                self.paramList = paramList

                self.indicators = indicators if isinstance(indicators, (list, tuple)) else []
                
        def getFeatureSet(self, documentName, documentCategory, params=None, documentClass=-1):
                """Build and return the FeatureSet for one document.

                A new FeatureSet is created from documentName, documentCategory and
                documentClass and stored on self.featureSet. If both
                self.functionToCall and self.paramList are set, the vector is the
                result of self.functionToCall(*self.paramList) and no feature method
                is run. Otherwise every bound method whose name neither starts with
                '_' nor appears in the exclusion tuple below is called, and its
                return value is added as a feature under the method's name.

                params is forwarded to each feature method: a list or tuple is
                unpacked as positional arguments, any other non-None value is passed
                as the single argument, and None means no arguments are passed.
                """
                #Public method names excluded from the feature loop
                excludedNames = ('getFeatureSet', 'setFunctionArgTuple', 'scrapeWebsiteFromURL')

                boundMethods = inspect.getmembers(self, predicate=inspect.ismethod)
                self.featureSet = FeatureSet(documentName, documentCategory, documentClass)

                #A configured function/parameter pair replaces the feature methods entirely
                if not (self.functionToCall is None or self.paramList is None):
                        self.featureSet.setVector(self.functionToCall(*self.paramList))
                        return self.featureSet

                #String params are handed to the text parser unless already tagged
                if (not self.tagged) and isinstance(params, basestring):
                        self.textParser.tagText("temp", params)
                        self.tagged = True

                #Normalise params into a sequence that can be unpacked into every call
                if params is None:
                        callArgs = ()
                elif isinstance(params, (list, tuple)):
                        callArgs = params
                else:
                        callArgs = [params]

                for methodName, _ in boundMethods:
                        if methodName[0] == '_' or methodName in excludedNames:
                                continue
                        self.featureSet.addFeature(methodName, getattr(self, methodName)(*callArgs))

                self.tagged = False
                return self.featureSet
                        
        def setFunctionArgTuple(self, functionArgTuple):
                """Install a (function, params) pair on this extractor.

                Element 0 of functionArgTuple is passed to self._setFunctionToCall
                and element 1 to self._setFunctionParams, in that order. Passing
                None leaves the extractor untouched.
                """
                if functionArgTuple is None:
                        return
                self._setFunctionToCall(functionArgTuple[0])
                self._setFunctionParams(functionArgTuple[1])

        ###Utility Functions###

        #-1 means an error has occurred - e.g. wrong parameter type passed into function

        def _charCountInString(self, textString, char):
                if isinstance(textString, basestring) and \
                isinstance(char, basestring) and len(char) == 1:
                        return len(textString.split(char))-1
                else:
                        return -1

        #If charCount == 0 return value = 1
        #If charCount is far greater than 0, return value approaches 0
        #0.1 constant chosen, to reduce the effect of a chosen character being introduced

        #(in future, the constant should be based on 1/(Average Amount Of A Character In All Documents))

        #Done since there could be many of these special characters, over the span of a single document,
        #but not too many (max = approx. 25, for emails/websites [MUST RESEARCH TO DETERMINE IF VALID]), 
        #making resulting values in range over the internal [0, 1] be spread out more evenly

        def _lackOfCharInString(self, textString, char):
                count = self._charCountInString(textString, char)
                if count != -1:
                        try:
                                return 1/float(1+(0.1*count))
                        except Exception, e:
                                raise e
                else: