def getFeatureSet(self, documentName, documentCategory, params=None, documentClass=-1):
    """Build, store and return the FeatureSet for one document.

    A new FeatureSet is created from documentName, documentCategory and
    documentClass and stored on self.featureSet.

    If both self.functionToCall and self.paramList are set, the feature
    vector is taken from self.functionToCall(*self.paramList) and the
    FeatureSet is returned immediately; no feature methods are called.

    Otherwise every bound method of this object whose name does not start
    with an underscore (and is not one of the excluded housekeeping
    methods below) is called, and its result is added as a feature keyed
    by the method's name.

    params controls the arguments handed to each feature method:
      * None          -> methods are called with no arguments
      * list or tuple -> unpacked as positional arguments
      * anything else -> passed as the single positional argument
    If params is a string and the text has not been tagged yet
    (self.tagged is False), it is first passed to
    self.textParser.tagText("temp", params).

    Returns the FeatureSet that was built.
    """
    self.featureSet = FeatureSet(documentName, documentCategory, documentClass)

    # Explicit function override: use its result as the whole vector.
    if self.functionToCall is not None and self.paramList is not None:
        self.featureSet.setVector(self.functionToCall(*self.paramList))
        return self.featureSet

    if (not self.tagged) and isinstance(params, basestring):
        self.textParser.tagText("temp", params)
        self.tagged = True

    # Normalise params into one argument sequence so a single loop serves
    # both the "no arguments" and "with arguments" cases.
    if params is None:
        arguments = ()
    elif isinstance(params, (list, tuple)):
        arguments = params
    else:
        arguments = (params,)

    # Public methods that are not feature extractors.
    excluded = ('getFeatureSet', 'setFunctionArgTuple', 'scrapeWebsiteFromURL')

    try:
        for name, method in inspect.getmembers(self, predicate=inspect.ismethod):
            if name[0] != '_' and name not in excluded:
                self.featureSet.addFeature(name, method(*arguments))
    finally:
        # Always clear the flag, even if a feature method raised; otherwise
        # the next call would skip tagging and reuse this document's tags.
        self.tagged = False

    return self.featureSet
class BaseExtractor():
    """Base class whose public methods are collected into a FeatureSet.

    getFeatureSet() calls every public bound method of the instance (apart
    from a few housekeeping methods) and records each result as a feature
    named after the method.
    """

    def __init__(self, documentName, indicators=None, functionToCall=None, paramList=None):
        """Store the configuration and create the parser helpers.

        documentName   -- stored on self.documentName
        indicators     -- kept only if it is a list or tuple, otherwise
                          replaced by an empty list
        functionToCall -- optional callable; together with paramList it
                          replaces per-method feature extraction in
                          getFeatureSet()
        paramList      -- arguments unpacked into functionToCall
        """
        self.featureSet = None
        self.documentName = documentName
        self.tagged = False

        # The parser directory is resolved against the current working directory.
        pathToParser = os.getcwd() + "/Parsers"
        self.textParser = TextParser(pathToParser)
        self.htmlParser = HTMLParser()

        self.functionToCall = functionToCall
        self.paramList = paramList

        if isinstance(indicators, (list, tuple)):
            #If 'indicators' is a list or tuple
            self.indicators = indicators
        else:
            self.indicators = []

    def getFeatureSet(self, documentName, documentCategory, params=None, documentClass=-1):
        """Build, store and return the FeatureSet for one document.

        If both self.functionToCall and self.paramList are set, the feature
        vector is taken from self.functionToCall(*self.paramList) and the
        FeatureSet is returned immediately.

        Otherwise every bound method whose name does not start with an
        underscore (and is not one of the excluded housekeeping methods) is
        called, and its result is added as a feature keyed by the method's
        name.

        params controls the arguments handed to each feature method:
          * None          -> methods are called with no arguments
          * list or tuple -> unpacked as positional arguments
          * anything else -> passed as the single positional argument
        If params is a string and self.tagged is False, it is first passed
        to self.textParser.tagText("temp", params).

        Returns the FeatureSet that was built (also kept on self.featureSet).
        """
        self.featureSet = FeatureSet(documentName, documentCategory, documentClass)

        # Explicit function override: use its result as the whole vector.
        if self.functionToCall is not None and self.paramList is not None:
            self.featureSet.setVector(self.functionToCall(*self.paramList))
            return self.featureSet

        if (not self.tagged) and isinstance(params, basestring):
            self.textParser.tagText("temp", params)
            self.tagged = True

        # Normalise params into one argument sequence so a single loop
        # serves both the "no arguments" and "with arguments" cases.
        if params is None:
            arguments = ()
        elif isinstance(params, (list, tuple)):
            arguments = params
        else:
            arguments = (params,)

        # Public methods that are not feature extractors.
        excluded = ('getFeatureSet', 'setFunctionArgTuple', 'scrapeWebsiteFromURL')

        try:
            for name, method in inspect.getmembers(self, predicate=inspect.ismethod):
                if name[0] != '_' and name not in excluded:
                    self.featureSet.addFeature(name, method(*arguments))
        finally:
            # Always clear the flag, even if a feature method raised;
            # otherwise the next call would skip tagging its text.
            self.tagged = False

        return self.featureSet

    def setFunctionArgTuple(self, functionArgTuple):
        """Set the override function and its parameters from a pair.

        functionArgTuple -- indexable whose item 0 is handed to
                            _setFunctionToCall and item 1 to
                            _setFunctionParams; None leaves the current
                            settings untouched.
        """
        if functionArgTuple is not None:
            self._setFunctionToCall(functionArgTuple[0])
            self._setFunctionParams(functionArgTuple[1])

    ###Utility Functions###
    #-1 means an error has occurred - e.g.
wrong parameter type passed into function def _charCountInString(self, textString, char): if isinstance(textString, basestring) and \ isinstance(char, basestring) and len(char) == 1: return len(textString.split(char)) - 1 else: return -1 #If charCount == 0 return value = 1 #If charCount is far greater than 0, return value approaches 0 #0.1 constant chosen, to reduce the effect of a chosen character being introduced #(in future constant should be based on 1/(Average Amount Of A Character In All Documents) #Done since there could be many of these special characters, over the span of a single document, #but not too many (max = approx. 25, for emails/websites [MUST RESEARCH TO DETERMINE IF VALID]), #making resulting values in range over the internal [0, 1] be spread out more evenly def _lackOfCharInString(self, textString, char): count = self._charCountInString(textString, char) if count != -1: try: return 1 / float(1 + (0.1 * count)) except Exception, e: raise e else:
class BaseExtractor():
    """Base class whose public methods are collected into a FeatureSet.

    getFeatureSet() calls every public bound method of the instance (apart
    from a few housekeeping methods) and records each result as a feature
    named after the method.
    """

    def __init__(self, documentName, indicators=None, functionToCall=None, paramList=None):
        """Store the configuration and create the parser helpers.

        documentName   -- stored on self.documentName
        indicators     -- kept only if it is a list or tuple, otherwise
                          replaced by an empty list
        functionToCall -- optional callable; together with paramList it
                          replaces per-method feature extraction in
                          getFeatureSet()
        paramList      -- arguments unpacked into functionToCall
        """
        self.featureSet = None
        self.documentName = documentName
        self.tagged = False

        # The parser directory is resolved against the current working directory.
        pathToParser = os.getcwd() + "/Parsers"
        self.textParser = TextParser(pathToParser)
        self.htmlParser = HTMLParser()

        self.functionToCall = functionToCall
        self.paramList = paramList

        if isinstance(indicators, (list, tuple)):
            #If 'indicators' is a list or tuple
            self.indicators = indicators
        else:
            self.indicators = []

    def getFeatureSet(self, documentName, documentCategory, params=None, documentClass=-1):
        """Build, store and return the FeatureSet for one document.

        If both self.functionToCall and self.paramList are set, the feature
        vector is taken from self.functionToCall(*self.paramList) and the
        FeatureSet is returned immediately.

        Otherwise every bound method whose name does not start with an
        underscore (and is not one of the excluded housekeeping methods) is
        called, and its result is added as a feature keyed by the method's
        name.

        params controls the arguments handed to each feature method:
          * None          -> methods are called with no arguments
          * list or tuple -> unpacked as positional arguments
          * anything else -> passed as the single positional argument
        If params is a string and self.tagged is False, it is first passed
        to self.textParser.tagText("temp", params).

        Returns the FeatureSet that was built (also kept on self.featureSet).
        """
        self.featureSet = FeatureSet(documentName, documentCategory, documentClass)

        # Explicit function override: use its result as the whole vector.
        if self.functionToCall is not None and self.paramList is not None:
            self.featureSet.setVector(self.functionToCall(*self.paramList))
            return self.featureSet

        if (not self.tagged) and isinstance(params, basestring):
            self.textParser.tagText("temp", params)
            self.tagged = True

        # Normalise params into one argument sequence so a single loop
        # serves both the "no arguments" and "with arguments" cases.
        if params is None:
            arguments = ()
        elif isinstance(params, (list, tuple)):
            arguments = params
        else:
            arguments = (params,)

        # Public methods that are not feature extractors.
        excluded = ('getFeatureSet', 'setFunctionArgTuple', 'scrapeWebsiteFromURL')

        try:
            for name, method in inspect.getmembers(self, predicate=inspect.ismethod):
                if name[0] != '_' and name not in excluded:
                    self.featureSet.addFeature(name, method(*arguments))
        finally:
            # Always clear the flag, even if a feature method raised;
            # otherwise the next call would skip tagging its text.
            self.tagged = False

        return self.featureSet

    def setFunctionArgTuple(self, functionArgTuple):
        """Set the override function and its parameters from a pair.

        functionArgTuple -- indexable whose item 0 is handed to
                            _setFunctionToCall and item 1 to
                            _setFunctionParams; None leaves the current
                            settings untouched.
        """
        if functionArgTuple is not None:
            self._setFunctionToCall(functionArgTuple[0])
            self._setFunctionParams(functionArgTuple[1])

    ###Utility Functions###
    #-1 means an error has occurred - e.g.
wrong parameter type passed into function def _charCountInString(self, textString, char): if isinstance(textString, basestring) and \ isinstance(char, basestring) and len(char) == 1: return len(textString.split(char))-1 else: return -1 #If charCount == 0 return value = 1 #If charCount is far greater than 0, return value approaches 0 #0.1 constant chosen, to reduce the effect of a chosen character being introduced #(in future constant should be based on 1/(Average Amount Of A Character In All Documents) #Done since there could be many of these special characters, over the span of a single document, #but not too many (max = approx. 25, for emails/websites [MUST RESEARCH TO DETERMINE IF VALID]), #making resulting values in range over the internal [0, 1] be spread out more evenly def _lackOfCharInString(self, textString, char): count = self._charCountInString(textString, char) if count != -1: try: return 1/float(1+(0.1*count)) except Exception, e: raise e else: