def __init__(self, *args):
    """Set default state, check the NLTK data sets and apply option strings.

    Args:
        *args: Option strings in getopt form. Recognised options:
            -d / --documentlist PATH: file read with readFromFile; its
                contents, with newlines, tabs and spaces removed, are
                handed to self._getDocumentPaths to fill
                self.documentPaths.
            -p / --parallel FLAG: '0' turns parallel mode off, '1' turns
                it on; any other value leaves the setting unchanged.

    Raises:
        RuntimeError: if downloadNLTKData reports failure for any of the
            required data sets.
        getopt.GetoptError: if an option is unknown or lacks its argument.
    """
    ###Defaults###
    cpuCount = multiprocessing.cpu_count()

    #Core count ranges from 1 to ceil(num_of_cores/2), if core count <= 8,
    #else is approx. or exactly 3/4 of the total CPU count.
    if cpuCount <= 8:
        self.maxParallelCoreCount = int(ceil(float(cpuCount)/2))
    else:
        self.maxParallelCoreCount = int(ceil(0.75*cpuCount))

    self.extractorDictionary = {'text': gfe(), 'html': gfe()}
    self.documentPaths = []
    self.extractorSelector = None
    self.isParallel = True
    self.matrixDict = OrderedDict()
    self.svms = None
    self.dTrees = None
    self.naiveBayes = None

    ###Dependency checks###
    #all() stops at the first failure, like the chained `and` it replaces.
    requiredData = ('punkt', 'cmudict', 'wordnet')
    if not all(downloadNLTKData(name) for name in requiredData):
        raise RuntimeError(
            "\n\nCould not download the required nltk dependencies.\n")

    ###User arguments###
    #Text must be delimited by semi-colon, in
    #each file passed into the program
    options, _ = getopt.getopt(args, 'd:p:',
                               ['documentlist=', 'parallel='])

    for opt, arg in options:
        if opt in ('-d', '--documentlist'):
            #Only this option carries a path, so normalise it here.
            documentListString = readFromFile(normpath(arg))

            for ch in ('\n', '\t', ' '):  #Removes unnecessary characters
                documentListString = documentListString.replace(ch, '')

            self.documentPaths = self._getDocumentPaths(documentListString)
        elif opt in ('-p', '--parallel'):
            #Compare the raw text instead of calling int(): a non-numeric
            #flag such as 'x' is then ignored like every other unsupported
            #value rather than raising ValueError.
            if arg == '0':
                self.isParallel = False
            elif arg == '1':
                self.isParallel = True

    #Built last, once every option has been applied.
    self.extractorSelector = self._createExtractor(self.extractorDictionary)
def __init__(self, *args):
    """Set default state, check the NLTK data sets and apply option strings.

    Args:
        *args: Option strings in getopt form. Recognised options:
            -d / --documentlist PATH: file read with readFromFile; its
                contents, with newlines, tabs and spaces removed, are
                handed to self._getDocumentPaths to fill
                self.documentPaths.
            -p / --parallel FLAG: '0' turns parallel mode off, '1' turns
                it on; any other value leaves the setting unchanged.

    Raises:
        RuntimeError: if downloadNLTKData reports failure for any of the
            required data sets.
        getopt.GetoptError: if an option is unknown or lacks its argument.
    """
    ###Defaults###
    cpuCount = multiprocessing.cpu_count()

    #Core count ranges from 1 to ceil(num_of_cores/2), if core count <= 8,
    #else is approx. or exactly 3/4 of the total CPU count.
    if cpuCount <= 8:
        self.maxParallelCoreCount = int(ceil(float(cpuCount)/2))
    else:
        self.maxParallelCoreCount = int(ceil(0.75*cpuCount))

    self.extractorDictionary = {'text':gfe(), 'html':gfe()}
    self.documentPaths = []
    self.extractorSelector = None
    self.isParallel = True
    self.matrixDict = OrderedDict()
    self.svms = None
    self.dTrees = None
    self.naiveBayes = None

    ###Dependency checks###
    #all() stops at the first failure, like the chained `and` it replaces.
    requiredData = ('punkt', 'cmudict', 'wordnet')
    if not all(downloadNLTKData(name) for name in requiredData):
        raise RuntimeError("\n\nCould not download the required nltk dependencies.\n")

    ###User arguments###
    #Text must be delimited by semi-colon, in
    #each file passed into the program
    options, _ = getopt.getopt(args, 'd:p:', ['documentlist=', 'parallel='])

    for opt, arg in options:
        if opt in ('-d', '--documentlist'):
            #Only this option carries a path, so normalise it here.
            documentListString = readFromFile(normpath(arg))

            for ch in ('\n', '\t', ' '):  #Removes unnecessary characters
                documentListString = documentListString.replace(ch, '')

            self.documentPaths = self._getDocumentPaths(documentListString)
        elif opt in ('-p', '--parallel'):
            #Compare the raw text instead of calling int(): a non-numeric
            #flag such as 'x' is then ignored like every other unsupported
            #value rather than raising ValueError.
            if arg == '0':
                self.isParallel = False
            elif arg == '1':
                self.isParallel = True

    #Built last, once every option has been applied.
    self.extractorSelector = self._createExtractor(self.extractorDictionary)
def __init__(self, extractorDict):
    """Record the per-category extractors and each category's indicator set.

    Args:
        extractorDict: mapping of category name to extractor object. It is
            stored by reference as self.extractorDictionary, not copied.
    """
    #Build the mapping locally and bind it to the instance afterwards. This
    #method never created self.categoryDictionary, so assigning into it
    #either raised AttributeError or wrote into a dictionary reached
    #through the class and therefore shared by every instance.
    #NOTE(review): the class body is not visible from here - confirm that
    #nothing relies on a class-level categoryDictionary being mutated.
    categoryDictionary = {}

    for category in extractorDict:
        extractor = extractorDict[category]

        if isinstance(extractor, be):
            categoryDictionary[category] = set(extractor.indicators)
        else:
            #Anything that is not a `be` instance contributes no indicators.
            categoryDictionary[category] = set()

    self.categoryDictionary = categoryDictionary
    self.extractorDictionary = extractorDict
    self.defaultExtractor = gfe()