def _extractFromDocument(extractorSelector, filepath, documentClass, index=None): documentString = readFromFile(filepath) print "\n---------\nDocument:\n---------\n", documentString, "\n---------" processedDocument = PreProcessor.removeEscapeChars(documentString) if index is not None: parser = TextParser(os.getcwd()+"/Parsers") email, isMultipart = parser.getEmailFromString(documentString) payload = email.get_payload() print "Email no. "+str(index)+": " print "---------" for header in email.keys(): print "\n"+header+": "+email.get(header) print "\nPayload: "+payload print "---------" processedPayload = PreProcessor.removeEscapeChars(payload) return _selectExtractorAndProcess(extractorSelector,\ processedDocument,\ documentClass,\ email.get("Message-Id"),\ processedPayload) else: return _selectExtractorAndProcess(extractorSelector,\ processedDocument,\ documentClass)
def _extractFromDocument(extractorSelector, filepath, documentClass, index=None): documentString = readFromFile(filepath) print "\n---------\nDocument:\n---------\n", documentString, "\n---------" processedDocument = PreProcessor.removeEscapeChars(documentString) if index is not None: parser = TextParser(os.getcwd() + "/Parsers") email, isMultipart = parser.getEmailFromString(documentString) payload = email.get_payload() print "Email no. " + str(index) + ": " print "---------" for header in email.keys(): print "\n" + header + ": " + email.get(header) print "\nPayload: " + payload print "---------" processedPayload = PreProcessor.removeEscapeChars(payload) return _selectExtractorAndProcess(extractorSelector,\ processedDocument,\ documentClass,\ email.get("Message-Id"),\ processedPayload) else: return _selectExtractorAndProcess(extractorSelector,\ processedDocument,\ documentClass)
def __init__(self, *args):
    """Set defaults, check the NLTK data, then apply getopt-style options.

    `args` is parsed with `getopt` and understands:
      -d / --documentlist FILE  contents of FILE, with newlines, tabs and
                                spaces removed, are handed to
                                `self._getDocumentPaths`.
      -p / --parallel N         single-character value; 0 turns
                                `self.isParallel` off, 1 turns it on, any
                                other integer is ignored.

    Raises RuntimeError when any `downloadNLTKData` call returns a falsy
    value.
    """
    ###Defaults###
    cpuCount = multiprocessing.cpu_count()
    #Half of the cores (rounded up) when there are at most 8 of them,
    #otherwise three quarters of the total CPU count (rounded up).
    if cpuCount <= 8:
        self.maxParallelCoreCount = int(ceil(float(cpuCount) / 2))
    else:
        self.maxParallelCoreCount = int(ceil(0.75 * cpuCount))
    self.extractorDictionary = {'text': gfe(), 'html': gfe()}
    self.documentPaths = []
    self.extractorSelector = None
    self.isParallel = True
    self.matrixDict = OrderedDict()
    self.svms = None
    self.dTrees = None
    self.naiveBayes = None

    ###Dependency checks###
    #all() stops at the first failed download, like a chained `and`.
    requiredPackages = ('punkt', 'cmudict', 'wordnet')
    if not all(downloadNLTKData(name) for name in requiredPackages):
        raise RuntimeError(
            "\n\nCould not download the required nltk dependencies.\n")

    ###User arguments###
    #Text must be delimited by semi-colon, in
    #each file passed into the program
    parsedOptions, _extras = getopt.getopt(
        args, 'd:p:', ['documentlist=', 'parallel='])
    for flag, value in parsedOptions:
        path = normpath(value)
        if flag in ('-d', '--documentlist'):
            listing = readFromFile(path)
            #Removes unnecessary characters
            for junk in ('\n', '\t', ' '):
                if junk in listing:
                    listing = listing.replace(junk, '')
            self.documentPaths = self._getDocumentPaths(listing)
        elif flag in ('-p', '--parallel'):
            if isinstance(value, basestring) and len(value) == 1:
                #NOTE(review): int() raises ValueError for a non-digit.
                choice = int(value)
                if choice in (0, 1):
                    self.isParallel = (choice == 1)

    self.extractorSelector = self._createExtractor(self.extractorDictionary)
def __init__(self, *args):
    """Initialise defaults, verify NLTK data and process option arguments.

    Options recognised in `args` (parsed by `getopt`):
      -d / --documentlist FILE  FILE is read, newlines/tabs/spaces are
                                stripped from its contents, and the result
                                is passed to `self._getDocumentPaths`.
      -p / --parallel N         only single-character values are looked at;
                                0 clears `self.isParallel`, 1 sets it.

    Raises RuntimeError if a `downloadNLTKData` call returns a falsy value.
    """
    ###Defaults###
    cpuCount = multiprocessing.cpu_count()
    #Core count is ceil(cores/2) for machines with up to 8 cores,
    #and ceil(3/4 * cores) for anything larger.
    if cpuCount <= 8:
        coreBudget = ceil(float(cpuCount) / 2)
    else:
        coreBudget = ceil(0.75 * cpuCount)
    self.maxParallelCoreCount = int(coreBudget)
    self.extractorDictionary = {'text': gfe(), 'html': gfe()}
    self.documentPaths = []
    self.extractorSelector = None
    self.isParallel = True
    self.matrixDict = OrderedDict()
    self.svms = None
    self.dTrees = None
    self.naiveBayes = None

    ###Dependency checks###
    #Evaluation stops at the first download that fails.
    nltkReady = all(
        downloadNLTKData(package)
        for package in ('punkt', 'cmudict', 'wordnet'))
    if not nltkReady:
        raise RuntimeError(
            "\n\nCould not download the required nltk dependencies.\n")

    ###User arguments###
    #Text must be delimited by semi-colon, in
    #each file passed into the program
    longNames = ['documentlist=', 'parallel=']
    optionPairs, _remaining = getopt.getopt(args, 'd:p:', longNames)
    for name, rawValue in optionPairs:
        path = normpath(rawValue)
        if name in ('-d', '--documentlist'):
            contents = readFromFile(path)
            #Removes unnecessary characters
            for unwanted in ('\n', '\t', ' '):
                if unwanted in contents:
                    contents = contents.replace(unwanted, '')
            self.documentPaths = self._getDocumentPaths(contents)
        elif name in ('-p', '--parallel'):
            if isinstance(rawValue, basestring) and len(rawValue) == 1:
                #NOTE(review): a non-digit character makes int() raise.
                number = int(rawValue)
                if number == 1:
                    self.isParallel = True
                elif number == 0:
                    self.isParallel = False

    self.extractorSelector = self._createExtractor(self.extractorDictionary)
def tagTextFile(self, documentName, textFilePath, useCriteria=False):
    """Tag a text file line by line and store the pairs under `documentName`.

    Each line of the file read from `textFilePath` is split on whitespace
    and passed to `self.stanfordTagger.tag`; the returned pairs are
    collected in order. With `useCriteria` set, only pairs whose second
    element is in `self.tagCriteria` are kept. The resulting list of
    2-tuples is written to `self.taggedText[documentName]`; nothing is
    returned.
    """
    tagged = []
    for row in readFromFile(textFilePath).splitlines():
        tagged.extend(self.stanfordTagger.tag(row.split()))

    if useCriteria:
        kept = [(token, tag) for token, tag in tagged
                if tag in self.tagCriteria]
    else:
        # Rebuild each pair so the stored items are always plain tuples.
        kept = [(token, tag) for token, tag in tagged]

    self.taggedText[documentName] = kept