def extractAllDocuments(self):
    """Extract feature sets from every known document and group them by category.

    Reads ``self.documentPaths`` (iterated as ``(label, document)`` pairs) and
    runs extraction either through ``ListProcessor.map`` (when
    ``self.isParallel`` is true) or one document at a time through
    ``_extractFromDocument``. Both paths yield pickled results which are
    unpickled here and flattened into a single list of feature sets.

    Each feature set is then filed into ``self.matrixDict`` under its
    ``documentCategory`` as ``[[classes/labels], [associated vectors]]``, and
    the resulting dictionary is printed to stdout.

    Returns:
        None. If ``self.documentPaths`` is empty, an error message is written
        to stderr and ``self.matrixDict`` is left untouched.
    """
    if not self.documentPaths:  # No documents found
        sys.stderr.write("Could not find any documents.\nPlease try again, or enter another file, or directory path.\n")
        return

    if self.isParallel:  # Parallel execution
        # NOTE(review): the selector is pickled per task, presumably so it can
        # be handed to worker processes -- confirm against ParallelExtractor.
        argsList = [(pickle.dumps(self.extractorSelector), convertString(document), label)
                    for label, document in self.documentPaths]
        # Never ask for more workers than there are documents. The cap is kept
        # in a local so one small batch does not permanently lower
        # self.maxParallelCoreCount for later, larger batches.
        workerCount = min(len(argsList), self.maxParallelCoreCount)
        documentList = [pickle.loads(item) for item in
                        ListProcessor.map(ParallelExtractor, argsList,
                                          options=[('popen', workerCount)])]
    else:  # Sequential execution
        argsList = [(self.extractorSelector, convertString(document), label)
                    for label, document in self.documentPaths]
        documentList = [pickle.loads(_extractFromDocument(*args)) for args in argsList]

    # Every document yields a list of feature sets; flatten them into one list.
    featureMatrix = []
    for featureSets in documentList:
        featureMatrix.extend(featureSets)

    for featureSet in featureMatrix:
        # category = FeatureExtractor name, e.g. 'TextFeatureExtractor'
        category = featureSet.documentCategory
        # [[classes/labels], [associated vectors]]
        entry = self.matrixDict.setdefault(category, [[], []])
        entry[0].append(featureSet.getClass())
        entry[1].append(featureSet.getVector())

    # Single-argument print(...) behaves exactly like the Python 2 print
    # statement, so the output is unchanged.
    print("---")
    for category in self.matrixDict:
        print(category)
        print(self.matrixDict[category])
    print("---")
def extractAllDocuments(self):
    """Extract feature sets from every known document and group them by category.

    Reads ``self.documentPaths`` (iterated as ``(label, document)`` pairs) and
    runs extraction either through ``ListProcessor.map`` (when
    ``self.isParallel`` is true) or one document at a time through
    ``_extractFromDocument``. Both paths yield pickled results which are
    unpickled here and flattened into a single list of feature sets.

    Each feature set is then filed into ``self.matrixDict`` under its
    ``documentCategory`` as ``[[classes/labels], [associated vectors]]``, and
    the resulting dictionary is printed to stdout.

    Returns:
        None. If ``self.documentPaths`` is empty, an error message is written
        to stderr and ``self.matrixDict`` is left untouched.
    """
    if not self.documentPaths:  # No documents found
        sys.stderr.write("Could not find any documents.\nPlease try again, or enter another file, or directory path.\n")
        return

    if self.isParallel:  # Parallel execution
        # NOTE(review): the selector is pickled per task, presumably so it can
        # be handed to worker processes -- confirm against ParallelExtractor.
        argsList = [(pickle.dumps(self.extractorSelector), convertString(document), label)
                    for label, document in self.documentPaths]
        # Never ask for more workers than there are documents. The cap is kept
        # in a local so one small batch does not permanently lower
        # self.maxParallelCoreCount for later, larger batches.
        workerCount = min(len(argsList), self.maxParallelCoreCount)
        documentList = [pickle.loads(item) for item in
                        ListProcessor.map(ParallelExtractor, argsList,
                                          options=[('popen', workerCount)])]
    else:  # Sequential execution
        argsList = [(self.extractorSelector, convertString(document), label)
                    for label, document in self.documentPaths]
        documentList = [pickle.loads(_extractFromDocument(*args)) for args in argsList]

    # Every document yields a list of feature sets; flatten them into one list.
    featureMatrix = []
    for featureSets in documentList:
        featureMatrix.extend(featureSets)

    for featureSet in featureMatrix:
        # category = FeatureExtractor name, e.g. 'TextFeatureExtractor'
        category = featureSet.documentCategory
        # [[classes/labels], [associated vectors]]
        entry = self.matrixDict.setdefault(category, [[], []])
        entry[0].append(featureSet.getClass())
        entry[1].append(featureSet.getVector())

    # Single-argument print(...) behaves exactly like the Python 2 print
    # statement, so the output is unchanged.
    print("---")
    for category in self.matrixDict:
        print(category)
        print(self.matrixDict[category])
    print("---")
def startMainMenu(self):
    """Run the interactive console menu until the user chooses to exit.

    Each pass prints the menu and reads an option with ``raw_input``:

    1. Classify document -- asks for the class the user expects (0 or 1) and
       a file path, extracts the document's feature sets with
       ``_extractFromDocument`` and prints every result produced by
       ``self.classifyDocument``.
    2. Train program with documents -- when the program was started without
       arguments (or only with the two-argument ``-p`` form) the user is
       asked for ``[class integer],[directory/file path];`` entries, which
       are resolved through ``self._getDocumentPaths``; afterwards the
       documents are extracted and the classifiers trained.
    3. Exit -- terminates the process via ``sys.exit(0)``.

    Entering ``b`` or ``back`` at a data-entry prompt returns to the menu.
    This method never returns normally.
    """
    time.sleep(0.5)  # Wait for Fake SMTP Server to start...

    while True:
        option = -1
        documentClass = -1
        documentPath = None

        # Single-argument print(...) is equivalent to the Python 2 print
        # statement, so the console output is unchanged.
        print("\n-------------------------\nSET Deception Detector:\n-------------------------")
        if self.isParallel:
            print("CPU Cores to be in use: %d\n" % self.maxParallelCoreCount)
        print("Press a number associated with the following options.")
        print("1) Classify document\n2) Train program with documents\n3) Exit")

        # Keep asking until a number in 1..3 is entered.
        while option < 1 or option > 3:
            try:
                option = int(raw_input("Please choose a valid option.\n"))
            except ValueError:
                option = -1

        # Integers are compared by value (==); identity (`is`) only happens
        # to work for small ints because of an interpreter caching detail.
        if option == 1:  # Classify a document
            # documentClass ends up as 0, 1, or None (user backed out).
            while documentClass not in (0, 1):
                inputMessage = raw_input(
                    "Please enter a valid class.\n0 = You consider the document to "
                    "be deceptive.\n1 = You consider the document to be non-deceptive.\n"
                    "Press 'b' or 'back' to exit data entry.\n")
                if inputMessage in ('b', 'back'):
                    documentClass = None
                    break
                try:
                    documentClass = int(inputMessage)
                except ValueError:
                    documentClass = None

            if documentClass is not None:
                inputMessage = raw_input(
                    "Now enter the filepath of the document to classify.\n"
                    "Press 'b' or 'back' to exit data entry.\n")
                documentPath = normpath(inputMessage)
                while not isfile(documentPath):
                    if inputMessage in ('b', 'back'):
                        documentPath = None
                        break
                    inputMessage = raw_input(
                        "Please enter a valid filepath.\n"
                        "Press 'b' or 'back' to exit data entry.\n")
                    documentPath = normpath(inputMessage)

            if documentPath is not None:
                featureSetList = pickle.loads(
                    _extractFromDocument(self.extractorSelector, documentPath, documentClass))
                for featureSet in featureSetList:
                    classifications = self.classifyDocument(
                        featureSet.documentCategory, documentClass, featureSet.getVector())
                    for each_classification in classifications:
                        print(each_classification)

        elif option == 2:  # Train classifiers with data from documents
            inputMessage = None
            documentPaths = None
            paths = None

            # No arguments passed to program
            # OR Parallel/Sequential processing selection argument
            if len(sys.argv) == 1 or (len(sys.argv) == 3 and '-p' in sys.argv):
                message = (
                    "\nNow enter the directory path or filepath of the document(s) "
                    "to use for training, using the following format:\n[class integer],[directory/file path];\n"
                    "----------------------------\nThe class integer represents the expected classification of the associated document/file:\n"
                    "deceptive = 0, non-deceptive = 1.\n"
                    "\nYou can enter in as many of these lines, as you'd like.\nPress 'b' or 'back' to exit data entry.\n")

                # Re-prompt until the first entry names an existing file or
                # directory, or the user backs out.
                while paths is None or not (isfile(paths) or isdir(paths)):
                    inputMessage = raw_input(message)
                    if inputMessage in ('b', 'back'):
                        documentPaths = None
                        break
                    documentPaths = normpath(inputMessage)
                    try:
                        # Separate classes and file/directory paths
                        paths = documentPaths.split(',')[1].split(';')[0]
                    except IndexError:
                        paths = None

                if documentPaths is not None:  # If file/directory paths exist...
                    for ch in ('\n', '\t', ' '):  # Removes unnecessary characters
                        documentPaths = documentPaths.replace(ch, '')
                    # If entering in the document list, on the fly...
                    self.documentPaths = self._getDocumentPaths(documentPaths)

            # If not leaving data entry...
            # NOTE(review): this step is placed outside the sys.argv branch so
            # that an already populated self.documentPaths is trained on
            # without prompting -- the original nesting was ambiguous, confirm.
            if inputMessage not in ('b', 'back') and self.documentPaths is not None:
                # Operate on this instance rather than a module-level
                # `detector` global, which need not exist or be this object.
                self.extractAllDocuments()
                self.trainClassifiers()

        elif option == 3:  # Quit program
            sys.exit(0)
def startMainMenu(self):
    """Run the interactive console menu until the user chooses to exit.

    Each pass prints the menu and reads an option with ``raw_input``:

    1. Classify document -- asks for the class the user expects (0 or 1) and
       a file path, extracts the document's feature sets with
       ``_extractFromDocument`` and prints every result produced by
       ``self.classifyDocument``.
    2. Train program with documents -- when the program was started without
       arguments (or only with the two-argument ``-p`` form) the user is
       asked for ``[class integer],[directory/file path];`` entries, which
       are resolved through ``self._getDocumentPaths``; afterwards the
       documents are extracted and the classifiers trained.
    3. Exit -- terminates the process via ``sys.exit(0)``.

    Entering ``b`` or ``back`` at a data-entry prompt returns to the menu.
    This method never returns normally.
    """
    time.sleep(0.5)  # Wait for Fake SMTP Server to start...

    while True:
        option = -1
        documentClass = -1
        documentPath = None

        # Single-argument print(...) is equivalent to the Python 2 print
        # statement, so the console output is unchanged.
        print("\n-------------------------\nSET Deception Detector:\n-------------------------")
        if self.isParallel:
            print("CPU Cores to be in use: %d\n" % self.maxParallelCoreCount)
        print("Press a number associated with the following options.")
        print("1) Classify document\n2) Train program with documents\n3) Exit")

        # Keep asking until a number in 1..3 is entered.
        while option < 1 or option > 3:
            try:
                option = int(raw_input("Please choose a valid option.\n"))
            except ValueError:
                option = -1

        # Integers are compared by value (==); identity (`is`) only happens
        # to work for small ints because of an interpreter caching detail.
        if option == 1:  # Classify a document
            # documentClass ends up as 0, 1, or None (user backed out).
            while documentClass not in (0, 1):
                inputMessage = raw_input(
                    "Please enter a valid class.\n0 = You consider the document to "
                    "be deceptive.\n1 = You consider the document to be non-deceptive.\n"
                    "Press 'b' or 'back' to exit data entry.\n")
                if inputMessage in ('b', 'back'):
                    documentClass = None
                    break
                try:
                    documentClass = int(inputMessage)
                except ValueError:
                    documentClass = None

            if documentClass is not None:
                inputMessage = raw_input(
                    "Now enter the filepath of the document to classify.\n"
                    "Press 'b' or 'back' to exit data entry.\n")
                documentPath = normpath(inputMessage)
                while not isfile(documentPath):
                    if inputMessage in ('b', 'back'):
                        documentPath = None
                        break
                    inputMessage = raw_input(
                        "Please enter a valid filepath.\n"
                        "Press 'b' or 'back' to exit data entry.\n")
                    documentPath = normpath(inputMessage)

            if documentPath is not None:
                featureSetList = pickle.loads(
                    _extractFromDocument(self.extractorSelector, documentPath, documentClass))
                for featureSet in featureSetList:
                    classifications = self.classifyDocument(
                        featureSet.documentCategory, documentClass, featureSet.getVector())
                    for each_classification in classifications:
                        print(each_classification)

        elif option == 2:  # Train classifiers with data from documents
            inputMessage = None
            documentPaths = None
            paths = None

            # No arguments passed to program
            # OR Parallel/Sequential processing selection argument
            if len(sys.argv) == 1 or (len(sys.argv) == 3 and '-p' in sys.argv):
                message = (
                    "\nNow enter the directory path or filepath of the document(s) "
                    "to use for training, using the following format:\n[class integer],[directory/file path];\n"
                    "----------------------------\nThe class integer represents the expected classification of the associated document/file:\n"
                    "deceptive = 0, non-deceptive = 1.\n"
                    "\nYou can enter in as many of these lines, as you'd like.\nPress 'b' or 'back' to exit data entry.\n")

                # Re-prompt until the first entry names an existing file or
                # directory, or the user backs out.
                while paths is None or not (isfile(paths) or isdir(paths)):
                    inputMessage = raw_input(message)
                    if inputMessage in ('b', 'back'):
                        documentPaths = None
                        break
                    documentPaths = normpath(inputMessage)
                    try:
                        # Separate classes and file/directory paths
                        paths = documentPaths.split(',')[1].split(';')[0]
                    except IndexError:
                        paths = None

                if documentPaths is not None:  # If file/directory paths exist...
                    for ch in ('\n', '\t', ' '):  # Removes unnecessary characters
                        documentPaths = documentPaths.replace(ch, '')
                    # If entering in the document list, on the fly...
                    self.documentPaths = self._getDocumentPaths(documentPaths)

            # If not leaving data entry...
            # NOTE(review): this step is placed outside the sys.argv branch so
            # that an already populated self.documentPaths is trained on
            # without prompting -- the original nesting was ambiguous, confirm.
            if inputMessage not in ('b', 'back') and self.documentPaths is not None:
                # Operate on this instance rather than a module-level
                # `detector` global, which need not exist or be this object.
                self.extractAllDocuments()
                self.trainClassifiers()

        elif option == 3:  # Quit program
            sys.exit(0)