Ejemplo n.º 1
0
    def extractAllDocuments(self):
        """Extract feature sets from every document in self.documentPaths.

        Each (label, document) pair is run through the extractor selector,
        either through ListProcessor.map (when self.isParallel is set) or
        sequentially in this process. The resulting feature sets are then
        grouped in self.matrixDict by their documentCategory, as
        [[classes/labels], [associated vectors]], and the whole dictionary
        is printed.

        When self.documentPaths is empty a message is written to stderr and
        the method returns None without touching self.matrixDict.
        """
        if not self.documentPaths:  #No documents found
            sys.stderr.write(
                "Could not find any documents.\nPlease try again, or enter another file, or directory path.\n"
            )
            return

        if self.isParallel:  #Parallel execution
            #The selector is handed to the workers in pickled form.
            argsList = [(pickle.dumps(self.extractorSelector),
                         convertString(document), label)
                        for label, document in self.documentPaths]

            #Never ask for more workers than there are documents. The value
            #is kept local so that one small batch does not permanently
            #lower self.maxParallelCoreCount for later, larger batches.
            coreCount = min(len(argsList), self.maxParallelCoreCount)

            documentList = [
                pickle.loads(item)
                for item in ListProcessor.map(ParallelExtractor,
                                              argsList,
                                              options=[('popen', coreCount)])
            ]
        else:  #Sequential execution
            argsList = [(self.extractorSelector, convertString(document), label)
                        for label, document in self.documentPaths]

            documentList = [
                pickle.loads(_extractFromDocument(*args)) for args in argsList
            ]

        #Flatten the per-document lists into one list of feature sets.
        featureMatrix = []
        for featureSets in documentList:
            featureMatrix.extend(featureSets)

        for featureSet in featureMatrix:
            #category = FeatureExtractor name, e.g. 'TextFeatureExtractor'
            category = featureSet.documentCategory

            #[[classes/labels][associated vectors]]
            classes, vectors = self.matrixDict.setdefault(category, [[], []])
            classes.append(featureSet.getClass())
            vectors.append(featureSet.getVector())

        #Single-argument print(...) is identical under the Python 2 print
        #statement and also valid as a Python 3 call.
        print("---")
        for k in self.matrixDict:
            print(k)
            print(self.matrixDict[k])
            print("---")
Ejemplo n.º 2
0
Archivo: Main.py Proyecto: Quantza/fyp
        def extractAllDocuments(self):
                """Extract feature sets from every document in self.documentPaths.

                Each (label, document) pair is run through the extractor
                selector, either through ListProcessor.map (when
                self.isParallel is set) or sequentially in this process. The
                resulting feature sets are then grouped in self.matrixDict by
                their documentCategory, as
                [[classes/labels], [associated vectors]], and the whole
                dictionary is printed.

                When self.documentPaths is empty a message is written to
                stderr and the method returns None without touching
                self.matrixDict.
                """
                if not self.documentPaths:  #No documents found
                    sys.stderr.write("Could not find any documents.\nPlease try again, or enter another file, or directory path.\n")
                    return

                if self.isParallel:  #Parallel execution
                    #The selector is handed to the workers in pickled form.
                    argsList = [(pickle.dumps(self.extractorSelector), convertString(document), label)
                                for label, document in self.documentPaths]

                    #Never ask for more workers than there are documents. The
                    #value is kept local so that one small batch does not
                    #permanently lower self.maxParallelCoreCount for later,
                    #larger batches.
                    coreCount = min(len(argsList), self.maxParallelCoreCount)

                    documentList = [pickle.loads(item) for item in
                                    ListProcessor.map(ParallelExtractor, argsList, options=[('popen', coreCount)])]
                else:  #Sequential execution
                    argsList = [(self.extractorSelector, convertString(document), label)
                                for label, document in self.documentPaths]

                    documentList = [pickle.loads(_extractFromDocument(*args)) for args in argsList]

                #Flatten the per-document lists into one list of feature sets.
                featureMatrix = []
                for featureSets in documentList:
                    featureMatrix.extend(featureSets)

                for featureSet in featureMatrix:
                    #category = FeatureExtractor name, e.g. 'TextFeatureExtractor'
                    category = featureSet.documentCategory

                    #[[classes/labels][associated vectors]]
                    classes, vectors = self.matrixDict.setdefault(category, [[], []])
                    classes.append(featureSet.getClass())
                    vectors.append(featureSet.getVector())

                #Single-argument print(...) is identical under the Python 2
                #print statement and also valid as a Python 3 call.
                print("---")
                for k in self.matrixDict:
                    print(k)
                    print(self.matrixDict[k])
                    print("---")
Ejemplo n.º 3
0
    def startMainMenu(self):
        """Run the interactive console menu until the user chooses to exit.

        Every pass prints the menu and reads an option from stdin:
          1) ask for a class (0 or 1) and a file path, extract the document's
             feature sets and print each result of self.classifyDocument;
          2) when the program was started without document arguments, ask
             for the training paths; then extract all documents and train
             the classifiers;
          3) terminate the process through sys.exit(0).

        Entering 'b' or 'back' at a data-entry prompt abandons that entry and
        returns to the menu.
        """
        time.sleep(0.5)  #Wait for Fake SMTP Server to start...
        backCommands = ('b', 'back')

        while True:
            #Single-argument print(...) is identical under the Python 2 print
            #statement and also valid as a Python 3 call.
            print("\n-------------------------\nSET Deception Detector:\n-------------------------")
            if self.isParallel:
                print("CPU Cores to be in use: %d\n" % self.maxParallelCoreCount)
            print("Press a number associated with the following options.")
            print("1) Classify document\n2) Train program with documents\n3) Exit")

            option = -1
            while option not in (1, 2, 3):
                try:
                    option = int(raw_input("Please choose a valid option.\n"))
                except ValueError:
                    option = -1

            #Options are compared with ==; identity ("is") on integers only
            #works by accident of the interpreter's small-int caching.
            if option == 1:  #Classify a document
                documentClass = -1
                while documentClass not in (0, 1):
                    inputMessage = raw_input(
                        "Please enter a valid class.\n0 = You consider the document to "
                        "be deceptive.\n1 = You consider the document to be non-deceptive.\nPress 'b' or 'back' to exit data entry.\n"
                    )
                    if inputMessage in backCommands:
                        documentClass = None
                        break

                    try:
                        documentClass = int(inputMessage)
                    except ValueError:
                        documentClass = None

                if documentClass is None:  #User backed out
                    continue

                inputMessage = raw_input(
                    "Now enter the filepath of the document to classify.\nPress 'b' or 'back' to exit data entry.\n"
                )
                documentPath = normpath(inputMessage)

                while not isfile(documentPath):
                    if inputMessage in backCommands:
                        documentPath = None
                        break

                    inputMessage = raw_input(
                        "Please enter a valid filepath.\nPress 'b' or 'back' to exit data entry.\n"
                    )
                    documentPath = normpath(inputMessage)

                if documentPath is None:  #User backed out
                    continue

                featureSetList = pickle.loads(
                    _extractFromDocument(self.extractorSelector, documentPath,
                                         documentClass))
                for featureSet in featureSetList:
                    classifications = self.classifyDocument(
                        featureSet.documentCategory, documentClass,
                        featureSet.getVector())
                    for each_classification in classifications:
                        print(each_classification)

            elif option == 2:  #Train classifiers with data from documents
                inputMessage = None

                #No arguments passed to program
                # OR Parallel/Sequential processing selection argument
                if len(sys.argv) == 1 or (len(sys.argv) == 3
                                          and '-p' in sys.argv):
                    message = (
                        "\nNow enter the directory path or filepath of the document(s) "
                        "to use for training, using the following format:\n[class integer],[directory/file path];\n"
                        "----------------------------\nThe class integer represents the expected classification of the associated document/file:\n"
                        "deceptive = 0, non-deceptive = 1.\n"
                        "\nYou can enter in as many of these lines, as you'd like.\nPress 'b' or 'back' to exit data entry.\n"
                    )

                    documentPaths = None
                    paths = None
                    while paths is None or not (isfile(paths) or isdir(paths)):
                        inputMessage = raw_input(message)

                        if inputMessage in backCommands:
                            documentPaths = None
                            break

                        documentPaths = normpath(inputMessage)
                        try:
                            #Separate classes and file/directory paths; only
                            #the first entry's path is validated here.
                            paths = documentPaths.split(',')[1].split(';')[0]
                        except IndexError:
                            paths = None

                    if documentPaths is not None:  #If file/directory paths exist...
                        #Removes unnecessary characters
                        for ch in ('\n', '\t', ' '):
                            documentPaths = documentPaths.replace(ch, '')

                        #If entering in the document list, on the fly...
                        self.documentPaths = self._getDocumentPaths(documentPaths)

                #If not leaving data entry...
                if inputMessage not in backCommands\
                   and self.documentPaths is not None:
                    #Operate on this instance rather than on a module-level
                    #global that merely happens to reference it.
                    self.extractAllDocuments()
                    self.trainClassifiers()

            elif option == 3:  #Quit program
                sys.exit(0)
Ejemplo n.º 4
0
Archivo: Main.py Proyecto: Quantza/fyp
        def startMainMenu(self):
                """Run the interactive console menu until the user chooses to exit.

                Every pass prints the menu and reads an option from stdin:
                  1) ask for a class (0 or 1) and a file path, extract the
                     document's feature sets and print each result of
                     self.classifyDocument;
                  2) when the program was started without document arguments,
                     ask for the training paths; then extract all documents
                     and train the classifiers;
                  3) terminate the process through sys.exit(0).

                Entering 'b' or 'back' at a data-entry prompt abandons that
                entry and returns to the menu.
                """
                time.sleep(0.5)  #Wait for Fake SMTP Server to start...
                backCommands = ('b', 'back')

                while True:
                    #Single-argument print(...) is identical under the Python 2
                    #print statement and also valid as a Python 3 call.
                    print("\n-------------------------\nSET Deception Detector:\n-------------------------")
                    if self.isParallel:
                        print("CPU Cores to be in use: %d\n" % self.maxParallelCoreCount)
                    print("Press a number associated with the following options.")
                    print("1) Classify document\n2) Train program with documents\n3) Exit")

                    option = -1
                    while option not in (1, 2, 3):
                        try:
                            option = int(raw_input("Please choose a valid option.\n"))
                        except ValueError:
                            option = -1

                    #Options are compared with ==; identity ("is") on integers
                    #only works by accident of the interpreter's small-int caching.
                    if option == 1:  #Classify a document
                        documentClass = -1
                        while documentClass not in (0, 1):
                            inputMessage = raw_input(
                                "Please enter a valid class.\n0 = You consider the document to "
                                "be deceptive.\n1 = You consider the document to be non-deceptive.\nPress 'b' or 'back' to exit data entry.\n")
                            if inputMessage in backCommands:
                                documentClass = None
                                break

                            try:
                                documentClass = int(inputMessage)
                            except ValueError:
                                documentClass = None

                        if documentClass is None:  #User backed out
                            continue

                        inputMessage = raw_input("Now enter the filepath of the document to classify.\nPress 'b' or 'back' to exit data entry.\n")
                        documentPath = normpath(inputMessage)

                        while not isfile(documentPath):
                            if inputMessage in backCommands:
                                documentPath = None
                                break

                            inputMessage = raw_input("Please enter a valid filepath.\nPress 'b' or 'back' to exit data entry.\n")
                            documentPath = normpath(inputMessage)

                        if documentPath is None:  #User backed out
                            continue

                        featureSetList = pickle.loads(_extractFromDocument(self.extractorSelector, documentPath, documentClass))
                        for featureSet in featureSetList:
                            classifications = self.classifyDocument(
                                featureSet.documentCategory, documentClass, featureSet.getVector())
                            for each_classification in classifications:
                                print(each_classification)

                    elif option == 2:  #Train classifiers with data from documents
                        inputMessage = None

                        #No arguments passed to program
                        # OR Parallel/Sequential processing selection argument
                        if len(sys.argv) == 1 or (len(sys.argv) == 3 and '-p' in sys.argv):
                            message = (
                                "\nNow enter the directory path or filepath of the document(s) "
                                "to use for training, using the following format:\n[class integer],[directory/file path];\n"
                                "----------------------------\nThe class integer represents the expected classification of the associated document/file:\n"
                                "deceptive = 0, non-deceptive = 1.\n"
                                "\nYou can enter in as many of these lines, as you'd like.\nPress 'b' or 'back' to exit data entry.\n"
                            )

                            documentPaths = None
                            paths = None
                            while paths is None or not (isfile(paths) or isdir(paths)):
                                inputMessage = raw_input(message)

                                if inputMessage in backCommands:
                                    documentPaths = None
                                    break

                                documentPaths = normpath(inputMessage)
                                try:
                                    #Separate classes and file/directory paths;
                                    #only the first entry's path is validated here.
                                    paths = documentPaths.split(',')[1].split(';')[0]
                                except IndexError:
                                    paths = None

                            if documentPaths is not None:  #If file/directory paths exist...
                                #Removes unnecessary characters
                                for ch in ('\n', '\t', ' '):
                                    documentPaths = documentPaths.replace(ch, '')

                                #If entering in the document list, on the fly...
                                self.documentPaths = self._getDocumentPaths(documentPaths)

                        #If not leaving data entry...
                        if inputMessage not in backCommands\
                           and self.documentPaths is not None:
                            #Operate on this instance rather than on a
                            #module-level global that merely happens to
                            #reference it.
                            self.extractAllDocuments()
                            self.trainClassifiers()

                    elif option == 3:  #Quit program
                        sys.exit(0)