Example #1
0
def _extractFromDocument(extractorSelector, filepath, documentClass, index=None):   
    documentString = readFromFile(filepath)
    print "\n---------\nDocument:\n---------\n", documentString, "\n---------"
    processedDocument = PreProcessor.removeEscapeChars(documentString)

    if index is not None:
            parser = TextParser(os.getcwd()+"/Parsers")
            email, isMultipart = parser.getEmailFromString(documentString)
            payload = email.get_payload()
            
            print "Email no. "+str(index)+": "

            print "---------"
            for header in email.keys():
                    print "\n"+header+": "+email.get(header)
            print "\nPayload: "+payload
            print "---------"

            processedPayload = PreProcessor.removeEscapeChars(payload)
            return _selectExtractorAndProcess(extractorSelector,\
                                              processedDocument,\
                                              documentClass,\
                                              email.get("Message-Id"),\
                                              processedPayload)
    else:
            return _selectExtractorAndProcess(extractorSelector,\
                                              processedDocument,\
                                              documentClass)                      
Example #2
0
def _extractFromDocument(extractorSelector,
                         filepath,
                         documentClass,
                         index=None):
    documentString = readFromFile(filepath)
    print "\n---------\nDocument:\n---------\n", documentString, "\n---------"
    processedDocument = PreProcessor.removeEscapeChars(documentString)

    if index is not None:
        parser = TextParser(os.getcwd() + "/Parsers")
        email, isMultipart = parser.getEmailFromString(documentString)
        payload = email.get_payload()

        print "Email no. " + str(index) + ": "

        print "---------"
        for header in email.keys():
            print "\n" + header + ": " + email.get(header)
        print "\nPayload: " + payload
        print "---------"

        processedPayload = PreProcessor.removeEscapeChars(payload)
        return _selectExtractorAndProcess(extractorSelector,\
                                          processedDocument,\
                                          documentClass,\
                                          email.get("Message-Id"),\
                                          processedPayload)
    else:
        return _selectExtractorAndProcess(extractorSelector,\
                                          processedDocument,\
                                          documentClass)
Example #3
0
    def __init__(self, *args):
        """Set defaults, check NLTK data, then apply command-line options.

        Args:
            *args: getopt-style argument strings. Supported options:
                ``-d PATH`` / ``--documentlist=PATH`` -- file whose contents
                (entries delimited by semi-colons) are handed to
                ``self._getDocumentPaths`` after newlines, tabs and spaces
                have been removed;
                ``-p N`` / ``--parallel=N`` -- ``0`` disables and ``1``
                enables parallel processing; any other value is ignored.

        Raises:
            RuntimeError: if a required NLTK data set could not be
                downloaded.
        """
        ###Defaults###

        # Worker count: ceil(cores / 2) on machines with up to 8 cores,
        # otherwise ceil(3/4 of the cores).
        cpuCount = multiprocessing.cpu_count()
        if cpuCount <= 8:
            self.maxParallelCoreCount = int(ceil(float(cpuCount) / 2))
        else:
            self.maxParallelCoreCount = int(ceil(0.75 * cpuCount))

        self.extractorDictionary = {'text': gfe(), 'html': gfe()}
        self.documentPaths = []
        self.extractorSelector = None
        self.isParallel = True

        self.matrixDict = OrderedDict()
        self.svms = None
        self.dTrees = None
        self.naiveBayes = None

        ###Dependency checks###

        # all() short-circuits exactly like the chained `and`: the first
        # failed download stops further attempts.
        if not all(downloadNLTKData(name)
                   for name in ('punkt', 'cmudict', 'wordnet')):
            raise RuntimeError(
                "\n\nCould not download the required nltk dependencies.\n")

        ###User arguments###
        #Text must be delimited by semi-colon, in
        #each file passed into the program
        options, _ = getopt.getopt(args, 'd:p:',
                                   ['documentlist=', 'parallel='])

        for opt, arg in options:
            if opt in ('-d', '--documentlist'):
                documentListString = readFromFile(normpath(arg))

                #Removes unnecessary characters
                for ch in ('\n', '\t', ' '):
                    documentListString = documentListString.replace(ch, '')

                self.documentPaths = self._getDocumentPaths(documentListString)

            elif opt in ('-p', '--parallel'):
                # Compare against the literal flags instead of calling
                # int(arg): a single non-digit character (e.g. "-p x") used
                # to raise ValueError, while every other invalid value was
                # silently ignored. Now all invalid values are ignored.
                if arg in ('0', '1'):
                    self.isParallel = (arg == '1')

        self.extractorSelector = self._createExtractor(
            self.extractorDictionary)
Example #4
0
File: Main.py Project: Quantza/fyp
        def __init__(self, *args):
                """Set defaults, check NLTK data, then apply command-line options.

                Args:
                    *args: getopt-style argument strings. Supported options:
                        ``-d PATH`` / ``--documentlist=PATH`` -- file whose
                        contents (entries delimited by semi-colons) are handed
                        to ``self._getDocumentPaths`` after newlines, tabs and
                        spaces have been removed;
                        ``-p N`` / ``--parallel=N`` -- ``0`` disables and ``1``
                        enables parallel processing; other values are ignored.

                Raises:
                    RuntimeError: if a required NLTK data set could not be
                        downloaded.
                """
                ###Defaults###

                # Worker count: ceil(cores / 2) on machines with up to 8 cores,
                # otherwise ceil(3/4 of the cores).
                cpuCount = multiprocessing.cpu_count()
                if cpuCount <= 8:
                    self.maxParallelCoreCount = int(ceil(float(cpuCount) / 2))
                else:
                    self.maxParallelCoreCount = int(ceil(0.75 * cpuCount))

                self.extractorDictionary = {'text': gfe(), 'html': gfe()}
                self.documentPaths = []
                self.extractorSelector = None
                self.isParallel = True

                self.matrixDict = OrderedDict()
                self.svms = None
                self.dTrees = None
                self.naiveBayes = None

                ###Dependency checks###

                # all() short-circuits exactly like the chained `and`: the
                # first failed download stops further attempts.
                if not all(downloadNLTKData(name)
                           for name in ('punkt', 'cmudict', 'wordnet')):
                    raise RuntimeError("\n\nCould not download the required nltk dependencies.\n")

                ###User arguments###
                #Text must be delimited by semi-colon, in
                #each file passed into the program
                options, _ = getopt.getopt(args, 'd:p:', ['documentlist=', 'parallel='])

                for opt, arg in options:
                    if opt in ('-d', '--documentlist'):
                        documentListString = readFromFile(normpath(arg))

                        #Removes unnecessary characters
                        for ch in ('\n', '\t', ' '):
                            documentListString = documentListString.replace(ch, '')

                        self.documentPaths = self._getDocumentPaths(documentListString)

                    elif opt in ('-p', '--parallel'):
                        # Compare against the literal flags instead of calling
                        # int(arg): a single non-digit character (e.g. "-p x")
                        # used to raise ValueError, while every other invalid
                        # value was silently ignored. Now all are ignored.
                        if arg in ('0', '1'):
                            self.isParallel = (arg == '1')

                self.extractorSelector = self._createExtractor(self.extractorDictionary)
Example #5
0
        def tagTextFile(self, documentName, textFilePath, useCriteria=False):
                """Tag a text file line by line and store the result.

                Each line of the file at ``textFilePath`` is split on
                whitespace and passed to ``self.stanfordTagger.tag``. The
                resulting pairs are saved as a list under
                ``self.taggedText[documentName]``.

                Args:
                    documentName: key under which the tagged pairs are stored.
                    textFilePath: path handed to ``readFromFile``.
                    useCriteria: when true, keep only pairs whose second
                        element is in ``self.tagCriteria``.
                """
                taggedPairs = []
                for textLine in readFromFile(textFilePath).splitlines():
                        taggedPairs.extend(self.stanfordTagger.tag(textLine.split()))

                if useCriteria:
                        # Keep only pairs whose tag is in the criteria.
                        selected = [(token, tag) for token, tag in taggedPairs
                                    if tag in self.tagCriteria]
                else:
                        # Rebuild every pair as a plain 2-tuple.
                        selected = [(token, tag) for token, tag in taggedPairs]

                self.taggedText[documentName] = selected