Ejemplo n.º 1
0
def _extractFromDocument(extractorSelector,
                         filepath,
                         documentClass,
                         index=None):
    documentString = readFromFile(filepath)
    print "\n---------\nDocument:\n---------\n", documentString, "\n---------"
    processedDocument = PreProcessor.removeEscapeChars(documentString)

    if index is not None:
        parser = TextParser(os.getcwd() + "/Parsers")
        email, isMultipart = parser.getEmailFromString(documentString)
        payload = email.get_payload()

        print "Email no. " + str(index) + ": "

        print "---------"
        for header in email.keys():
            print "\n" + header + ": " + email.get(header)
        print "\nPayload: " + payload
        print "---------"

        processedPayload = PreProcessor.removeEscapeChars(payload)
        return _selectExtractorAndProcess(extractorSelector,\
                                          processedDocument,\
                                          documentClass,\
                                          email.get("Message-Id"),\
                                          processedPayload)
    else:
        return _selectExtractorAndProcess(extractorSelector,\
                                          processedDocument,\
                                          documentClass)
Ejemplo n.º 2
0
    def __init__(self,
                 documentName,
                 indicators=None,
                 functionToCall=None,
                 paramList=None):
        """Set up the instance's initial state and build its parsers.

        Args:
            documentName: Stored unchanged as ``self.documentName``.
            indicators: Kept only when it is a ``list`` or ``tuple``; any
                other value (including the default ``None``) is replaced
                by a new empty list.
            functionToCall: Stored unchanged as ``self.functionToCall``.
            paramList: Stored unchanged as ``self.paramList``.
        """
        # Simple state: nothing extracted and nothing tagged yet.
        self.documentName = documentName
        self.featureSet = None
        self.tagged = False

        # The text parser is pointed at "<cwd>/Parsers", so the location
        # depends on the working directory at construction time.
        self.textParser = TextParser(os.getcwd() + "/Parsers")
        self.htmlParser = HTMLParser()

        self.functionToCall = functionToCall
        self.paramList = paramList

        # Accept only list/tuple indicators; fall back to an empty list.
        self.indicators = (indicators
                           if isinstance(indicators, (list, tuple))
                           else [])