class Collector:
    """Collector application.

    Uses a Traverse over ``config.dirs`` to find files, parses each file
    that passes with a parser chosen by mime type, tags the parsed text with
    a PartOfSpeechTagger and accumulates per-tag word counts. The counts are
    written out as JSON at the end of ``start()``.
    """

    def __init__(self, config, wordTags):
        """Create the traversal, the tagger and the output files.

        config   -- settings object; this class reads its ``dirs``,
                    ``acceptedFiles``, ``htmltags`` and ``output`` attributes.
        wordTags -- handed unchanged to PartOfSpeechTagger.

        Raises IOError if the output file cannot be opened.
        """
        self._config = config
        self.__traverse = Traverse(config.dirs)
        self.__posTagger = PartOfSpeechTagger(wordTags)
        # Maps tag class -> {word: number of occurrences}.
        self.__taggedData = dict()
        self.__initOutputs()

    def start(self):
        """Run the traversal, then write the collected counts as JSON."""
        self.__traverse.restrict({'maxSize': 20*1024*1024 })

        # Debugging.
        # self.__traverse.allowTypes(['application/rtf'])
        self.__traverse.allowTypes(self._config.acceptedFiles)

        self.__traverse.onFilePass += self.__fileFoundHandler
        self.__traverse.start()
        self.__output.write(json.dumps(self.__taggedData))
        # Push the JSON to disk now; the handle is never explicitly closed.
        self.__output.flush()
        # The scan completed, so the progress log is emptied again.
        self.__progressLog.truncate(0)

    def __initOutputs(self):
        """Open the progress log (emptied first) and the JSON output file.

        Raises IOError if the output file cannot be opened.
        """
        # The timestamp suffix is intentionally empty: the file names need to
        # be consistent when the app is run multiple times.
        nowFormatted = ''
        # Progress
        progressLog = 'collector/progress%s.log' % nowFormatted
        # Clear log.
        open(progressLog, 'w').close()
        # Open in append mode.
        self.__progressLog = open(progressLog, 'a')
        # Output
        output = '%scollector%s.json' % (self._config.output, nowFormatted)
        try:
            self.__output = open(output, 'w')
        except IOError:
            print("[ERROR] Directory %s does not exist. Check config.cfg and create directory if necessary" % self._config.output)
            # Construction is about to fail; do not leave the log handle open.
            self.__progressLog.close()
            raise

    def __fileFoundHandler(self, filePath, mimeType):
        """Parse, tag and merge one file reported by the traversal.

        filePath -- path of the file that passed the traversal filters.
        mimeType -- mime type string used to select the parser.
        """
        # Write to stdout.
        print("File found: %s - %s" % (filePath, mimeType))
        # Open file (read only); the with-block closes it again.
        try:
            with open(filePath, 'r') as f:
                data = f.read()
        except IOError:
            self.__handleIOError(filePath)
            return

        # Get parser dependent on mimetype and do some tagging.
        parser = self.__initParser(data, filePath, mimeType)
        parsedData = parser.parse()
        # Use POSTagger to tag words in parsed text/data.
        self.__posTagger.tag(parsedData)
        self.__merge(self.__posTagger.getByClass())
        # If merge is successful write this filePath to progress log.
        self.__progressLog.write(filePath+'\n')

    def __initParser(self, data, filePath, mimeType):
        """Return a parser instance for ``mimeType``.

        Only the parser that is actually selected gets constructed.

        Raises KeyError if ``mimeType`` has no registered parser.
        """
        config = { "acceptedTags": self._config.htmltags, "filePath": filePath }
        # Factories rather than instances, so the lookup builds one parser
        # instead of one of every kind per file.
        factories = {
            'text/plain': lambda: TextParser(data),
            'text/html': lambda: HTMLDocParser(data, config),
            'application/rtf': lambda: RTFParser(data, config),
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': lambda: MSDocXParser('', config),
            'application/vnd.oasis.opendocument.text': lambda: ODTParser('', config),
            'application/pdf': lambda: PDFParser('', config),
            #'application/msword': lambda: RTFDocParser(data, config),
            }
        return factories[mimeType]()

    def __merge(self, newData):
        """Add the words in ``newData`` to the accumulated counts.

        newData -- mapping of tag class -> iterable of words (one entry per
                   occurrence). It is not modified.

        Tag classes that have not been seen before are added on the fly, so
        a later file may introduce a new tag class.
        """
        for tagKey in newData:
            counts = self.__taggedData.setdefault(tagKey, {})
            for word in newData[tagKey]:
                counts[word] = counts.get(word, 0) + 1

    def __orderByFrequency(self):
        """Replace each tag's counts with a list of (word, count) pairs,
        most frequent first, and print it."""
        for tagKey in self.__taggedData:
            self.__taggedData[tagKey] = sorted(self.__taggedData[tagKey].items(), key=operator.itemgetter(1), reverse=True)
            print(self.__taggedData[tagKey])

    def __handleIOError(self, filePath):
        """Log and print why ``filePath`` could not be opened.

        When the file can be stat'ed the message includes its permission
        mask, owning user and group; otherwise only the path is reported.
        """
        try:
            statInfo = os.stat(filePath)
            user = pwd.getpwuid(statInfo.st_uid)[0]
            group = grp.getgrgid(statInfo.st_gid)[0]
        except (OSError, KeyError):
            # stat failed (e.g. the path vanished) or the uid/gid has no
            # passwd/group entry; report the failure without the details.
            errorMsg = 'Cannot open file %s.' % filePath
        else:
            permMask = str(oct(statInfo.st_mode & 0o777))
            errorMsg = 'Cannot open file %s. [mask: %s, user: %s, group: %s]' % (filePath, permMask, user, group)
        logging.error(errorMsg)
        print('[ERROR] ' + errorMsg)
Example #2
0
 def __init__(self, config, wordTags):
     """Keep the configuration and build the part-of-speech tagger.

     config   -- settings object, stored as-is.
     wordTags -- handed unchanged to PartOfSpeechTagger.
     """
     tagger = PartOfSpeechTagger(wordTags)
     self._config = config
     self.__posTagger = tagger
 def __init__(self, config, wordTags):
     """Create the traversal, the tagger and the output files.

     config   -- settings object; its ``dirs`` attribute seeds the Traverse.
     wordTags -- handed unchanged to PartOfSpeechTagger.
     """
     self._config = config
     # Starts empty and is filled while files are processed.
     self.__taggedData = {}
     self.__traverse = Traverse(config.dirs)
     self.__posTagger = PartOfSpeechTagger(wordTags)
     self.__initOutputs()
Example #4
0
class Poetic:
    """Poetic parser.

    Tags the words of each input file with a PartOfSpeechTagger and writes
    one XML file per input into an ``output`` directory next to the input.
    """

    def __init__(self, config, wordTags):
        """Keep the configuration and build the part-of-speech tagger.

        config   -- settings object; this class reads its ``input`` and
                    ``output`` attributes.
        wordTags -- handed unchanged to PartOfSpeechTagger.
        """
        self._config = config
        self.__posTagger = PartOfSpeechTagger(wordTags)

    def start(self):
        """Parse ``config.input``: every file directly inside it when it is
        a directory, otherwise the single file it names."""
        print("Traversing directory: " + self._config.input)
        # If this is directory.
        if os.path.isdir(self._config.input):
            self.createoutputdir(self._config.input)

            # Loop dir for files to parse (sub-directories are skipped).
            base = os.path.normpath(self._config.input)
            for f in os.listdir(self._config.input):
                candidate = base + os.sep + f
                if os.path.isfile(candidate):
                    self.parse(candidate)

        # else is a single file for parsing.
        else:
            directory = os.path.dirname(self._config.input)
            self.createoutputdir(directory)
            self.parse(self._config.input)

    def createoutputdir(self, directory):
        """Remember ``<directory>/output/`` as the output directory and
        create it if it does not exist.

        An empty ``directory`` (the dirname of a bare file name) means the
        current directory, not the filesystem root.
        """
        self.__outputDir = (directory or os.curdir) + '/output/'

        if not os.path.exists(self.__outputDir):
            os.makedirs(self.__outputDir)

    def parse(self, filepath):
        """Tag one file and write the result as XML to the output directory.

        The output file has the input's base name with its extension
        replaced by ``.xml``. ``createoutputdir`` must have been called.

        Returns None for hidden/backup files (names starting with '.' or
        ending with '~'), otherwise the file object the input was read from.
        """
        # Protect against hidden files.
        bn = os.path.basename(filepath)
        if bn.startswith('.') or bn.endswith('~'):
            return

        print("Parsing: " + filepath)

        # Read file. The handle is kept on the instance and returned, so it
        # is deliberately left open here.
        self.__input = open(filepath, 'r')

        # If the file is not xml then make sure output IS xml; splitext
        # copes with extensions of any length and with no extension at all.
        stem, ext = os.path.splitext(bn)
        if ext != ".xml":
            bn = stem + ".xml"

        source = self.__input.read().decode('utf-8')
        # Replaces Unicode right and left quotation mark with apostrophe
        source = re.sub(u"(\u2018|\u2019)", "'", source)
        # Replaces Unicode mdash and ndash
        source = re.sub(u"(\u2013|\u2014)", "-", source)
        # Anything still outside ASCII is dropped.
        source = source.encode('ascii', 'ignore')
        inputlines = source.splitlines()
        # Remove all newlines before passing to NLP lib.
        inputnolines = ' '.join([str(x).strip() for x in inputlines]).rstrip()

        # Parse poem through NLP lib.
        self.__posTagger.tag(inputnolines)
        output = self.__posTagger.getFormatted()

        # Create ElementTree root for output.
        outputroot = etree.Element("output")
        outputtree = etree.ElementTree(outputroot)

        # Loop through lines in files.
        for line in inputlines:

            # Create base nodes.
            linenode = etree.SubElement(outputroot, "line")
            linetags = etree.SubElement(linenode, "tags")
            originalnode = etree.SubElement(linenode, "original")

            # For each line break into words and extract class
            # from NLP output. This assumes all words (excluding punctuation)
            # has been assigned a class by the POS tagger; including the
            # -NONE- class when a word can't be tagged.
            for word in line.split():
                for el in output:
                    eltext = str(el.text)
                    if eltext == word[:len(eltext)]:
                        # Add to linenode.tags & remove from output so the
                        # same tagged element is not matched twice.
                        if el.get("class") != "NONE":
                            linetags.append(deepcopy(el))
                        el.getparent().remove(el)
                        # Safe despite removing during iteration: the loop
                        # is left immediately.
                        break

            originalnode.text = etree.CDATA(line)

        # Create output file for writing; the with-block closes it so the
        # XML is flushed to disk.
        with open(self.__outputDir + bn, 'w+') as outputf:
            outputtree.write(outputf, pretty_print=True)

        return self.__input

    def output(self):
        """Write ``config.output`` to the most recently opened input file.

        NOTE(review): that handle is opened read-only in parse(), so this
        write looks like it cannot succeed -- confirm the intended target.
        """
        self.__input.write(self._config.output)

    def findnth(self, haystack, needle, n):
        """Return the index of the n-th (0-based) occurrence of ``needle``
        in ``haystack``, or -1 if there are fewer than n+1 occurrences."""
        parts = haystack.split(needle, n+1)
        if len(parts) <= n+1:
            return -1
        return len(haystack) - len(parts[-1]) - len(needle)