# Standard-library imports used by the code below.
import grp
import json
import logging
import operator
import os
import pwd
import re
from copy import deepcopy
from itertools import groupby

from lxml import etree

# Project-local modules (Traverse, PartOfSpeechTagger and the parser
# classes) are assumed to be importable from elsewhere in this repo.


class Collector:
    """Collector application"""

    def __init__(self, config, wordTags):
        self._config = config
        self.__traverse = Traverse(config.dirs)
        self.__posTagger = PartOfSpeechTagger(wordTags)
        self.__taggedData = dict()
        self.__initOutputs()

    def start(self):
        self.__traverse.restrict({'maxSize': 20*1024*1024})
        # Debugging.
        # self.__traverse.allowTypes(['application/rtf'])
        self.__traverse.allowTypes(self._config.acceptedFiles)
        self.__traverse.onFilePass += self.__fileFoundHandler
        self.__traverse.start()
        self.__output.write(json.dumps(self.__taggedData))
        self.__progressLog.truncate(0)

    def __initOutputs(self):
        # now = datetime.datetime.now()
        # nowFormatted = now.strftime("%y-%m-%d-%H%M%S")
        # This file needs to be consistent when the app is run multiple times.
        nowFormatted = ''
        # Progress
        progressLog = 'collector/progress%s.log' % nowFormatted
        # Clear log.
        open(progressLog, 'w').close()
        # Open in append mode.
        self.__progressLog = open(progressLog, 'a')
        # Output
        output = '%scollector%s.json' % (self._config.output, nowFormatted)
        try:
            self.__output = open(output, 'w')
        except IOError:
            print "[ERROR] Directory %s does not exist. Check config.cfg and create the directory if necessary." % self._config.output
            raise

    def __fileFoundHandler(self, filePath, mimeType):
        # Write to stdout.
        print "File found: %s - %s" % (filePath, mimeType)
        # Open file (read only).
        try:
            with open(filePath, 'r') as f:
                data = f.read()
        except IOError:
            self.__handleIOError(filePath)
        else:
            # Pick a parser based on the MIME type and do some tagging.
            parser = self.__initParser(data, filePath, mimeType)
            parsedData = parser.parse()
            # Use the POS tagger to tag words in the parsed text/data.
            self.__posTagger.tag(parsedData)
            self.__merge(self.__posTagger.getByClass())
            # If the merge succeeds, write this filePath to the progress log.
            self.__progressLog.write(filePath + '\n')

    def __initParser(self, data, filePath, mimeType):
        config = {
            "acceptedTags": self._config.htmltags,
            "filePath": filePath
        }
        # Note: every parser in this mapping is constructed before one is
        # selected by the mimeType lookup.
        return {
            'text/plain': TextParser(data),
            'text/html': HTMLDocParser(data, config),
            'application/rtf': RTFParser(data, config),
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': MSDocXParser('', config),
            'application/vnd.oasis.opendocument.text': ODTParser('', config),
            'application/pdf': PDFParser('', config),
            # 'application/msword': RTFDocParser(data, config),
        }[mimeType]

    def __merge(self, newData):
        # Alphabetise words for grouping.
        for tagKey in newData:
            newData[tagKey] = sorted(newData[tagKey])
        if len(self.__taggedData) == 0:
            # No data yet: this is the first file and not a continuation of a
            # previous scan.
            for tagKey in newData:
                self.__taggedData[tagKey] = {key: len(list(group))
                                             for key, group in groupby(newData[tagKey])}
        else:
            # Data already exists, either from prior files or from a partial
            # JSON file created during a previous scan.
            for tagKey in newData:
                for word in newData[tagKey]:
                    if word in self.__taggedData[tagKey]:
                        self.__taggedData[tagKey][word] += 1
                    else:
                        self.__taggedData[tagKey][word] = 1

    def __orderByFrequency(self):
        for tagKey in self.__taggedData:
            self.__taggedData[tagKey] = sorted(self.__taggedData[tagKey].iteritems(),
                                               key=operator.itemgetter(1),
                                               reverse=True)
            print self.__taggedData[tagKey]

    def __handleIOError(self, filePath):
        # On an IO error, log the file's mode mask, user and group.
        statInfo = os.stat(filePath)
        user = pwd.getpwuid(statInfo.st_uid)[0]
        group = grp.getgrgid(statInfo.st_gid)[0]
        permMask = oct(statInfo.st_mode & 0777)
        errorMsg = 'Cannot open file %s. [mask: %s, user: %s, group: %s]' % (filePath, permMask, user, group)
        logging.error(errorMsg)
        print '[ERROR] ' + errorMsg
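
# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal driver for Collector, assuming a config object exposing the
# attributes the class actually reads: dirs, acceptedFiles, htmltags and
# output. The attribute values below are made up for illustration, and
# wordTags is whatever structure PartOfSpeechTagger expects.

def _collector_example(wordTags):
    class _Config(object):
        dirs = ['./docs']                       # directories Traverse walks
        acceptedFiles = ['text/plain',
                         'text/html',
                         'application/pdf']     # MIME types to accept
        htmltags = ['p', 'li', 'blockquote']    # tags HTMLDocParser keeps
        output = 'output/'                      # prefix for collector.json

    collector = Collector(_Config(), wordTags)
    collector.start()  # writes tag frequencies to output/collector.json
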
class Poetic:
    """Poetic parser"""

    def __init__(self, config, wordTags):
        self._config = config
        self.__posTagger = PartOfSpeechTagger(wordTags)

    def start(self):
        print "Traversing directory: " + self._config.input
        if os.path.isdir(self._config.input):
            # Input is a directory: loop over it for files to parse.
            self.createoutputdir(self._config.input)
            for f in os.listdir(self._config.input):
                path = os.path.normpath(self._config.input) + os.sep + f
                if os.path.isfile(path):
                    self.parse(path)
        else:
            # Input is a single file for parsing.
            directory = os.path.dirname(self._config.input)
            self.createoutputdir(directory)
            self.parse(self._config.input)

    def createoutputdir(self, directory):
        self.__outputDir = directory + '/output/'
        if not os.path.exists(self.__outputDir):
            os.makedirs(self.__outputDir)

    def parse(self, filepath):
        # Protect against hidden files and editor backups.
        bn = os.path.basename(filepath)
        if bn.startswith('.') or bn.endswith('~'):
            return
        print "Parsing: " + filepath
        # Read file.
        self.__input = open(filepath, 'r')
        # If the input is not XML, make sure the output file IS .xml.
        root, ext = os.path.splitext(bn)
        if ext != ".xml":
            bn = root + ".xml"
        # Create the output file for writing.
        outputf = open(self.__outputDir + bn, 'w+')
        source = self.__input.read().decode('utf-8')
        # Replace Unicode right and left quotation marks with an apostrophe.
        source = re.sub(u"(\u2018|\u2019)", "'", source)
        # Replace Unicode mdash and ndash.
        source = re.sub(u"(\u2013|\u2014)", "-", source)
        # Any remaining non-ASCII characters are dropped.
        source = source.encode('ascii', 'ignore')
        # Remove all newlines before passing to the NLP lib.
        inputlines = source.splitlines()
        inputnolines = ' '.join([str(x).strip() for x in inputlines]).rstrip()
        # Parse the poem through the NLP lib.
        self.__posTagger.tag(inputnolines)
        output = self.__posTagger.getFormatted()
        # for element in output:
        #     print("%s - %s" % (element.get('class'), element.text))
        # Create the ElementTree root for the output.
        outputroot = etree.Element("output")
        outputtree = etree.ElementTree(outputroot)
        # Loop through the lines in the file.
        for line in inputlines:
            # Create base nodes.
            linenode = etree.SubElement(outputroot, "line")
            linetags = etree.SubElement(linenode, "tags")
            originalnode = etree.SubElement(linenode, "original")
            # Break each line into words and extract the class from the NLP
            # output. This assumes every word (excluding punctuation) has been
            # assigned a class by the POS tagger, including the -NONE- class
            # when a word can't be tagged.
            for word in line.split():
                for el in output:
                    eltext = str(el.text)
                    # print "Searching for: %s - %s" % (eltext, word[:len(eltext)])
                    if eltext == word[:len(eltext)]:
                        # print "MATCH! Type = %s" % el.get("class")
                        # Add to linenode.tags and remove from output.
                        if el.get("class") != "NONE":
                            linetags.append(deepcopy(el))
                        el.getparent().remove(el)
                        break
            originalnode.text = etree.CDATA(line)
        outputtree.write(outputf, pretty_print=True)
        outputf.close()
        return self.__input

    def output(self):
        self.__input.write(self._config.output)

    def findnth(self, haystack, needle, n):
        parts = haystack.split(needle, n + 1)
        if len(parts) <= n + 1:
            return -1
        return len(haystack) - len(parts[-1]) - len(needle)
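
# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal driver for Poetic. The config only needs 'input' (a poem file or
# a directory of poems) and 'output'; wordTags is whatever structure
# PartOfSpeechTagger expects. All values here are made up for illustration.

def _poetic_example(wordTags):
    class _Config(object):
        input = './poems'   # a directory: every file in it gets parsed
        output = ''         # read by Poetic.output(), unused by start()

    poetic = Poetic(_Config(), wordTags)
    poetic.start()  # writes one <output> XML file per poem to ./poems/output/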