def test01(self):
    """Check the encoding guessed for each sample file below TESTDATA."""
    guesser = EncodingGuesser()
    # (file name, expected guess) pairs, verified in this exact order.
    expectations = (
        ("gnosis-readme", None),
        ("cp850a.txt", "cp850"),
        ("cp850b.txt", "cp850"),
        ("README.TXT", "cp850"),
        ("cp1252a.txt", "cp1252"),
        ("cp1252b.txt", "cp1252"),
    )
    for basename, expected in expectations:
        path = os.path.join(TESTDATA, basename)
        self.assertEqual(guesser.guess(path), expected)
def test01(self):
    """Verify EncodingGuesser.guess() against the sample files in TESTDATA."""
    guesser = EncodingGuesser()
    # Expected result per sample file; the order matches the original checks.
    samples = [
        ("gnosis-readme", None),
        ("cp850a.txt", "cp850"),
        ("cp850b.txt", "cp850"),
        ("README.TXT", "cp850"),
        ("cp1252a.txt", "cp1252"),
        ("cp1252b.txt", "cp1252"),
    ]
    for sample_name, expected_coding in samples:
        sample_path = os.path.join(TESTDATA, sample_name)
        self.assertEqual(guesser.guess(sample_path), expected_coding)
def __init__(self, vol):
    """Store the volume and create a fresh EncodingGuesser.

    vol -- the volume object; kept as ``self.volume``.
    """
    # The Task.__init__(self) call is disabled in the original code and
    # remains disabled here.
    self.volume = vol
    self.encodingGuesser = EncodingGuesser()
class FileVisitor:  # (Task):  # used?
    """Load the words of a volume's text files into the word/occurrence tables.

    NOTE(review): ``self.status`` and ``self.visit`` are called below but are
    not defined in this class (the ``Task`` base class is commented out);
    presumably they are provided elsewhere -- confirm before relying on it.
    """

    def __init__(self, vol):
        """Store the volume and create the encoding guesser.

        vol -- the volume object; kept as ``self.volume``.
        """
        # Task.__init__(self) is intentionally not called: the base is disabled.
        self.volume = vol
        self.encodingGuesser = EncodingGuesser()

    def looper(self, task):
        """Prepare the table queries, clear the volume's directories and visit it.

        task -- stored as ``self.task``.
        """
        self.task = task
        sess = self.volume.getContext()
        self.ftypes = sess.query(tables.FileType)
        self.files = sess.query(tables.File)
        self.dirs = sess.query(tables.Directory)
        self.words = sess.query(tables.Word)
        self.occurences = sess.query(tables.Occurence)
        # Drop all directory rows of this volume before re-visiting it.
        self.volume.directories().deleteAll()
        self.visit(self.volume.path, "")

    def visit_file(self, fileRow, name):
        """Index one file according to its extension.

        fileRow -- the row that receives the occurrences (passed to loadWords).
        name    -- the file name; it is also the path handed to ``open``.

        ``.txt`` files are read, decoded with the guessed encoding (if any),
        tokenized and loaded.  ``.doc`` files and everything else are only
        reported through ``self.status``.
        """
        # Normalise the extension once so that both the ".txt" and the ".doc"
        # test are case-insensitive (the original only lowercased for ".txt",
        # so "X.DOC" was reported as an unknown file).
        ext = os.path.splitext(name)[1].lower()
        if ext == ".txt":
            self.status(name)
            # Close the handle deterministically instead of leaking it.
            f = open(name)
            try:
                s = f.read()
            finally:
                f.close()
            coding = self.encodingGuesser.guess(name, s)
            self.status("%s: %s", name, coding)
            if coding:
                tokens = standardTokenizer(s.decode(coding))
            else:
                # No encoding guessed: tokenize the content as read.
                tokens = standardTokenizer(s)
            self.loadWords(fileRow, tokens)
        elif ext == ".doc":
            # MS-Word loading is disabled; such files are only reported.
            self.status("Ignoring MS-Word %s.", name)
        else:
            self.status("Ignoring unknown file %s.", name)

    def loadWords(self, fileRow, tokens):
        """Replace the occurrences of fileRow by one row per token.

        fileRow -- row whose ``occurences`` are cleared and refilled.
        tokens  -- iterable of tokens; positions are numbered from 1.
        """
        fileRow.occurences.deleteAll()
        for index, token in enumerate(tokens):
            self.status(fileRow.path() + ": " + token)
            word = self.words.peek(token)
            if word is None:
                # Unknown word: create it, using the token itself as id.
                word = self.words.appendRow(id=token)
            fileRow.occurences.appendRow(word=word, pos=index + 1)
# NOTE(review): the next line is the collapsed tail of a ``loadWords`` method
# body whose ``def`` header is not part of this span; it is kept verbatim
# rather than reconstructed by guesswork.
#self.status("%s : %d words",fileRow.name,len(tokens)) #print fileRow.path(), ".occurences.deleteAll()" fileRow.occurences.deleteAll() #self.occurences.query(file=deleteRows(file=fileRow) pos = 0 for token in tokens: pos += 1 self.status(fileRow.path() + ": " + token) word = self.words.peek(token) if word is None: word = self.words.appendRow(id=token) #elif word.ignore: # continue fileRow.occurences.appendRow(word=word, pos=pos)


# Module-level EncodingGuesser instance.
encodingGuesser = EncodingGuesser()


def get_reader(fullname):
    """Return the reader registered for the extension of *fullname*.

    The extension is lowercased before it is looked up in ``readers``;
    ``non_reader`` is returned when no reader is registered for it.
    """
    ext = os.path.splitext(fullname)[1]
    try:
        return readers[ext.lower()]
    except KeyError:
        # "except KeyError, e" was Python-2-only syntax and bound an unused
        # name; this form is valid on Python 2 and 3.
        return non_reader


def read_content(sess, fileInstance, fullname):
    """Call the reader selected for *fullname* and return its result.

    The reader is invoked as ``reader(sess, fileInstance, fullname)``.
    """
    reader = get_reader(fullname)
    return reader(sess, fileInstance, fullname)
def __init__(self, vol):
    """Keep a reference to the volume and instantiate an EncodingGuesser.

    vol -- the volume object; stored as ``self.volume``.
    """
    # The original leaves Task.__init__(self) commented out; it stays disabled.
    self.volume = vol
    self.encodingGuesser = EncodingGuesser()
class FileVisitor:  # (Task):  # used?
    """Index the words of the text files found on a volume.

    NOTE(review): the ``Task`` base class is commented out, yet the methods
    below call ``self.status`` and ``self.visit``, which this class does not
    define -- presumably supplied elsewhere; confirm.
    """

    def __init__(self, vol):
        """Keep the volume and instantiate the encoding guesser.

        vol -- the volume object; stored as ``self.volume``.
        """
        # Task.__init__(self) stays disabled together with the base class.
        self.volume = vol
        self.encodingGuesser = EncodingGuesser()

    def looper(self, task):
        """Set up the table queries, empty the volume's directories, visit it.

        task -- stored as ``self.task``.
        """
        self.task = task
        sess = self.volume.getContext()
        self.ftypes = sess.query(tables.FileType)
        self.files = sess.query(tables.File)
        self.dirs = sess.query(tables.Directory)
        self.words = sess.query(tables.Word)
        self.occurences = sess.query(tables.Occurence)
        # All directory rows of the volume are removed before the new visit.
        self.volume.directories().deleteAll()
        self.visit(self.volume.path, "")

    def visit_file(self, fileRow, name):
        """Process a single file depending on its extension.

        fileRow -- row handed on to loadWords for ``.txt`` files.
        name    -- file name, used both for reporting and as path for ``open``.

        Only ``.txt`` files are read and indexed; ``.doc`` files and all other
        extensions merely produce a status message.
        """
        # Lowercase once: the original lowercased only for the ".txt" test, so
        # an upper-case ".DOC" fell through to the "unknown file" branch.
        ext = os.path.splitext(name)[1].lower()
        if ext == ".txt":
            self.status(name)
            # try/finally guarantees the file is closed even if read() fails.
            f = open(name)
            try:
                s = f.read()
            finally:
                f.close()
            coding = self.encodingGuesser.guess(name, s)
            self.status("%s: %s", name, coding)
            if coding:
                tokens = standardTokenizer(s.decode(coding))
            else:
                # Without a guessed encoding the content is tokenized as read.
                tokens = standardTokenizer(s)
            self.loadWords(fileRow, tokens)
        elif ext == ".doc":
            # Reading MS-Word documents is disabled; they are only reported.
            self.status("Ignoring MS-Word %s.", name)
        else:
            self.status("Ignoring unknown file %s.", name)

    def loadWords(self, fileRow, tokens):
        """Rebuild the occurrences of fileRow from *tokens*.

        fileRow -- row whose ``occurences`` are deleted and recreated.
        tokens  -- iterable of tokens; the first token gets position 1.
        """
        fileRow.occurences.deleteAll()
        for index, token in enumerate(tokens):
            self.status(fileRow.path() + ": " + token)
            word = self.words.peek(token)
            if word is None:
                # First time this token is seen: add it with the token as id.
                word = self.words.appendRow(id=token)
            fileRow.occurences.appendRow(word=word, pos=index + 1)