def main():
    """Train on every category's training files, then classify its test files.

    For each category folder, all ``*.txt`` files under ``<folder>/train/``
    are loaded and concatenated into one training text that is registered
    with the ``ClassBank`` together with the number of files read.  After
    ``bank.train()`` every ``*.txt`` file under ``<folder>/test/`` is
    tokenized, classified, and reported on stdout as ``<file> = <class>``.
    """
    folders = {
        "politik": "data/politik",
        "sport": "data/sport",
        "wirtschaft": "data/wirtschaft",
    }

    bank = ClassBank()
    loader = Loader()

    # train data
    for classname, folder in folders.items():
        train_dir = folder + "/train/"
        texts = [
            loader.load_txt(train_dir + filename)
            for filename in os.listdir(train_dir)
            if filename.endswith(".txt")
        ]
        # Each document keeps its single leading space; joining once avoids
        # re-copying the growing text for every file.
        content = "".join(" " + text for text in texts)
        bank.addClass(Class(classname, content, len(texts)))

    bank.train()

    classifier = Classifier()

    # test data
    for classname, folder in folders.items():
        # A single formatted argument prints identically whether ``print``
        # is the Python 2 statement or the Python 3 function.
        print("\n=== Testing %s ===\n" % classname)
        test_dir = folder + "/test/"
        for filename in os.listdir(test_dir):
            if not filename.endswith(".txt"):
                continue
            tokenizer = Tokenizer(loader.load_txt(test_dir + filename))
            classified = classifier.classify(tokenizer.getTokens(), bank)
            print("%s = %s" % (filename, classified.getName()))
def test_selfClass(self):
    """Check sample characters against their code point and class label."""
    # Table columns: character, code point in hexadecimal, expected label
    # returned by Class.classify for that code point.
    testData = u"""\
A 000041 Lu
B 000042 Lu
C 000043 Lu
a 000061 Ll
b 000062 Ll
c 000063 Ll
0 000030 Nd
1 000031 Nd
2 000032 Nd
$ 000024 Sc
= 00003d Sm
* 00002a Po
愚 00611a Lo
公 00516c Lo
移 0079fb Lo
山 005c71 Lo
"""
    for row in testData.split('\n'):
        row = row.strip()
        if not row:
            # The first blank row (the text after the final newline)
            # terminates the table.
            break
        glyph, hexcode, label = row.split()
        if not (glyph and hexcode and label):
            continue
        value = ord(glyph)
        self.assertEqual(value, int(hexcode, 16), Self.doc())
        self.assertEqual(Class.classify(value), label, Self.doc())
def __init__(self):
    """Initialize the code point tables used for the antlr4 grammars.

    Attributes populated:

    * ``uniclass``  -- a ``Class`` instance.
    * ``labelled``  -- for each label in ``Class.label``, a list of
      two-item lists describing runs of consecutive code points for which
      ``Class.classify`` returns that label.
    * ``prop``      -- second field -> third field of every line of
      ``local/PropertyValueAliases.txt`` starting with ``'gc ; '``, plus
      the entry ``'__' -> 'Error'``.
    * ``block``     -- block name (spaces replaced by underscores) ->
      ``[first, last]`` hexadecimal strings from ``local/Blocks.txt``.
    * ``noblock``   -- placeholder name for code points in no block.
    * ``blockname`` -- ``noblock`` followed by the sorted block names.
    * ``identify``  -- for every code point, the index into ``blockname``
      of the block containing it (0 when no block contains it).
    """
    self.uniclass = Class()
    self.labelled = {label: [] for label in Class.label}

    top = 0x110000  # one past the largest code point

    # Split the code space into runs that share one label.
    # NOTE(review): a run that is followed by another run is closed with
    # the first code point of the *next* run, whereas the final run is
    # closed with top - 1 (its own last code point).  Kept as in the
    # original; confirm what consumers of ``labelled`` expect.
    previous = None
    for codepoint in xrange(top):
        label = Class.classify(codepoint)
        if label != previous:
            if previous is not None:
                self.labelled[previous][-1].append(codepoint)
            self.labelled[label].append([codepoint])
            previous = label
    self.labelled[previous][-1].append(top - 1)

    # Alias table taken from the 'gc' lines of the aliases file.
    prefix = 'gc ; '
    self.prop = {'__': 'Error'}
    with open("local/PropertyValueAliases.txt") as source:
        for line in source:
            if line.startswith(prefix):
                fields = line.split(';')
                self.prop[fields[1].strip()] = fields[2].strip()

    # Block ranges, keyed by block name.
    pattern = re.compile(r"([0-9A-F]{4,6})\.\.([0-9A-F]{4,6}); (.*)")
    self.block = {}
    with open("local/Blocks.txt") as source:
        for line in source:
            found = pattern.match(line)
            if found:
                name = found.group(3).replace(' ', '_')
                self.block[name] = [found.group(1), found.group(2)]

    self.noblock = '(Absent from Blocks.txt)'
    self.blockname = [self.noblock] + sorted(self.block.keys())

    # Index 0 is the placeholder, so every code point starts out there.
    self.identify = [0] * top
    for index, name in enumerate(self.blockname):
        if index == 0:
            continue  # the placeholder has no range in self.block
        first, last = (int(text, 0x10) for text in self.block[name])
        # Blocks.txt ranges are inclusive at both ends, so the last code
        # point of a block must be marked as well.
        for codepoint in xrange(first, last + 1):
            self.identify[codepoint] = index