    def __init__(
        self,
        dir,
        testDir=None,
        doTest=True,
        ignoreKlass=[],
        includeKlass=None,
        extractor="ArticleExtractor",
        useHtml=False,
    ):
        RssDataReader.__init__(self, dir, testDir)
        logger.info("Start building " + self.__class__.__name__)
        self.__mutex = threading.Semaphore()
        # Per-class word frequency distributions and bookkeeping for the feature generator.
        freqDists = {}
        ignore = stopwords.words("english")
        features = set()
        klassSize = {}
        documentsWithLabel = []
        for klassId in self.klasses(ignoreKlass, includeKlass):
            freqDist = FreqDist()
            size = 0
            for url, document in self.documents(klassId, useHtml):
                try:
                    # Optionally strip boilerplate from raw HTML before tokenizing.
                    txt = document if not useHtml else Extractor(extractor=extractor, html=document).getText()
                    documentsWithLabel.append((txt, klassId))
                    txt = tokenize(txt)
                    size += 1
                    for part in txt:
                        if part.isalnum() and part not in ignore:
                            freqDist.inc(part)
                            features.add(part)
                    # for bigram in nltk.bigrams(txt):
                    #     freqDist.inc(bigram)
                    #     featureFd.inc(bigram)
                except Exception:
                    logger.exception(u"Url: " + url)
            freqDists[klassId] = freqDist
            klassSize[klassId] = size
        random.shuffle(documentsWithLabel)
        self.__featuresGenerator = FeatureGenerator(freqDists, features, klassSize)
        # Lazily map documents to feature dicts and train the Naive Bayes classifier.
        trainset = apply_features(self.__featuresGenerator, documentsWithLabel)
        self.__classifier = NaiveBayesClassifier.train(trainset)
        logger.info(u"Classifier learned (set size=" + unicode(len(trainset)) + u")")

        if doTest:
            ref = []
            test = []
            testDocumentsWithLabel = [
                (
                    document if not useHtml else Extractor(extractor=extractor, html=document).getText(),
                    correctKlass,
                    url,
                )
                for correctKlass in self.klasses(ignoreKlass, includeKlass)
                for url, document in self._testDocuments(correctKlass, useHtml)
            ]
            for doc, cat, url in testDocumentsWithLabel:
                ans = self.__classifier.classify(self.__featuresGenerator(doc))
                ref.append(cat)
                test.append(ans)
                if ans != cat:
                    logger.info(u"Wrong " + ans + u"(" + cat + u"):\t" + url + u" " + doc.replace("\n", " "))
            # for correctKlass, klass, featuresWithLabel in zip(ref, test, testset):
            #     if correctKlass != klass:
            #         pd = self.__classifier.prob_classify(dict(featuresWithLabel[0]))
            #         labelProbList = sorted([(sample, pd.logprob(sample)) for sample in pd.samples()], key=lambda x: x[1], reverse=True)
            #         logger.info(correctKlass + " as " + klass + ": " + str([(correctKlass, "%.2f" % prob) for correctKlass, prob in labelProbList]))
            #         logger.info([(key, value) for key, value in featuresWithLabel[0].items() if value > 0])
            #         logger.info(self.__findDocumentByKlassAndFeatures(correctKlass, featuresWithLabel[0]))
            logger.info("\n" + ConfusionMatrix(ref, test).pp())
            # testset = apply_features(self.__featuresGenerator, testDocumentsWithLabel)
            # logger.info("Accuracy: " + str(nltk.classify.accuracy(self.__classifier, testset)))
            self.__classifier.show_most_informative_features(n=300)
# logger.info(u"Lang of \"" + u + u"\" is " + unicode(ld.detect(text))) if __name__ == "__main__": # # klass2Lang = {"us": "en", "nl_nl": "nl", "fr": "fr", "de": "de","es": "es", "pt-PT_pt": "pt", "pl_pl": "pl", "ru_ru": "ru", "it": "it", "tr_tr": "tr", "cn": "cn"} unknownLangs = ["cn"] if len(sys.argv) > 1 and sys.argv[1] == "-s": RssAnalyzer("/media/eea1ee1d-e5c4-4534-9e0b-24308315e271/pynews/langid/", langs=klass2Lang.keys(), langAsKlass=True) else: ld = LangDetect.instance() logger.info(ld.detect(u"li ul li ul li ul")) logger.info(ld.detect(u"<li> <ul> <li> <ul><li> <ul><li> <ul><li> <ul>")) data = RssDataReader("/media/eea1ee1d-e5c4-4534-9e0b-24308315e271/pynews/langid/") ref = [] response = [] for klass in data.klasses(): correctAns = klass2Lang[klass] if correctAns in unknownLangs: correctAns = "n/k" for url, doc in data.documents(klass): if not doc: logger.info("Empty: " + url) continue extractor = Extractor(extractor='ArticleExtractor', html=doc) ref.append(correctAns) ans = ld.detect(doc) ans = ans if ans else "n/k" response.append(ans)