def DoTest(classA, classB, unsupervised=False, balance=False,
           use_baseline_segmenter=False):
    """Train a two-class classifier and print its accuracy on held-out data.

    Both inputs are copied, shuffled and handed to ``Partition`` with the
    fractions ``(0.2, 0.1, 0.7)``; the three results are used as the test,
    validation and training partitions respectively. A classifier is trained
    with ``BinaryClassifier.Train``, a threshold is obtained from
    ``GetOptimalThreshold`` on the validation partitions, and the value
    returned by ``TestAccuracy`` on the test partitions is printed.

    Args:
        classA: iterable of examples for the first class (copied, not mutated).
        classB: iterable of examples for the second class (copied, not mutated).
        unsupervised: when true, additionally run three rounds of
            ``BinaryClassifier.TrainSemiSupervised`` and print the accuracy
            obtained after each round.
        balance: when true, cut both classes to the size of the smaller one.
        use_baseline_segmenter: when true, use ``segmenter.baseline_segmenter``;
            otherwise build a segmenter with ``segmenter.morph_segmenter``.

    Returns:
        None. Results are reported through ``print`` only.

    Note:
        ``model`` and ``snapchat_names`` are not parameters; they are looked up
        in the enclosing (module) scope at call time.
    """
    classA = list(classA)
    classB = list(classB)

    # Shuffle *before* truncating: balancing then keeps a random subset of the
    # larger class instead of whichever items happened to be listed first.
    random.shuffle(classA)
    random.shuffle(classB)

    if balance:
        max_len = min(len(classA), len(classB))
        classA = classA[:max_len]
        classB = classB[:max_len]

    percents = (0.2, 0.1, 0.7)
    classA_test, classA_validation, classA_train = Partition(classA, percents)
    classB_test, classB_validation, classB_train = Partition(classB, percents)

    if use_baseline_segmenter:
        seg_func = segmenter.baseline_segmenter
    else:
        seg_func = segmenter.morph_segmenter(model)

    classifier = BinaryClassifier.Train(seg_func, classA_train, classB_train)
    classifier.GetTopRatios()

    thresh = GetOptimalThreshold(classifier, classA_validation,
                                 classB_validation)
    acc = TestAccuracy(classifier, classA_test, classB_test, thresh)
    # A parenthesised single-argument print produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print('test accuracy {0}'.format(acc))

    if unsupervised:
        semisup_classifier = classifier
        for iter_num in range(3):
            print('semi-sup iter {0}'.format(iter_num))
            semisup_classifier.TrainConfidenceEstimator(
                classA_validation, classB_validation)
            semisup_classifier = BinaryClassifier.TrainSemiSupervised(
                snapchat_names, semisup_classifier)

            # NOTE(review): the evaluation is assumed to run once per round
            # (inside the loop); confirm against the original layout.
            thresh = GetOptimalThreshold(semisup_classifier,
                                         classA_validation, classB_validation)
            acc = TestAccuracy(semisup_classifier, classA_test, classB_test,
                               thresh)
            print('accuracy {0}'.format(acc))
def DoTest(classA, classB, unsupervised=False, balance=False,
           use_baseline_segmenter=False):
    """Train a binary classifier on two classes and report test accuracy.

    The two inputs are copied and shuffled, then each is split by
    ``Partition`` using the fractions ``(0.2, 0.1, 0.7)``; the three pieces
    are used as test, validation and training data in that order. After
    training with ``BinaryClassifier.Train`` the decision threshold comes
    from ``GetOptimalThreshold`` (validation data) and the figure printed is
    whatever ``TestAccuracy`` returns for the test data.

    Args:
        classA: iterable holding the examples of the first class.
        classB: iterable holding the examples of the second class.
        unsupervised: if true, follow up with three semi-supervised rounds
            (``BinaryClassifier.TrainSemiSupervised``), printing the accuracy
            after each one.
        balance: if true, truncate both classes to the smaller class size.
        use_baseline_segmenter: if true, segment with
            ``segmenter.baseline_segmenter``; otherwise with the result of
            ``segmenter.morph_segmenter(model)``.

    Returns:
        None; all results are printed.

    Note:
        ``model`` and ``snapchat_names`` are read from the enclosing (module)
        scope, not passed in.
    """
    classA = list(classA)
    classB = list(classB)

    # Randomise the order first so that, when balancing, the items dropped
    # from the larger class are a random selection rather than its tail.
    random.shuffle(classA)
    random.shuffle(classB)

    if balance:
        max_len = min(len(classA), len(classB))
        classA = classA[:max_len]
        classB = classB[:max_len]

    percents = (0.2, 0.1, 0.7)
    classA_test, classA_validation, classA_train = Partition(classA, percents)
    classB_test, classB_validation, classB_train = Partition(classB, percents)

    if use_baseline_segmenter:
        seg_func = segmenter.baseline_segmenter
    else:
        seg_func = segmenter.morph_segmenter(model)

    classifier = BinaryClassifier.Train(seg_func, classA_train, classB_train)
    classifier.GetTopRatios()

    thresh = GetOptimalThreshold(classifier, classA_validation,
                                 classB_validation)
    acc = TestAccuracy(classifier, classA_test, classB_test, thresh)
    # Parenthesised single-argument print: identical output on Python 2 and 3.
    print('test accuracy {0}'.format(acc))

    if unsupervised:
        semisup_classifier = classifier
        for iter_num in range(3):
            print('semi-sup iter {0}'.format(iter_num))
            semisup_classifier.TrainConfidenceEstimator(
                classA_validation, classB_validation)
            semisup_classifier = BinaryClassifier.TrainSemiSupervised(
                snapchat_names, semisup_classifier)

            # NOTE(review): per-round evaluation is assumed to belong inside
            # the loop; confirm against the original layout.
            thresh = GetOptimalThreshold(semisup_classifier,
                                         classA_validation, classB_validation)
            acc = TestAccuracy(semisup_classifier, classA_test, classB_test,
                               thresh)
            print('accuracy {0}'.format(acc))
# NOTE(review): the statements down to `return semisup_classifier` are the tail
# of a function whose header is outside this chunk. The nesting shown here (a
# loop body inside a function) is an assumption based on the use of `iter_num`
# and `return`; confirm against the full file.
        acc = TestAccuracy(semisup_classifier, classA_test, classB_test, thresh)
        print 'accuracy {0}'.format(acc)
        # Detach the segmenter function while the classifier is pickled, then
        # put it back afterwards.
        segfun = semisup_classifier.segfun
        # The file name carries the 1-based round number (iter_num + 1).
        # NOTE(review): the file is opened in text mode ('w'); the pickle docs
        # recommend binary mode -- confirm this is intended.
        with open("../models/init_nonum_semi%i.pkl" % (iter_num + 1), 'w') as f:
            semisup_classifier.segfun = None
            cPickle.dump(semisup_classifier, f)
        semisup_classifier.segfun = segfun
    return semisup_classifier


# Script entry point: load the segmentation model, fetch users from the
# database, split their ids by the `gender` field and run DoTest on them.
if __name__ == '__main__':
    target = 'gender'
    # Only the 'gender' target adds a WHERE clause; any other target uses none.
    whereclause = "where gender is not ''" if target == 'gender' else ''
    model = segmenter.load_model('../models/idmorphs_naworl.model')
    segfun = segmenter.morph_segmenter(model, match='[a-z]+')
    # model_semi = segmenter.load_model('../models/idmorphs.model')
    # segfun_semi = segmenter.morph_segmenter(model_semi, match='[a-z]+')
    users = segmenter.get_users_from_db(whereclause=whereclause)
    # Users whose gender is 'M' and 'F' form the two classes.
    male_ids = [user.id for user in users if user.gender == 'M']
    female_ids = [user.id for user in users if user.gender == 'F']
    # unlabeled_users = segmenter.get_users_from_db(tablename='naver')
    # unknown_ids = [user.id for user in unlabeled_users]
    unknown_ids = None
    # NOTE(review): `segfun` and `unknown_ids` are passed positionally as the
    # third and fourth arguments; verify they match the DoTest signature this
    # script is meant to run against.
    cls = DoTest(male_ids, female_ids, segfun, unknown_ids, balance=False)
    # cls = cPickle.load(open("../models/init_nonum_semi3.pkl"))
    # cls.segfun = segfun_semi
result = classifier.Classify(name) result['lang'] = lang result['name'] = name results.append(result) return pandas.DataFrame(results) def get_preds(baseline, morph, weight): columns = numpy.array(['True', 'False']) z = weight * baseline[columns] + (1.0 - weight) * morph[columns] idx = z.values.argmax(axis=1) return columns[idx] base_segmenter = segmenter.baseline_segmenter morph_segmenter = segmenter.morph_segmenter(Classifier.model) def getMetrics(truelabels, predlabels): prec = metrics.precision_score(truelabels, predlabels, pos_label='True') recall = metrics.recall_score(truelabels, predlabels, pos_label='True') return prec, recall all_langs = train.lang.unique() for lang in all_langs: labels = [str(x) for x in train.lang == lang] testlabels = [str(x) for x in test.lang == lang] baseline_classifier = BayesClassifier.Train(base_segmenter, train.name_lower, labels) morph_classifier = BayesClassifier.Train(morph_segmenter,
result = classifier.Classify(name) result['lang'] = lang result['name'] = name results.append(result) return pandas.DataFrame(results) def get_preds(baseline, morph, weight): columns = numpy.array(['True', 'False']) z = weight * baseline[columns] + (1.0 - weight) * morph[columns] idx = z.values.argmax(axis=1) return columns[idx] base_segmenter = segmenter.baseline_segmenter morph_segmenter = segmenter.morph_segmenter(Classifier.model) def getMetrics(truelabels, predlabels): prec = metrics.precision_score(truelabels, predlabels, pos_label='True') recall = metrics.recall_score(truelabels, predlabels, pos_label='True') return prec, recall all_langs = train.lang.unique() for lang in all_langs: labels = [str(x) for x in train.lang == lang] testlabels = [str(x) for x in test.lang == lang] baseline_classifier = BayesClassifier.Train(base_segmenter, train.name_lower, labels) morph_classifier = BayesClassifier.Train(morph_segmenter, train.name_lower,