def main():
    """Train and evaluate baseline CWI classifiers with 10-fold CV.

    Reads the labelled CoNLL training file given by ``--train``, extracts
    baseline features per labelled token, then reports Accuracy/F1/
    Precision/Recall (mean +/- std over folds) for each classifier.
    Exits the process with status 0 when done.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    defaultdata = scriptdir + "/../data/cwi_training/cwi_training.txt.lbl.conll"
    parser = argparse.ArgumentParser(description="Baselines")
    parser.add_argument('--train', help="parsed-and-label input format", default=defaultdata)
    args = parser.parse_args()

    labels = []
    featuredicts = []

    print("Collecting features...")
    count = 0
    for s in readSentences(args.train):
        print("\r" + str(count), end="")
        count += 1
        # "-" marks tokens that carry no annotation; skip them.
        for l, i in zip(s["label"], s["idx"]):
            if l != "-":
                w = WordInContext(s, i, s["form"][i], s["lemma"][i], s["pos"][i],
                                  s["ne"][i], l, s["head"], s["deprel"])
                featuredicts.append(w.baselinefeatures())
                labels.append(w.label)
    print()

    vec = DictVectorizer()
    features = vec.fit_transform(featuredicts).toarray()
    labels = np.array(labels)

    classifiers = [LogisticRegression(penalty='l1'),
                   LogisticRegression(penalty='l2'),
                   SGDClassifier(),
                   tree.DecisionTreeClassifier(),
                   dummy.DummyClassifier(strategy="most_frequent")]

    for classifier in classifiers:
        # FIX: scores must be reset per classifier. The original created the
        # dict once before this loop, so every classifier after the first
        # reported averages polluted by all previous classifiers' folds.
        scores = defaultdict(list)
        for TrainIndices, TestIndices in cross_validation.KFold(
                n=features.shape[0], n_folds=10, shuffle=False, random_state=1):
            TrainX_i = features[TrainIndices]
            Trainy_i = labels[TrainIndices]

            TestX_i = features[TestIndices]
            Testy_i = labels[TestIndices]

            classifier.fit(TrainX_i, Trainy_i)
            ypred_i = classifier.predict(TestX_i)

            # FIX: sklearn metrics take (y_true, y_pred). The original passed
            # predictions first, which silently swapped precision and recall
            # (accuracy and binary F1 happen to be symmetric, but the
            # reported Precision/Recall rows were exchanged).
            scores["Accuracy"].append(accuracy_score(Testy_i, ypred_i))
            scores["F1"].append(f1_score(Testy_i, ypred_i))
            scores["Precision"].append(precision_score(Testy_i, ypred_i))
            scores["Recall"].append(recall_score(Testy_i, ypred_i))

        print("--", str(classifier))
        for key in sorted(scores.keys()):
            currentmetric = np.array(scores[key])
            print("%s : %0.2f (+/- %0.2f)" % (key, currentmetric.mean(), currentmetric.std()))

    sys.exit(0)
def main():
    """CWI-2016 skeleton: featurize, weight instances, evaluate with 10-fold CV.

    Reads the all-annotations CoNLL training file given by ``--train``,
    builds features via ``WordInContext.featurize()``, derives per-instance
    sample weights from the annotators' positive-vote counts according to
    ``--instance_weighting``, and reports Accuracy/F1/Precision/Recall
    (mean +/- std over folds) for each learner.

    NOTE: per the shared task, "F1" here is the harmonic mean of *accuracy*
    and recall, not the standard F1.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    defaultdata = scriptdir + "/../data/cwi_training/cwi_training_allannotations.txt.lbl.conll"
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016")
    parser.add_argument('--train', help="parsed-and-label input format", default=defaultdata)
    parser.add_argument('--instance_weighting',
                        choices=["uniform", "linear", "inverse_class_relevance",
                                 "log_and_mode", "tf_idf", "log_and_max"],
                        default="uniform")
    args = parser.parse_args()

    labels = []
    featuredicts = []
    positive_votes = []

    for s in readSentences(args.train):
        # "-" marks tokens that carry no annotation; skip them.
        for l, i in zip(s["label"], s["idx"]):
            if l != "-":
                w = WordInContext(s, i, s["form"][i], s["lemma"][i], s["pos"][i],
                                  s["ne"][i], positive_votes=l,
                                  heads=s["head"], deprels=s["deprel"])
                featuredicts.append(w.featurize())
                labels.append(w.label)
                positive_votes.append(w.positive_votes)

    vec = DictVectorizer()
    features = vec.fit_transform(featuredicts).toarray()
    labels = np.array(labels)
    positive_votes = np.array(positive_votes)

    learners = [tree.DecisionTreeClassifier(), svm.NuSVC(nu=0.2)]

    for learner in learners:
        scores = defaultdict(list)
        for TrainIndices, TestIndices in cross_validation.KFold(
                n=features.shape[0], n_folds=10, shuffle=False, random_state=None):
            TrainX_i = features[TrainIndices]
            Trainy_i = labels[TrainIndices]
            # Weights are computed from the training folds only, so no
            # information leaks from the held-out fold.
            sampleweights_i = get_sample_weights(positive_votes[TrainIndices],
                                                 args.instance_weighting)

            TestX_i = features[TestIndices]
            Testy_i = labels[TestIndices]

            learner.fit(TrainX_i, Trainy_i, sample_weight=sampleweights_i)
            ypred_i = learner.predict(TestX_i)

            # FIX: sklearn metrics take (y_true, y_pred). The original passed
            # predictions first, which silently swapped precision and recall.
            acc = accuracy_score(Testy_i, ypred_i)
            pre = precision_score(Testy_i, ypred_i)
            rec = recall_score(Testy_i, ypred_i)
            # Shared task uses the harmonic mean of *accuracy* and recall.
            # FIX: guard the denominator — a fold where both are 0 raised
            # ZeroDivisionError in the original.
            f1 = 2 * acc * rec / (acc + rec) if (acc + rec) > 0 else 0.0

            scores["Accuracy"].append(acc)
            scores["F1"].append(f1)
            scores["Precision"].append(pre)
            scores["Recall"].append(rec)

        print("--")
        print(learner)
        for key in sorted(scores.keys()):
            currentmetric = np.array(scores[key])
            print("%s : %0.2f (+/- %0.2f)" % (key, currentmetric.mean(), currentmetric.std()))
        print("--")