# Feature-ablation experiment: train a logistic-regression classifier under
# every feature configuration and every training-set size (10%..100%),
# recording micro-averaged F1 in scores[size_index, experiment_index].
#
# Base feature sets, each base set augmented with embeddings, plus
# embeddings on their own.
experiments = [("word",),
               ("word", "pos"),
               ("word", "pos", "root"),
               ("word", "pos", "root", "rel")]
experiments = experiments + [experiment + ("embeddings",)
                             for experiment in experiments]
experiments += [("embeddings",)]

scores = np.zeros((10, len(experiments)))
sizes = []
for e, experiment in enumerate(experiments):
    sizes = []
    print(experiment)
    for i, train_size in enumerate(np.arange(0.1, 1.1, 0.1)):
        size = int(len(X_train_docs) * train_size)
        sizes.append(size)
        # BUG FIX: the original rebound the loop variable `experiment`
        # in the embeddings-only branch, so that branch was taken only on
        # the FIRST inner iteration; every later train size silently fell
        # through to the windower+embeddings branch. Bind the adjusted
        # feature set to a separate name so the test stays stable.
        if experiment == ("embeddings",):
            # Embeddings alone still need the word column to look up vectors.
            features = FeatureStacker(("embeddings", WordEmbeddings(model)))
            feature_set = ("word",) + experiment
        else:
            # NOTE(review): embeddings are stacked here even for feature
            # sets without "embeddings" — presumably include_features()
            # restricts the columns; confirm against its implementation.
            features = FeatureStacker(
                ("windower", Windower(window_size=3)),
                ("embeddings", WordEmbeddings(model)))
            feature_set = experiment
        X_train = include_features(X_train_docs[:size], feature_set)
        X_test = include_features(X_test_docs, feature_set)
        X_train = features.fit_transform(X_train)
        X_test = features.transform(X_test)
        # Fit the label encoding on the training labels and reuse it for
        # the test labels so class indices agree.
        le = LabelEncoder()
        y_train = le.fit_transform(y_train_docs[:X_train.shape[0]])
        y_test = le.transform(y_test_docs)
        clf = LogisticRegression(C=1.0)
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        scores[i, e] = f1_score(y_test, preds, average="micro")
# --- tail of load_data(...) ------------------------------------------------
# NOTE(review): this chunk begins mid-function; the enclosing
# `def load_data(path, limit=...)` header is above the visible region.
# Input appears to be one tab-separated token record per line, with a
# "<FB/>" marker line starting a new document; the last field is the
# animacy label and the preceding fields are the token's features.
    for i, line in enumerate(infile):
        # Optional cap on the number of input lines consumed.
        if limit is not None and i >= limit:
            break
        if line.startswith("<FB/>"):
            # Document boundary: open fresh feature/label buckets.
            X.append([])
            y.append([])
        else:
            fields = line.strip().split('\t')
            # Empty feature columns become None placeholders; the final
            # field is the label, so it is excluded from the features.
            X[-1].append([field if field else None for field in fields[:-1]])
            assert X[-1]
            # Binarize the label: 'animate' -> 'yes', anything else -> 'no'.
            y[-1].append('yes' if fields[-1] == 'animate' else 'no')
    return X, y

# --- script driver ---------------------------------------------------------
# Train two animacy classifiers on the corpus named by argv[1]: a "full"
# model (windower + word embeddings) and a "backoff" model (windower only),
# then iterate over the raw files in the directory named by argv[4].
X, y = load_data(sys.argv[1], limit=None)
# Pre-trained word2vec vectors in binary format (argv[2]).
model = Word2Vec.load_word2vec_format(sys.argv[2], binary=True)
full_feature_vectorizer = FeatureStacker(
    ('windower', Windower(window_size=3)),
    ('embeddings', WordEmbeddings(model)))
backoff_feature_vectorizer = FeatureStacker(
    ('windower', Windower(window_size=3)))
X_full = full_feature_vectorizer.fit_transform(
    [[word for word in doc] for doc in X])
X_backoff = backoff_feature_vectorizer.fit_transform(
    [[word for word in doc] for doc in X])
# Flatten the per-document label lists into one sequence and encode it.
y = LabelEncoder().fit_transform([l for labels in y for l in labels])
clf_full = LogisticRegression().fit(X_full, y)
clf_backoff = LogisticRegression().fit(X_backoff, y)
# Frog NLP tool wrapper; argv[3] presumably selects a port or config —
# TODO confirm against the Frog class.
frogger = Frog(int(sys.argv[3]))
# NOTE(review): this loop is truncated at the end of the visible region;
# its body continues beyond this chunk.
for filename in glob.glob(os.path.join(sys.argv[4], "*")):
    print filename
    characters = Counter()
    with codecs.open(filename, encoding='utf-8') as infile:
        doc = infile.read()
# Feature-ablation experiment (variant of the loop above): extend a
# previously defined list of base feature sets with embedding-augmented
# copies and an embeddings-only configuration, then sweep training size.
# scores[i, e] is micro-F1 for the i-th train size and e-th configuration.
experiments = experiments + [
    experiment + ('embeddings', ) for experiment in experiments
]
experiments += [('embeddings', )]
scores = np.zeros((10, len(experiments)))
sizes = []
for e, experiment in enumerate(experiments):
    sizes = []
    print(experiment)
    for i, train_size in enumerate(np.arange(0.1, 1.1, 0.1)):
        size = int(len(X_train_docs) * train_size)
        sizes.append(size)
        # BUG FIX: the original rebound the loop variable `experiment`
        # in the embeddings-only branch, so only the first train size ran
        # as embeddings-only; later iterations fell into the else branch.
        # Use a separate name for the adjusted feature set.
        if experiment == ('embeddings', ):
            # Embeddings alone still need the word column for lookups.
            features = FeatureStacker(('embeddings', WordEmbeddings(model)))
            feature_set = ('word', ) + experiment
        else:
            features = FeatureStacker(
                ('windower', Windower(window_size=3)),
                ('embeddings', WordEmbeddings(model)))
            feature_set = experiment
        X_train = include_features(X_train_docs[:size], feature_set)
        X_test = include_features(X_test_docs, feature_set)
        X_train = features.fit_transform(X_train)
        X_test = features.transform(X_test)
        # Fit the label encoder on training labels; reuse it for test
        # labels so class indices are consistent.
        le = LabelEncoder()
        y_train = le.fit_transform(y_train_docs[:X_train.shape[0]])
        y_test = le.transform(y_test_docs)
        clf = LogisticRegression(C=1.0)
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        scores[i, e] = f1_score(y_test, preds, average='micro')