import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# FeatureStacker, Windower, WordEmbeddings and include_features are
# project-local helpers; model, X_train_docs, y_train_docs, X_test_docs
# and y_test_docs are assumed to be defined earlier in the script.

# Feature ablation experiments: each tuple names one feature set.
experiments = [("word",), ("word", "pos"), ("word", "pos", "root"), ("word", "pos", "root", "rel")]
# Add an embeddings-augmented variant of every feature set, plus an
# embeddings-only run: 4 + 4 + 1 = 9 experiments in total.
experiments = experiments + [experiment + ("embeddings",) for experiment in experiments]
experiments += [("embeddings",)]

# scores[i, e]: micro-averaged F1 at training fraction i for experiment e.
scores = np.zeros((10, len(experiments)))

for e, experiment in enumerate(experiments):
    sizes = []
    print(experiment)
    # Ten training fractions, from 10% to 100% of the training documents.
    for i, train_size in enumerate(np.arange(0.1, 1.1, 0.1)):
        size = int(len(X_train_docs) * train_size)
        sizes.append(size)
        if experiment == ("embeddings",):
            # Embeddings-only run: no windowed features, but keep the word
            # column so the embedding lookup has tokens to work with.
            features = FeatureStacker(("embeddings", WordEmbeddings(model)))
            feature_set = ("word",) + experiment
        else:
            features = FeatureStacker(("windower", Windower(window_size=3)),
                                      ("embeddings", WordEmbeddings(model)))
            feature_set = experiment
        X_train = include_features(X_train_docs[:size], feature_set)
        X_test = include_features(X_test_docs, feature_set)
        X_train = features.fit_transform(X_train)
        X_test = features.transform(X_test)
        le = LabelEncoder()
        y_train = le.fit_transform(y_train_docs[:X_train.shape[0]])
        y_test = le.transform(y_test_docs)
        clf = LogisticRegression(C=1.0)
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        scores[i, e] = f1_score(y_test, preds, average="micro")
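
The sizes and scores collected above are presumably meant for a learning-curve
plot; that step is not shown in the example, so the matplotlib snippet below is
only a sketch (axis labels and styling are assumptions):

import matplotlib.pyplot as plt

# One curve per experiment: micro-averaged F1 against training-set size.
for e, experiment in enumerate(experiments):
    plt.plot(sizes, scores[:, e], marker="o", label="+".join(experiment))
plt.xlabel("number of training documents")
plt.ylabel("micro-averaged F1")
plt.legend(loc="lower right", fontsize="small")
plt.show()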
Example #2

import sys
import glob
import os
import codecs
from collections import Counter
from gensim.models import Word2Vec

def load_data(filename, limit=None):
    # "<FB/>" marks a new document; other lines are tab-separated token fields.
    X, y = [], []
    with codecs.open(filename, encoding='utf-8') as infile:
        for i, line in enumerate(infile):
            if limit is not None and i >= limit:
                break
            if line.startswith("<FB/>"):
                X.append([])
                y.append([])
            else:
                fields = line.strip().split('\t')
                X[-1].append([field if field else None for field in fields[:-1]])
                assert X[-1]
                y[-1].append('yes' if fields[-1] == 'animate' else 'no')
    return X, y
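
For reference, the input format load_data expects, inferred from the parser
above; the example values and the word/pos/root/rel column order are guesses
based on the feature names used in the first example:

# <FB/>
# jongen<TAB>N<TAB>jongen<TAB>su<TAB>animate
# boek<TAB>N<TAB>boek<TAB>obj1<TAB>other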

X, y = load_data(sys.argv[1], limit=None)
# Pre-1.0 gensim API; current gensim uses KeyedVectors.load_word2vec_format.
model = Word2Vec.load_word2vec_format(sys.argv[2], binary=True)
full_feature_vectorizer = FeatureStacker(('windower', Windower(window_size=3)),
                                         ('embeddings', WordEmbeddings(model)))
backoff_feature_vectorizer = FeatureStacker(('windower', Windower(window_size=3)))
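
FeatureStacker is not a scikit-learn class; from its use in these examples it
appears to behave like sklearn's FeatureUnion, stacking each named
transformer's output column-wise. A minimal sketch of that assumed interface,
not the original implementation:

import scipy.sparse as sp

class FeatureStacker(object):
    # Stack (name, transformer) outputs column-wise, FeatureUnion-style.
    def __init__(self, *named_transformers):
        self.named_transformers = named_transformers

    def fit_transform(self, X):
        return sp.hstack([t.fit_transform(X)
                          for _, t in self.named_transformers]).tocsr()

    def transform(self, X):
        return sp.hstack([t.transform(X)
                          for _, t in self.named_transformers]).tocsr()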

# Featurize the whole corpus twice: with and without the embedding features.
X_full = full_feature_vectorizer.fit_transform([[word for word in doc] for doc in X])
X_backoff = backoff_feature_vectorizer.fit_transform([[word for word in doc] for doc in X])
# Flatten the per-document label lists to one label per token.
y = LabelEncoder().fit_transform([l for labels in y for l in labels])

clf_full = LogisticRegression().fit(X_full, y)
clf_backoff = LogisticRegression().fit(X_backoff, y)
# Frog presumably wraps the Frog Dutch NLP tool; the meaning of the
# constructor argument is not shown in this snippet.
frogger = Frog(int(sys.argv[3]))

for filename in glob.glob(os.path.join(sys.argv[4], "*")):
    print(filename)
    characters = Counter()
    with codecs.open(filename, encoding='utf-8') as infile:
        doc = infile.read()
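
The example breaks off after reading each file. Presumably every document is
then tokenized with frogger and labelled by one of the two classifiers; the
sketch below shows that last step, with frogger.process standing in for
whatever the (unshown) wrapper actually exposes:

    # Hypothetical continuation of the loop body; the wrapper's real API and
    # token output format are not shown in the example.
    frogged_doc = frogger.process(doc)
    X_new = backoff_feature_vectorizer.transform([frogged_doc])
    predictions = clf_backoff.predict(X_new)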