# Trains two token-level character classifiers and applies them to every
# document in a directory:
#   * clf_full    -- windowed token features + word embeddings
#   * clf_backoff -- windowed token features only; used as a fallback for
#     tokens that are out of the embedding model's vocabulary
#
# NOTE(review): this chunk opens mid-statement -- the parenthesised pair
# below closes a FeatureStacker(...) call whose opening lies before this
# excerpt (presumably binding ``full_feature_vectorizer``); confirm
# against the full file.
    ('embeddings', WordEmbeddings(model)))

# The backoff vectorizer sees only a window of 3 surrounding tokens.
backoff_feature_vectorizer = FeatureStacker(('windower', Windower(window_size=3)))

# Fit both vectorizers on the training documents X (lists of token tuples)
# and flatten the per-document label sequences y into one flat label vector
# aligned with the per-token feature rows.
X_full = full_feature_vectorizer.fit_transform([[word for word in doc] for doc in X])
X_backoff = backoff_feature_vectorizer.fit_transform([[word for word in doc] for doc in X])
y = LabelEncoder().fit_transform([l for labels in y for l in labels])
clf_full = LogisticRegression().fit(X_full, y)
clf_backoff = LogisticRegression().fit(X_backoff, y)

# Frog is the Dutch tagger/tokenizer; the int argument from sys.argv[3]
# is presumably a port or config selector -- TODO confirm against the
# Frog wrapper's constructor.
frogger = Frog(int(sys.argv[3]))

# Tag every file in the target directory (sys.argv[4]) and count the
# tokens predicted to be story characters.
for filename in glob.glob(os.path.join(sys.argv[4], "*")):
    print filename
    characters = Counter()
    with codecs.open(filename, encoding='utf-8') as infile:
        doc = infile.read()
    document = frogger.tag(doc)
    # Flatten Frog's sentence/token nesting, drop each token's last field,
    # and decode the remaining byte fields to unicode.
    document = [[f.decode('utf-8') for f in w[:-1]] for sentence in document for w in sentence]
    words = [word[0] for word in document]
    X_test_full = full_feature_vectorizer.transform([document])
    X_test_backoff = backoff_feature_vectorizer.transform([document])
    # Iterates feature rows; assumes one row per token so that row index i
    # aligns with words[i]/document[i] -- TODO confirm with the vectorizer.
    for i, word in enumerate(X_test_full):
        # Out-of-vocabulary tokens fall back to the window-only model.
        if words[i].lower() not in model:
            pred = clf_backoff.predict(X_test_backoff[i])[0]
        else:
            pred = clf_full.predict(X_test_full[i])[0]
        # Count positive predictions whose POS tag is noun ('N') or
        # proper name ('SPEC').
        if pred == 1 and document[i][2] in ('N', 'SPEC'):
            characters[document[i][0]] += 1
    # Report this file's characters, most frequent first.
    print ', '.join(sorted(characters, key=characters.__getitem__, reverse=True))
for filename in glob.glob(os.path.join(sys.argv[2], "SINVS*.ann")): if 'anomalies' in filename: continue story = Story.load(filename) if sys.argv[3] == 'chars': characters = {(start, end): (id, name) for character in story.characters for id, name, start, end in character.chain} else: characters = {(start, end): (id, name) for location in story.locations for id, name, start, end in location.chain} with codecs.open(filename.replace(".ann", ".txt"), encoding='utf-8') as f: orig_text = f.read() tokens = [word for sent in frogger.tag(orig_text) for word in sent] offsets = list( token_boundaries([t[0].decode('utf-8') for t in tokens], orig_text)) found_characters = {(start, end): False for start, end in characters} for char_start, char_end in characters: start_found = False for i, (start, end) in enumerate(offsets): if (start == char_start or (start < char_start < end) or (char_start < start < char_end)): start_found = True tokens[i] = Word(*(tokens[i][:-1] + ('animate', ))) for token in tokens: print '\t'.join(token) print '<FB/>'
for filename in glob.glob(os.path.join(sys.argv[2], "SINVS*.ann")): if 'anomalies' in filename: continue story = Story.load(filename) if sys.argv[3] == 'chars': characters = {(start, end): (id, name) for character in story.characters for id, name, start, end in character.chain} else: characters = {(start, end): (id, name) for location in story.locations for id, name, start, end in location.chain} with codecs.open(filename.replace(".ann", ".txt"), encoding='utf-8') as f: orig_text = f.read() tokens = [word for sent in frogger.tag(orig_text) for word in sent] offsets = list(token_boundaries([t[0].decode('utf-8') for t in tokens], orig_text)) found_characters = {(start, end): False for start, end in characters} for char_start, char_end in characters: start_found = False for i, (start, end) in enumerate(offsets): if (start == char_start or (start < char_start < end) or (char_start < start < char_end)): start_found = True tokens[i] = Word(*(tokens[i][:-1] + ('animate',))) for token in tokens: print '\t'.join(token) print '<FB/>'