import csv
import itertools

import nltk

# relies on app-level setup not shown here: graph (the Neo4j connection), the
# trained classifier, pos_features, BeautifulSoup, select and template


def get_episode(episode_id):
    """ Show specific episode """
    statement = """\
    MATCH (e:Episode {id: {episodeId}})-[r:TOPIC]->(topic)
    WITH e, r, topic ORDER BY r.score DESC
    RETURN e.id AS id, e.title AS title, e.season AS season, e.number AS number,
           COLLECT({id: topic.id, name: topic.value, score: r.score}) AS topics
    ORDER BY id
    """
    episode = graph.cypher.execute(statement, {"episodeId": int(episode_id)})[0]
    season = episode["season"]
    number = episode["number"]

    sentences = []
    with open("data/import/sentences.csv", "r") as sentences_file:
        reader = csv.reader(sentences_file, delimiter=",")
        reader.next()  # skip the header row
        for row in reader:
            if int(row[1]) == int(episode["id"]):
                tokenized_sentence = nltk.word_tokenize(row[4].decode("utf-8"))
                sentence_pos = nltk.pos_tag(tokenized_sentence)
                word_pos = [(word, classifier.classify(pos_features(tokenized_sentence, sentence_pos, i)))
                            for i, word in enumerate(tokenized_sentence)]
                # the speaker is the leading run of tokens the classifier marks True
                speaker = list(itertools.takewhile(lambda x: x[1] == True, word_pos))
                sentences.append(("".join(s[0] for s in speaker), row[4]))

    transcript = open("data/transcripts/S%d-Ep%d" % (season, number)).read()
    soup = BeautifulSoup(transcript)
    rows = select(soup, "table.tablebg tr td.post-body div.postbody")

    return template("episode", episode=episode, transcript=rows[0], sentences=sentences)
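For reference, get_episode only reads columns 1 (the episode id) and 4 (the raw sentence text) of sentences.csv; the other columns are never touched here, so their meaning is left unspecified:

# sentences.csv, layout inferred purely from the indexing above (hypothetical):
#   row[0], row[1] = episode id, row[2], row[3], row[4] = sentence text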
import nltk
from nltk import ClassifierI

from himymutil.ml import pos_features


class NaiveClassifier(ClassifierI):
    def classify(self, featureset):
        if featureset['next-word'] == ":":
            return True
        else:
            return False


if __name__ == '__main__':
    classifier = NaiveClassifier()
    sentence = "Ted from 2030: Oh, we were big fans of New York's annual Halloween parade. I don't mean the one that takes place Halloween night in the Village. I mean the one that takes place the morning of November 1st, the Annual Post Halloween Walk of Shame Parade."
    tokenized_sentence = nltk.word_tokenize(sentence)
    for i, word in enumerate(tokenized_sentence):
        print "{0} -> {1}".format(word, classifier.classify(pos_features(tokenized_sentence, i)))
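pos_features is imported from himymutil.ml and isn't shown in this section. The only contract NaiveClassifier relies on is a 'next-word' key in the featureset, so a minimal, purely illustrative sketch of a compatible extractor might look like this:

# Illustrative sketch only; the real implementation lives in himymutil.ml.
def pos_features(sentence, i):
    features = {"word": sentence[i]}
    # the naive classifier only ever looks at this key
    features["next-word"] = sentence[i + 1] if i < len(sentence) - 1 else "<END>"
    features["prev-word"] = sentence[i - 1] if i > 0 else "<START>"
    return features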
import json

import nltk
from sklearn.cross_validation import train_test_split

from himymutil.ml import pos_features

with open("data/import/trained_sentences.json", "r") as json_file:
    json_data = json.load(json_file)

tagged_sents = []
for sentence in json_data:
    tagged_sents.append([(word["word"], word["speaker"]) for word in sentence["words"]])

featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    sentence_pos = nltk.pos_tag(untagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent, sentence_pos, i), tag))

from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer()
X = vec.fit_transform([item[0] for item in featuresets]).toarray()
Y = [item[1] for item in featuresets]

# inspect the encoded matrix in a REPL:
# >>> len(X)
# >>> len(X[0])
# >>> vec.get_feature_names()[10:15]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, train_size=0.80)
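As a side note on what fit_transform is doing here: DictVectorizer one-hot encodes string feature values and passes numeric values straight through, which is why get_feature_names returns 'feature=value' style names. A toy example with made-up features:

from sklearn.feature_extraction import DictVectorizer

toy = [{"next-word": ":", "position": 1},
       {"next-word": "Oh", "position": 2}]
vec = DictVectorizer()
print vec.fit_transform(toy).toarray()
# [[ 1.  0.  1.]
#  [ 0.  1.  2.]]
print vec.get_feature_names()
# ['next-word=:', 'next-word=Oh', 'position']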
def extract_speaker(sentence):
    tokenized_sentence = nltk.word_tokenize(sentence)
    word_pos = [(word, classifier.classify(pos_features(tokenized_sentence, i)))
                for i, word in enumerate(tokenized_sentence)]
    # as in get_episode, keep the leading run of tokens classified True
    speaker = list(itertools.takewhile(lambda x: x[1] == True, word_pos))
    return "".join(s[0] for s in speaker)
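A quick sanity check using the NaiveClassifier from earlier; a single-word speaker is the case its ':' heuristic gets right (this assumes pos_features exposes the 'next-word' key):

classifier = NaiveClassifier()
print extract_speaker("Marshall: I'm going to be a judge!")
# Marshall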
import copy
import json

import nltk
from sklearn.cross_validation import train_test_split

from himymutil.naive import NaiveClassifier
from himymutil.ml import pos_features, assess_classifier

with open("data/import/trained_sentences.json", "r") as json_file:
    json_data = json.load(json_file)

tagged_sents = []
for sentence in json_data:
    tagged_sents.append([(word["word"], word["speaker"]) for word in sentence["words"]])

featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    sentence_pos = nltk.pos_tag(untagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent, sentence_pos, i), tag))

train_data, test_data = train_test_split(featuresets, test_size=0.20, train_size=0.80)

table = []
table.append(assess_classifier(NaiveClassifier(), test_data, "Naive"))
table.append(assess_classifier(nltk.NaiveBayesClassifier.train(train_data), test_data, "Naive Bayes"))
table.append(assess_classifier(nltk.DecisionTreeClassifier.train(train_data), test_data, "Decision Tree All In"))


def get_rid_of(entry, *keys):
    for key in keys:
        del entry[key]


tmp_train_data = copy.deepcopy(train_data)
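The snippet stops just after copying train_data, but get_rid_of together with the deep copy points at a feature-ablation step: delete some features from the copied data, retrain, and add the result to the comparison table. A hedged sketch of that continuation, in which the dropped feature names are only placeholders, not necessarily what was actually removed:

# Hypothetical continuation: drop a couple of features and retrain the tree.
tmp_train_data = copy.deepcopy(train_data)
tmp_test_data = copy.deepcopy(test_data)
for entry, tag in tmp_train_data + tmp_test_data:
    get_rid_of(entry, "word", "next-word")  # placeholder feature names

table.append(assess_classifier(nltk.DecisionTreeClassifier.train(tmp_train_data),
                               tmp_test_data, "Decision Tree (fewer features)"))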