def calculate_kappa(filename):
    """Print the kappa agreement between two simulated annotators.

    Reads ``data/<filename>_data_result.json``, collects the ``label`` of
    every row, derives two label sets from it with ``change_some_values``,
    writes them to ``data/label_1.csv`` and ``data/label_2.csv``, and prints
    the kappa score reported by ``AnnotationTask``.

    Args:
        filename: Prefix of the ``_data_result.json`` file inside ``data/``.
    """
    # Load the reference labels.
    with open('data/' + filename + '_data_result.json') as json_file:
        label_list = [row['label'] for row in json.load(json_file)]

    # Generate two fake label sets to calculate kappa.
    # NOTE(review): presumably change_some_values returns a perturbed copy of
    # its input -- confirm against its definition.
    man_1_label = change_some_values(label_list)
    man_2_label = change_some_values(label_list)

    # Save the labels to csv files.
    save_to_csv('data/label_1.csv', man_1_label)
    save_to_csv('data/label_2.csv', man_2_label)

    # Build (coder, item, label) triples for the inter-annotator agreement.
    # Item ids are derived from the first label set for both coders, as before.
    item_num_list = range(len(man_1_label))
    # zip() is materialised with list() so the two sequences can be
    # concatenated on Python 3 as well as Python 2.
    civ_1 = list(zip(['c1'] * len(man_1_label), item_num_list, man_1_label))
    civ_2 = list(zip(['c2'] * len(man_2_label), item_num_list, man_2_label))
    task = AnnotationTask(data=civ_1 + civ_2)

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('kappa: ' + str(task.kappa()))
def lin_svc():
    """Train a one-vs-rest LinearSVC on TF-IDF features and evaluate it.

    Loads labels and tweets through ``get_labels`` / ``get_labelled_tweets``,
    vectorises the tweets with word-level TF-IDF (unigrams and bigrams),
    splits the data, fits the classifier, writes the test labels and the
    predictions under ``data/``, persists the model and the vectoriser, and
    passes binarised predictions, binarised labels and decision scores to
    ``evaluate``.

    Relies on ``class_list`` being defined outside this function.
    """
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # Vectorise using tf-idf. The flags are real booleans: scikit-learn
    # releases that validate constructor parameters reject integer 0/1 here.
    vectoriser = TfidfVectorizer(
        min_df=3,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 2),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
    )

    # Do the transformation into vectors.
    # NOTE(review): the vectoriser is fitted on every tweet before the split,
    # so its vocabulary and IDF weights also reflect the test portion.
    fitted_vectoriser = vectoriser.fit(tweet_list)
    vectorised_tweet_list = fitted_vectoriser.transform(tweet_list)

    # NOTE(review): test_size=0.8 holds out 80% of the samples for testing;
    # confirm this is intended rather than an 80/20 train/test split.
    train_vector, test_vector, train_labels, test_labels = train_test_split(
        vectorised_tweet_list, label_list, test_size=0.8, random_state=42)

    # Train model and predict.
    model = LinearSVC()
    ovr_classifier = OneVsRestClassifier(model).fit(train_vector, train_labels)
    result = ovr_classifier.predict(test_vector)

    # Output result to csv and persist the fitted objects.
    create_directory('data')
    save_to_csv("data/testset_labels.csv", test_labels)
    result.tofile("data/tfidf_linsvc.csv", sep=',')
    save_model(ovr_classifier, 'tfidf_linsvc')
    save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

    # Evaluation.
    label_score = ovr_classifier.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)
    evaluate(binarise_result, binarise_labels, label_score, 'tfidf_linsvc')
def lin_svc():
    """Fit, persist and evaluate a TF-IDF + one-vs-rest LinearSVC pipeline.

    Steps, in order:
      1. Fetch labels and tweets via ``get_labels`` / ``get_labelled_tweets``.
      2. Build word-level TF-IDF features over unigrams and bigrams.
      3. Split features and labels with a fixed ``random_state``.
      4. Fit ``OneVsRestClassifier(LinearSVC())`` and predict the test part.
      5. Write test labels and predictions under ``data/`` and save the
         classifier and the vectoriser.
      6. Hand binarised predictions/labels and decision scores to
         ``evaluate``.

    ``class_list`` is read from the enclosing module scope.
    """
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # Vectorise using tf-idf. Boolean options are passed as True rather than
    # 1, because scikit-learn versions with parameter validation refuse
    # integers for use_idf / smooth_idf / sublinear_tf.
    vectoriser = TfidfVectorizer(
        min_df=3,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 2),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
    )

    # fit_transform learns the vocabulary and produces the matrix in one
    # pass; the estimator is fitted in place, so it is the object saved below.
    # NOTE(review): fitting happens before the split, so the test tweets
    # contribute to the vocabulary and IDF weights.
    vectorised_tweet_list = vectoriser.fit_transform(tweet_list)
    fitted_vectoriser = vectoriser

    # NOTE(review): with test_size=0.8 only 20% of the data is used for
    # training -- verify that this ratio is deliberate.
    train_vector, test_vector, train_labels, test_labels = train_test_split(
        vectorised_tweet_list,
        label_list,
        test_size=0.8,
        random_state=42,
    )

    # Train model and predict.
    ovr_classifier = OneVsRestClassifier(LinearSVC())
    ovr_classifier = ovr_classifier.fit(train_vector, train_labels)
    result = ovr_classifier.predict(test_vector)

    # Output result to csv.
    create_directory('data')
    save_to_csv("data/testset_labels.csv", test_labels)
    result.tofile("data/tfidf_linsvc.csv", sep=',')
    save_model(ovr_classifier, 'tfidf_linsvc')
    save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

    # Evaluation.
    label_score = ovr_classifier.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)
    evaluate(binarise_result, binarise_labels, label_score, 'tfidf_linsvc')
def preprocess(filename):
    """Export preprocessed tweet texts and their labels to CSV files.

    Tweet texts come from ``data/<filename>_data.json``; each text is encoded
    to ASCII with unencodable characters dropped, cleaned by
    ``preprocess_tweets`` using the English stop-word list, and written to
    ``data/labelled_tweet.csv``. Labels come from
    ``data/<filename>_data_result.json`` and are written to
    ``data/label_api.csv``.

    Args:
        filename: Shared prefix of the two JSON files inside ``data/``.
    """
    base_path = 'data/' + filename

    # Tweet texts: read, strip non-ASCII characters, preprocess, save.
    with open(base_path + '_data.json') as tweet_file:
        raw_tweets = json.load(tweet_file)
    ascii_texts = [
        entry['text'].encode('ascii', 'ignore') for entry in raw_tweets
    ]
    english_stop_words = stopwords.words('english')
    cleaned_tweets = preprocess_tweets(ascii_texts, english_stop_words)
    save_to_csv('data/labelled_tweet.csv', cleaned_tweets)

    # Labels: read only after the tweets were written, then save.
    with open(base_path + '_data_result.json') as result_file:
        labelled_rows = json.load(result_file)
    labels = [entry['label'] for entry in labelled_rows]
    save_to_csv('data/label_api.csv', labels)