def to_weka_arff(ngram, number_of_features, output_path='data/sport.arff'):
  """Vectorise labelled tweets with tf-idf, keep the best features by a
  chi-squared test, and write the result as a Weka ARFF file.

  Args:
    ngram: upper bound of the n-gram range passed to the vectoriser (1..ngram).
    number_of_features: number of features retained by SelectKBest.
    output_path: destination ARFF file; default preserves original behaviour.
  """
  count_vect = TfidfVectorizer(ngram_range=(1, ngram), norm='l2', sublinear_tf=True)
  label_list = get_labels()
  tweet_list = get_labelled_tweets()

  features = count_vect.fit_transform(tweet_list)
  features = SelectKBest(chi2, k=number_of_features).fit_transform(features, label_list)
  print(features.shape)

  # ARFF header: relation name, one REAL attribute per selected feature,
  # then the class attribute.
  arff_data = ["@RELATION sport"]
  arff_data.extend("@ATTRIBUTE feature" + str(i) + " REAL"
                   for i in range(features.shape[1]))
  arff_data.append("@ATTRIBUTE sportclass {neutral,neg,pos}")
  arff_data.append("@DATA")

  # Each data row is the dense feature vector followed by its class label.
  for feature, label in zip(features.toarray(), label_list):
    csv_feature = ",".join(str(x) for x in feature)
    arff_data.append(csv_feature + "," + label)

  # `out_file` instead of `file` to avoid shadowing the builtin.
  with open(output_path, 'w') as out_file:
    for item in arff_data:
      out_file.write("%s\n" % item)
def to_weka_arff(ngram, number_of_features):
  """Turn labelled tweets into a chi2-selected tf-idf matrix and dump it to
  data/sport.arff in Weka ARFF format."""
  vectoriser = TfidfVectorizer(ngram_range=(1, ngram), norm='l2', sublinear_tf=True)
  labels = get_labels()
  tweets = get_labelled_tweets()

  matrix = vectoriser.fit_transform(tweets)
  matrix = SelectKBest(chi2, k=number_of_features).fit_transform(matrix, labels)
  print(matrix.shape)

  # Header section: relation, one REAL attribute per feature, class attribute.
  header = ["@RELATION sport"]
  header += ["@ATTRIBUTE feature" + str(idx) + " REAL" for idx in range(matrix.shape[1])]
  header.append("@ATTRIBUTE sportclass {neutral,neg,pos}")
  header.append("@DATA")

  # Data section: comma-separated feature values, class label last.
  rows = []
  for vec, label in zip(matrix.toarray(), labels):
    rows.append(",".join(str(v) for v in vec) + "," + label)

  with open('data/sport.arff', 'w') as fh:
    for line in header + rows:
      fh.write("%s\n" % line)
def gensim_classifier():
  """Train a word2vec + linear SVC (one-vs-rest) classifier on labelled
  tweets, persist the fitted model and its predictions, and evaluate.
  """
  logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  label_list = get_labels()
  tweet_list = get_labelled_tweets()

  # split all sentences to list of words
  sentences = [tweet.split() for tweet in tweet_list]

  # parameters for model
  num_features = 100     # word vector dimensionality
  min_word_count = 1     # keep every word (tweets are short)
  num_workers = 4
  context = 2            # context window size
  downsampling = 1e-3    # downsampling of frequent words

  # Initialize and train the model
  w2v_model = Word2Vec(sentences,
                       workers=num_workers,
                       size=num_features,
                       min_count=min_word_count,
                       window=context,
                       sample=downsampling,
                       seed=1)

  index_value, train_set, test_set = train_test_split(0.80, sentences)
  train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
  test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)
  # Imputer fills missing entries (presumably NaN rows from tweets with no
  # in-vocabulary words) with column means — TODO confirm against
  # getAvgFeatureVecs.
  train_vector = Imputer().fit_transform(train_vector)
  test_vector = Imputer().fit_transform(test_vector)

  # train model and predict
  model = LinearSVC()
  classifier_fitted = OneVsRestClassifier(model).fit(train_vector, label_list[:index_value])
  result = classifier_fitted.predict(test_vector)

  # output result to csv
  create_directory('data')
  result.tofile("data/w2v_linsvc.csv", sep=',')

  # store the model to mmap-able files
  create_directory('model')
  # BUG FIX: persist the fitted one-vs-rest classifier, not the unfitted
  # LinearSVC template — OneVsRestClassifier clones its base estimator, so
  # `model` itself is never fitted. This also matches lin_svc, which saves
  # the fitted classifier.
  joblib.dump(classifier_fitted, 'model/%s.pkl' % 'w2v_linsvc')

  # evaluation
  label_score = classifier_fitted.decision_function(test_vector)
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(label_list, classes=class_list)
  evaluate(binarise_result, binarise_labels[index_value:], label_score, 'w2v_linsvc')
def gensim_classifier():
  """Word2vec-based tweet classification: embed tweets, average the word
  vectors, fit a one-vs-rest linear SVC, save outputs, and evaluate."""
  logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  labels = get_labels()
  tweets = get_labelled_tweets()

  # tokenise every tweet into a list of words
  sentences = [text.split() for text in tweets]

  # word2vec hyper-parameters
  vector_dim = 100
  min_count = 1
  workers = 4
  window_size = 2
  sample_rate = 1e-3

  # build and train the embedding model
  w2v_model = Word2Vec(sentences, workers=workers, size=vector_dim,
                       min_count=min_count, window=window_size,
                       sample=sample_rate, seed=1)

  split_index, train_set, test_set = train_test_split(0.80, sentences)
  train_vector = getAvgFeatureVecs(train_set, w2v_model, vector_dim)
  test_vector = getAvgFeatureVecs(test_set, w2v_model, vector_dim)
  train_vector = Imputer().fit_transform(train_vector)
  test_vector = Imputer().fit_transform(test_vector)

  # fit a one-vs-rest linear SVC and predict on the held-out set
  model = LinearSVC()
  classifier_fitted = OneVsRestClassifier(model).fit(train_vector, labels[:split_index])
  result = classifier_fitted.predict(test_vector)

  # write predictions to csv
  create_directory('data')
  result.tofile("data/w2v_linsvc.csv", sep=',')

  # persist the model as mmap-able files
  create_directory('model')
  # NOTE(review): this dumps the base LinearSVC, not the fitted
  # OneVsRestClassifier (cf. lin_svc which saves the fitted classifier) —
  # confirm which object is intended.
  joblib.dump(model, 'model/%s.pkl' % 'w2v_linsvc')

  # evaluate against the binarised test labels
  label_score = classifier_fitted.decision_function(test_vector)
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(labels, classes=class_list)
  evaluate(binarise_result, binarise_labels[split_index:], label_score, 'w2v_linsvc')
def __init__(self, P=None, **kwargs):
  """Initialise parameters.

  If an explicit parameter dict `P` is given it is used as-is. Otherwise the
  params are assembled, per key of DEFAULT_PARAMS, from (in priority order):
  the keyword arguments, parameters previously saved under kwargs['name'],
  then the defaults.
  """
  if P is not None:
    self.params = P
    return

  # BUG FIX: copy the defaults instead of aliasing the shared module-level
  # dict — otherwise later self.set(...) calls would mutate DEFAULT_PARAMS
  # for every instance.
  self.params = dict(DEFAULT_PARAMS)

  given = kwargs  # original used locals()['kwargs'], which is the same object
  saved = load_params(given.get('name', 'missingNo'))
  if saved is None:
    saved = DEFAULT_PARAMS

  for key in DEFAULT_PARAMS:
    val = given.get(key)
    if val is None:
      val = saved.get(key)
    if val is None:
      continue
    self.set(key, val)

  if self.get('labels') is None:
    self.set('labels', ds.get_labels())
def lin_svc():
  """Tf-idf + linear SVC (one-vs-rest) tweet classifier: train, predict,
  persist model/vectoriser/predictions, and evaluate."""
  labels = get_labels()
  tweets = get_labelled_tweets()

  # vectorise using tf-idf
  vectoriser = TfidfVectorizer(min_df=3,
                               max_features=None,
                               strip_accents='unicode',
                               analyzer='word',
                               token_pattern=r'\w{1,}',
                               ngram_range=(1, 2),
                               use_idf=1,
                               smooth_idf=1,
                               sublinear_tf=1)

  ## do transformation into vector
  fitted_vectoriser = vectoriser.fit(tweets)
  vectors = fitted_vectoriser.transform(tweets)
  # NOTE(review): test_size=0.8 leaves only 20% of the data for training —
  # confirm the split is intentional.
  train_vector, test_vector, train_labels, test_labels = train_test_split(
      vectors, labels, test_size=0.8, random_state=42)

  # fit a one-vs-rest linear SVC and predict on the held-out set
  ovr_classifier = OneVsRestClassifier(LinearSVC()).fit(train_vector, train_labels)
  result = ovr_classifier.predict(test_vector)

  # write labels and predictions to csv
  create_directory('data')
  save_to_csv("data/testset_labels.csv", test_labels)
  result.tofile("data/tfidf_linsvc.csv", sep=',')

  save_model(ovr_classifier, 'tfidf_linsvc')
  save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

  # evaluate against the binarised test labels
  label_score = ovr_classifier.decision_function(test_vector)
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(test_labels, classes=class_list)
  evaluate(binarise_result, binarise_labels, label_score, 'tfidf_linsvc')
def lin_svc():
  """Classify tweets with a tf-idf representation and a one-vs-rest linear
  SVC; saves the fitted classifier, vectoriser and predictions, then runs
  the evaluation pipeline."""
  label_list = get_labels()
  tweet_list = get_labelled_tweets()

  # vectorise using tf-idf
  tfidf_params = dict(min_df=3,
                      max_features=None,
                      strip_accents='unicode',
                      analyzer='word',
                      token_pattern=r'\w{1,}',
                      ngram_range=(1, 2),
                      use_idf=1,
                      smooth_idf=1,
                      sublinear_tf=1)
  vectoriser = TfidfVectorizer(**tfidf_params)

  ## do transformation into vector
  fitted_vectoriser = vectoriser.fit(tweet_list)
  vectorised = fitted_vectoriser.transform(tweet_list)
  split = train_test_split(vectorised, label_list, test_size=0.8, random_state=42)
  train_vector, test_vector, train_labels, test_labels = split

  # train model and predict
  model = LinearSVC()
  ovr_classifier = OneVsRestClassifier(model).fit(train_vector, train_labels)
  result = ovr_classifier.predict(test_vector)

  # output result to csv
  create_directory('data')
  save_to_csv("data/testset_labels.csv", test_labels)
  result.tofile("data/tfidf_linsvc.csv", sep=',')

  save_model(ovr_classifier, 'tfidf_linsvc')
  save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

  # evaluation
  label_score = ovr_classifier.decision_function(test_vector)
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(test_labels, classes=class_list)
  evaluate(binarise_result, binarise_labels, label_score, 'tfidf_linsvc')
def ensemble_classify():
  """Tf-idf + AdaBoost tweet classifier: train, predict, persist the model
  and predictions, then generate evaluation metrics."""
  labels = get_labels()
  tweets = get_labelled_tweets()

  # vectorise using tf-idf
  tfidf = TfidfVectorizer(min_df=3,
                          max_features=None,
                          strip_accents='unicode',
                          analyzer='word',
                          token_pattern=r'\w{1,}',
                          ngram_range=(1, 2),
                          use_idf=1,
                          smooth_idf=1,
                          sublinear_tf=1)

  ## do transformation into vector
  tfidf.fit(tweets)
  vectors = tfidf.transform(tweets)
  train_vector, test_vector, train_labels, test_labels = train_test_split(
      vectors, labels, test_size=0.8, random_state=42)

  # fit an AdaBoost ensemble of weak learners and predict
  n_estimators = 10  # number of weak learners
  ada_classifier = AdaBoostClassifier(n_estimators=n_estimators).fit(
      train_vector, train_labels)
  result = ada_classifier.predict(test_vector)

  # write predictions to csv and persist the fitted model
  create_directory('data')
  result.tofile("data/tfidf_ada.csv", sep=',')
  save_model(ada_classifier, 'tfidf_ada')

  # evaluate against the binarised test labels
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(test_labels, classes=class_list)
  generate_eval_metrics(binarise_result, 'tfidf_ada', binarise_labels)
def ensemble_classify():
  """Train an AdaBoost classifier on tf-idf tweet vectors, save the model
  and its predictions, and produce evaluation metrics."""
  label_list = get_labels()
  tweet_list = get_labelled_tweets()

  # vectorise using tf-idf
  vec_options = dict(min_df=3,
                     max_features=None,
                     strip_accents='unicode',
                     analyzer='word',
                     token_pattern=r'\w{1,}',
                     ngram_range=(1, 2),
                     use_idf=1,
                     smooth_idf=1,
                     sublinear_tf=1)
  vectoriser = TfidfVectorizer(**vec_options)

  ## do transformation into vector
  vectoriser.fit(tweet_list)
  vectorised_tweet_list = vectoriser.transform(tweet_list)
  split = train_test_split(vectorised_tweet_list, label_list,
                           test_size=0.8, random_state=42)
  train_vector, test_vector, train_labels, test_labels = split

  n_estimators = 10  # number of weak learners
  model = AdaBoostClassifier(n_estimators=n_estimators)
  ada_classifier = model.fit(train_vector, train_labels)
  result = ada_classifier.predict(test_vector)

  # output result to csv
  create_directory('data')
  result.tofile("data/tfidf_ada.csv", sep=',')
  save_model(ada_classifier, 'tfidf_ada')

  # evaluation
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(test_labels, classes=class_list)
  generate_eval_metrics(binarise_result, 'tfidf_ada', binarise_labels)