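# Assumed module-level imports for the functions below. The scikit-learn paths
# match the older API these examples target (sklearn.externals.joblib,
# preprocessing.Imputer). Project-local helpers (create_directory, save_model,
# save_vectoriser, save_to_csv, evaluate, generate_eval_metrics,
# get_dataset_time, get_record_time, getAvgFeatureVecs, class_list) live
# elsewhere in the repository and are not reproduced here. train_test_split is
# deliberately left unimported: lin_svc and ensemble_classify use the
# scikit-learn version, while gensim_classifier calls a project helper of the
# same name with a different signature, so the functions presumably come from
# different modules.
import logging
import timeit

from gensim.models import Word2Vec
from sklearn.ensemble import AdaBoostClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import Imputer, label_binarize
from sklearn.svm import LinearSVC

from data_source import get_labelled_tweets, get_labels
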
def to_weka_arff(ngram, number_of_features):
  count_vect = TfidfVectorizer(ngram_range=(1, ngram), norm='l2', sublinear_tf=True)

  label_list = get_labels()
  tweet_list = get_labelled_tweets()

  features = count_vect.fit_transform(tweet_list)

  # keep the number_of_features highest-scoring features (chi-squared test against the labels)
  features = SelectKBest(chi2, k=number_of_features).fit_transform(features, label_list)
  print(features.shape)

  arff_data = []

  arff_data.append("@RELATION sport")

  for i in range(features.shape[1]):
    arff_data.append("@ATTRIBUTE feature" + str(i) + " REAL")
  arff_data.append("@ATTRIBUTE sportclass {neutral,neg,pos}")

  arff_data.append("@DATA")

  array_features = features.toarray()
  for i in range(len(array_features)):
    feature = array_features[i]
    label = label_list[i]
    csv_feature = ",".join(str(x) for x in feature)
    csv_feature = csv_feature + "," + label
    arff_data.append(csv_feature)

  with open('data/sport.arff', 'w') as arff_file:
    for item in arff_data:
      arff_file.write("%s\n" % item)
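
# For reference, the data/sport.arff file written above has this shape:
#   @RELATION sport
#   @ATTRIBUTE feature0 REAL
#   ...one REAL attribute per selected feature...
#   @ATTRIBUTE sportclass {neutral,neg,pos}
#   @DATA
#   <comma-separated tf-idf values>,<label>   (one row per tweet)
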
def find_and_save_timings():
  tweet_list = get_labelled_tweets()
  num_tweets = len(tweet_list)

  setup = """
from data_source import get_labelled_tweets, get_labels;
from sklearn.externals import joblib;
tweet_list = get_labelled_tweets();
# do transformation into vector;
vectoriser = joblib.load('model/tfidf_vectoriser.pkl');
vectorised_tweet_list = vectoriser.transform(tweet_list);
svm_model = joblib.load('model/tfidf_linsvc.pkl');
svm_model.predict(vectorised_tweet_list);
"""

  test_statement = 'svm_model.predict(vectorised_tweet_list)'
  REPETITIONS = 100

  # time REPETITIONS runs of the SVM prediction; timeit.timeit returns the total
  # elapsed time in seconds (the helpers below presumably break this down per
  # dataset pass and per record)
  svm_time = timeit.timeit(stmt=test_statement, setup=setup, number=REPETITIONS)
  svm_time_dataset = get_dataset_time(svm_time, REPETITIONS)
  svm_time_record = get_record_time(svm_time_dataset, num_tweets)

  setup_ensemble = """
import cPickle;
from data_source import get_labelled_tweets;
from sklearn.externals import joblib;

tweet_list = get_labelled_tweets();
vectoriser = joblib.load('model/tfidf_vectoriser.pkl');
vectorised_tweet_list = vectoriser.transform(tweet_list);
with open('model/tfidf_ada.pickle', 'rb') as f:
    ensemble_model = cPickle.load(f);
ensemble_model.predict(vectorised_tweet_list);
"""

  test_statement_ensemble = 'ensemble_model.predict(vectorised_tweet_list)'
  ensemble_time = timeit.timeit(stmt=test_statement_ensemble,
                                setup=setup_ensemble,
                                number=REPETITIONS)
  ens_time_dataset = get_dataset_time(ensemble_time, REPETITIONS)
  ens_time_record = get_record_time(ens_time_dataset, num_tweets)

  # save results in a txt file
  create_directory('metric_result')
  with open("metric_result/timings.txt", "w") as text_file:
    text_file.write("Number of records in dataset: {0}\n".format(num_tweets))
    text_file.write("Svm dataset time: {0}\n".format(svm_time_dataset))
    text_file.write("Svm record time: {0}\n".format(svm_time_record))
    text_file.write("Ensemble dataset time: {0}\n".format(ens_time_dataset))
    text_file.write("Ensemble record time: {0}\n".format(ens_time_record))
def gensim_classifier():
  logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  label_list = get_labels()
  tweet_list = get_labelled_tweets()

  # split all sentences to list of words
  sentences = []
  for tweet in tweet_list:
    temp_doc = tweet.split()
    sentences.append(temp_doc)

  # parameters for model
  num_features = 100
  min_word_count = 1
  num_workers = 4
  context = 2
  downsampling = 1e-3

  # Initialize and train the model
  w2v_model = Word2Vec(sentences, workers=num_workers, size=num_features,
                       min_count=min_word_count, window=context,
                       sample=downsampling, seed=1)

  index_value, train_set, test_set = train_test_split(0.80, sentences)
  train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
  test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)
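  # the imputation below presumably fills NaN rows left by getAvgFeatureVecs for
  # tweets whose tokens all fall outside the Word2Vec vocabulary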
  train_vector = Imputer().fit_transform(train_vector)
  test_vector = Imputer().fit_transform(test_vector)

  # train model and predict
  model = LinearSVC()
  classifier_fitted = OneVsRestClassifier(model).fit(train_vector, label_list[:index_value])
  result = classifier_fitted.predict(test_vector)

  # output result to csv
  create_directory('data')
  result.tofile("data/w2v_linsvc.csv", sep=',')

  # store the fitted classifier to mmap-able files
  # (the bare LinearSVC in `model` is never fitted; OneVsRestClassifier fits clones of it)
  create_directory('model')
  joblib.dump(classifier_fitted, 'model/%s.pkl' % 'w2v_linsvc')

  # evaluation
  label_score = classifier_fitted.decision_function(test_vector)
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(label_list, classes=class_list)

  evaluate(binarise_result, binarise_labels[index_value:], label_score, 'w2v_linsvc')
def lin_svc():
  label_list = get_labels()
  tweet_list = get_labelled_tweets()
  # vectorise using tf-idf
  vectoriser = TfidfVectorizer(min_df=3,
                               max_features=None,
                               strip_accents='unicode',
                               analyzer='word',
                               token_pattern=r'\w{1,}',
                               ngram_range=(1, 2),
                               use_idf=1,
                               smooth_idf=1,
                               sublinear_tf=1,)

  # transform the tweets into tf-idf vectors
  fitted_vectoriser = vectoriser.fit(tweet_list)
  vectorised_tweet_list = fitted_vectoriser.transform(tweet_list)
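  # note: test_size=0.8 trains on only 20% of the tweets and holds out 80% for evaluation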
  train_vector, test_vector, train_labels, test_labels = train_test_split(vectorised_tweet_list,
                                                                          label_list,
                                                                          test_size=0.8,
                                                                          random_state=42)

  # train model and predict
  model = LinearSVC()
  ovr_classifier = OneVsRestClassifier(model).fit(train_vector, train_labels)
  result = ovr_classifier.predict(test_vector)

  # output result to csv
  create_directory('data')
  save_to_csv("data/testset_labels.csv", test_labels)
  result.tofile("data/tfidf_linsvc.csv", sep=',')

  save_model(ovr_classifier, 'tfidf_linsvc')
  save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

  # evaluation
  label_score = ovr_classifier.decision_function(test_vector)
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(test_labels, classes=class_list)

  evaluate(binarise_result, binarise_labels, label_score, 'tfidf_linsvc')
def ensemble_classify():
  label_list = get_labels()
  tweet_list = get_labelled_tweets()
  # vectorise using tf-idf
  vectoriser = TfidfVectorizer(min_df=3,
                               max_features=None,
                               strip_accents='unicode',
                               analyzer='word',
                               token_pattern=r'\w{1,}',
                               ngram_range=(1, 2),
                               use_idf=1,
                               smooth_idf=1,
                               sublinear_tf=1,)

  # transform the tweets into tf-idf vectors
  vectoriser.fit(tweet_list)
  vectorised_tweet_list = vectoriser.transform(tweet_list)
  train_vector, test_vector, train_labels, test_labels = train_test_split(vectorised_tweet_list,
                                                                          label_list,
                                                                          test_size=0.8,
                                                                          random_state=42)

  n_estimators = 10  # number of weak learners
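  # AdaBoostClassifier uses depth-1 decision trees (stumps) as its default weak learner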
  model = AdaBoostClassifier(n_estimators=n_estimators)
  ada_classifier = model.fit(train_vector, train_labels)
  result = ada_classifier.predict(test_vector)

  # output result to csv
  create_directory('data')
  result.tofile("data/tfidf_ada.csv", sep=',')
  save_model(ada_classifier, 'tfidf_ada')

  # evaluation
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(test_labels, classes=class_list)
  generate_eval_metrics(binarise_result, 'tfidf_ada', binarise_labels)