import cPickle as pickle from labr import LABR import numpy as np import os from qalsadi import analex from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.naive_bayes import MultinomialNB, BernoulliNB from sklearn.svm import LinearSVC from sklearn import metrics from sklearn.ensemble.forest import RandomForestClassifier PATH = './' file_name = PATH + "experiments_acl2013.txt" # data sets gr = LABR() datas = [ dict(name="2-balanced", params=dict(klass="2", balanced="balanced")), dict(name="2-unbalanced", params=dict(klass="2", balanced="unbalanced")), dict(name="5-balanced", params=dict(klass="5", balanced="balanced")), dict(name="5-unbalanced", params=dict(klass="5", balanced="unbalanced")) ] # tokenizer an = analex() tokenizer = an.text_tokenize # features features = [ dict(name="count_ng1", feat=CountVectorizer(tokenizer=tokenizer, ngram_range=(1, 1))),
(x == "Positive") else x) df_AJGT["text"] = df_AJGT["text"].progress_apply(lambda x: preprocess( x, do_farasa_tokenization=True, farasa=farasa_segmenter, use_farasapy=True) ) train_AJGT, test_AJGT = train_test_split(df_AJGT, test_size=0.2, random_state=42) label_list_AJGT = [0, 1] data_AJGT = Dataset("AJGT", train_AJGT, test_AJGT, label_list_AJGT) all_datasets.append(data_AJGT) #%% # *************LABR-UN-Binary************ from labr import LABR labr_helper = LABR() (d_train, y_train, d_test, y_test) = labr_helper.get_train_test(klass="2", balanced="unbalanced") train_LABR_B_U = pd.DataFrame({"text": d_train, "label": y_train}) test_LABR_B_U = pd.DataFrame({"text": d_test, "label": y_test}) train_LABR_B_U["text"] = train_LABR_B_U["text"].progress_apply( lambda x: preprocess(x, do_farasa_tokenization=True, farasa=farasa_segmenter, use_farasapy=True)) test_LABR_B_U["text"] = test_LABR_B_U["text"].progress_apply( lambda x: preprocess(x, do_farasa_tokenization=True,
from __future__ import print_function

import os

try:
    # Python 2: the C-accelerated pickler lives in its own module.
    import cPickle as pickle
except ImportError:
    # Python 3: cPickle was folded into the standard ``pickle`` module.
    import pickle

import numpy as np
from qalsadi import analex
from sklearn import metrics
# Import from the public package path; the private ``sklearn.ensemble.forest``
# module is not available in newer scikit-learn releases.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC

from labr import LABR

# Main object that will read the train/test sets from files.
labr = LABR()

# Data sets, one entry per experimental setting:
#   * balanced / unbalanced
#   * 2-class / 5-class
# ``name`` labels the setting; ``params`` holds its ``klass`` and ``balanced``
# options as strings.
datas = [
    dict(name="2-balanced", params=dict(klass="2", balanced="balanced")),
    dict(name="2-unbalanced", params=dict(klass="2", balanced="unbalanced")),
    dict(name="5-balanced", params=dict(klass="5", balanced="balanced")),
    dict(name="5-unbalanced", params=dict(klass="5", balanced="unbalanced")),
]

# Tokenizer. This requires the Qalsadi python package.
an = analex()