Example no. 1
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3: cPickle was folded into pickle
from labr import LABR
import numpy as np
import os
from qalsadi import analex
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier  # public path; sklearn.ensemble.forest is private/removed

PATH = './'
file_name = PATH + "experiments_acl2013.txt"

# data sets
gr = LABR()
datas = [
    dict(name="2-balanced", params=dict(klass="2", balanced="balanced")),
    dict(name="2-unbalanced", params=dict(klass="2", balanced="unbalanced")),
    dict(name="5-balanced", params=dict(klass="5", balanced="balanced")),
    dict(name="5-unbalanced", params=dict(klass="5", balanced="unbalanced"))
]
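# --- Illustrative sketch, not part of the original snippet ---
# Each entry in `datas` can be expanded into an actual train/test split via
# LABR.get_train_test, whose keyword arguments mirror the `params` dicts above.
for d in datas:
    d_train, y_train, d_test, y_test = gr.get_train_test(**d["params"])
    print(d["name"], len(d_train), len(d_test))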

# tokenizer
an = analex.Analex()  # instantiate the morphological analyzer (class name per recent qalsadi releases)
tokenizer = an.text_tokenize

# features
features = [
    dict(name="count_ng1",
         feat=CountVectorizer(tokenizer=tokenizer, ngram_range=(1, 1))),
Example no. 2

# (only a fragment of this snippet's preamble survives; the statement below is
#  reconstructed on the assumption that "Positive" labels map to 1)
df_AJGT["label"] = df_AJGT["label"].apply(
    lambda x: 1 if (x == "Positive") else x)
df_AJGT["text"] = df_AJGT["text"].progress_apply(lambda x: preprocess(
    x, do_farasa_tokenization=True, farasa=farasa_segmenter, use_farasapy=True)
                                                 )
train_AJGT, test_AJGT = train_test_split(df_AJGT,
                                         test_size=0.2,
                                         random_state=42)
label_list_AJGT = [0, 1]

data_AJGT = Dataset("AJGT", train_AJGT, test_AJGT, label_list_AJGT)
all_datasets.append(data_AJGT)
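# For reference: `Dataset` and `all_datasets` come from earlier cells of the
# notebook this snippet belongs to. A minimal stand-in (field names are an
# assumption, not recovered from the original code) would be:
#
#     from collections import namedtuple
#     Dataset = namedtuple("Dataset", ["name", "train", "test", "label_list"])
#     all_datasets = []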
#%%
# *************LABR-UN-Binary************
from labr import LABR

labr_helper = LABR()

(d_train, y_train, d_test,
 y_test) = labr_helper.get_train_test(klass="2", balanced="unbalanced")

train_LABR_B_U = pd.DataFrame({"text": d_train, "label": y_train})
test_LABR_B_U = pd.DataFrame({"text": d_test, "label": y_test})

train_LABR_B_U["text"] = train_LABR_B_U["text"].progress_apply(
    lambda x: preprocess(x,
                         do_farasa_tokenization=True,
                         farasa=farasa_segmenter,
                         use_farasapy=True))
test_LABR_B_U["text"] = test_LABR_B_U["text"].progress_apply(
    lambda x: preprocess(x,
                         do_farasa_tokenization=True,
                         farasa=farasa_segmenter,
                         use_farasapy=True))
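# The snippet is cut off at this point. By analogy with the AJGT block above,
# the continuation presumably wraps the two DataFrames into a Dataset and
# appends it to all_datasets (assumed, with a hypothetical variable name):
#
#     data_LABR_B_U = Dataset("LABR-UN-Binary",
#                             train_LABR_B_U, test_LABR_B_U, [0, 1])
#     all_datasets.append(data_LABR_B_U)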
Example no. 3
from __future__ import print_function
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3: cPickle was folded into pickle
from labr import LABR
import numpy as np
import os
from qalsadi import analex
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier  # public path; sklearn.ensemble.forest is private/removed


# main object that will read the train/test sets from files.
labr = LABR()


# data sets: different settings:
#   * balanced/unbalanced
#   * 2-class or 5-class
datas = [
        dict(name="2-balanced", params=dict(klass="2", balanced="balanced")),
        dict(name="2-unbalanced",
             params=dict(klass="2", balanced="unbalanced")),
        dict(name="5-balanced", params=dict(klass="5", balanced="balanced")),
        dict(name="5-unbalanced",
             params=dict(klass="5", balanced="unbalanced"))]
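# --- Illustrative sketch, not part of the original snippet ---
# The usual next step is to vectorize one of these splits and fit/evaluate a
# classifier imported above (shown with the default scikit-learn tokenizer for
# brevity; the Qalsadi tokenizer is wired in just below).
d_train, y_train, d_test, y_test = labr.get_train_test(klass="2",
                                                       balanced="unbalanced")
vec = CountVectorizer(ngram_range=(1, 1))
X_train = vec.fit_transform(d_train)
X_test = vec.transform(d_test)
clf = MultinomialNB().fit(X_train, y_train)
print("accuracy:", metrics.accuracy_score(y_test, clf.predict(X_test)))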

# tokenizer. This requires the Qalsadi python package.
an = analex.Analex()  # instantiate the morphological analyzer (class name per recent qalsadi releases)