Example 1
# Imports follow the project layout shown in Example 6; the module paths for
# PorterTokenizer and LemmaTokenizer are assumed to mirror BaseTokenizer's.
from uuid import uuid4

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from app.core.main.Classifier import Classifier
from app.core.main.tokenizer.BaseTokenizer import BaseTokenizer
from app.core.main.tokenizer.PorterTokenizer import PorterTokenizer
from app.core.main.tokenizer.LemmaTokenizer import LemmaTokenizer


def create_classifier(config):
    # Translate the uppercase option strings in `config` into the
    # model_configuration expected by Classifier.
    return Classifier(model_configuration={
        # Assumption: the original called id() with no argument, which raises a
        # TypeError; a UUID string is used here as the unique identifier.
        "id": str(uuid4()),
        "type": config["type"],
        "class_weight": None if config["weighting"].lower() == "none" else config["weighting"].lower(),
        "tokenizer": (
            BaseTokenizer() if config["tokenizer"] == "WORD_TOKENIZER"
            else PorterTokenizer() if config["tokenizer"] == "STEMMER"
            else LemmaTokenizer() if config["tokenizer"] == "LEMMATIZER"
            else None
        ),
        "ngram_range": (
            (1, 1) if config["ngrams"] == "UNIGRAM"
            else (2, 2) if config["ngrams"] == "BIGRAM"
            else (1, 2) if config["ngrams"] == "BOTH"
            else None
        ),
        "sublinear_tf": config["tf"] == "SUBLINEAR",
        "smooth_idf": config["df"] == "SMOOTH",
        "penalty": config["penalty"].lower(),
        "multi_class": config["multiclass"].lower(),
        "solver": config["solver"].lower(),
        "dual": config["primal_dual"] == "DUAL",
        "fit_intercept": config["fitIntercept"],
        "max_df": config["max_df"],
        "min_df": config["min_df"],
        "stopwords": ENGLISH_STOP_WORDS if config["stopwords"] == "ENGLISH" else [],
        "C": config["C"],
        "max_iter": config["max_iter"],
    })
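For illustration, a hypothetical config dictionary that create_classifier could consume is sketched below. The keys and the uppercase option values are taken from the checks inside the function and from the configuration in Example 6; the specific combination shown is only an example, not a recommended setting.

# Illustrative only: every value is one of the options the function above
# tests for, or a numeric value mirroring Example 6.
example_config = {
    "type": "LOGISTIC_REGRESSION",
    "weighting": "BALANCED",        # lower-cased to class_weight="balanced"
    "tokenizer": "WORD_TOKENIZER",  # -> BaseTokenizer()
    "ngrams": "UNIGRAM",            # -> ngram_range=(1, 1)
    "tf": "SUBLINEAR",              # -> sublinear_tf=True
    "df": "SMOOTH",                 # -> smooth_idf=True
    "penalty": "L2",
    "multiclass": "OVR",
    "solver": "LIBLINEAR",
    "primal_dual": "DUAL",
    "fitIntercept": True,
    "max_df": 1.,
    "min_df": 0.,
    "stopwords": "ENGLISH",
    "C": 1.,
    "max_iter": 1000,
}

lr = create_classifier(example_config)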
Example 2
# pandas and the project classes are imported explicitly here; the helpers
# datasetToDataframe, defaultFeatures and defaultFeaturizers are project
# utilities that are not shown in these examples.
import pandas as pd

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from app.core.main.Classifier import Classifier
from app.core.main.tokenizer.BaseTokenizer import BaseTokenizer
from app.core.main.tokenizer.PorterTokenizer import PorterTokenizer
from app.core.main.tokenizer.LemmaTokenizer import LemmaTokenizer


def top_rfe_features(labeled_dataset, config, topN=None):
    labeled_inp_df = datasetToDataframe(labeled_dataset)
    features = defaultFeatures(dataset=labeled_dataset)
    featurizers = defaultFeaturizers(features)

    stop_words = ENGLISH_STOP_WORDS if config["stopwords"] == "ENGLISH" else []

    tokenizer = (
        BaseTokenizer() if config["tokenizer"] == "WORD_TOKENIZER"
        else PorterTokenizer() if config["tokenizer"] == "STEMMER"
        else LemmaTokenizer() if config["tokenizer"] == "LEMMATIZER"
        else None
    )

    ngram_range = (
        (1, 1) if config["ngrams"] == "UNIGRAM"
        else (2, 2) if config["ngrams"] == "BIGRAM"
        else (1, 2) if config["ngrams"] == "BOTH"
        else None
    )

    ac = Classifier(model_configuration={
        "type": config["type"],
        "class_weight": config["weighting"].lower(),
        "tokenizer": tokenizer,
        "ngram_range": ngram_range,
        "sublinear_tf": config["tf"] == "SUBLINEAR",
        "smooth_idf": config["df"] == "SMOOTH",
        "penalty": config["penalty"].lower(),
        "multi_class": config["multiclass"].lower(),
        "solver": config["solver"].lower(),
        "dual": config["primal_dual"] == "DUAL",
        "fit_intercept": config["fitIntercept"],
        "max_df": config["max_df"],
        "min_df": config["min_df"],
        "stopwords": stop_words,
        "C": config["C"],
        "max_iter": config["max_iter"]
    })

    # Rank the featurized columns using the classifier's backward feature-selection mode.
    res_df = ac.feature_ranking(input_df=labeled_inp_df, schema=featurizers, mode=Classifier.CC_fs_backward)

    # Column names have the form "<feature>::<suffix>"; sum the scores of all
    # columns that belong to the same base feature.
    feature_names = pd.Series(map(lambda fname: fname.split('::')[0], res_df['Feature']))
    feature_scores = pd.concat([feature_names, res_df['Score']], axis=1)
    feature_scores.columns = ['Feature', 'Score']
    feature_sum_scores = feature_scores.groupby('Feature').sum()
    sorted_features = feature_sum_scores.sort_values(by=["Score"], ascending=False)

    # Keep the topN best-scoring features (all of them when topN is None),
    # preserving the ranking order, and always append the last default feature.
    selected_feature_names = list(sorted_features.index)[:topN]
    selected_features = []
    for fname in selected_feature_names:
        selected_features += [feat for feat in features if feat['name'] == fname]

    return selected_features + [features[-1]]
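A hedged usage sketch: my_labeled_dataset is a stand-in for whatever labeled dataset object the project helpers datasetToDataframe and defaultFeatures expect, and example_config reuses the illustrative configuration sketched after Example 1.

# Hypothetical call: returns the ten highest-ranked feature descriptors,
# plus the last entry of defaultFeatures, which the function always appends.
top_features = top_rfe_features(my_labeled_dataset, example_config, topN=10)
print([feat['name'] for feat in top_features])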
Example 3
from nltk.stem import PorterStemmer

from app.core.main.tokenizer.BaseTokenizer import BaseTokenizer


class PorterTokenizer(object):
    """Tokenizes a document with BaseTokenizer and Porter-stems every token."""

    def __init__(self):
        self.__wnl = PorterStemmer()
        self.__basetokenizer = BaseTokenizer()

    def __call__(self, doc):
        return self.tokenize(doc)

    def tokenize(self, doc):
        return [self.__wnl.stem(t) for t in self.__basetokenizer.tokenize(doc)]

    def __str__(self):
        return '''
        Porter tokenizer based on
        %s
        ''' % self.__basetokenizer
Example 4
from nltk.stem import WordNetLemmatizer

from app.core.main.tokenizer.BaseTokenizer import BaseTokenizer


class LemmaTokenizer(object):
    """Tokenizes a document with BaseTokenizer and lemmatizes every token with WordNet."""

    def __init__(self):
        self.__wnl = WordNetLemmatizer()
        self.__basetokenizer = BaseTokenizer()

    def __call__(self, doc):
        return self.tokenize(doc)

    def tokenize(self, doc):
        return [self.__wnl.lemmatize(t) for t in self.__basetokenizer.tokenize(doc)]

    def __str__(self):
        return '''
        WordNet Lemmatizer based on
        %s
        ''' % self.__basetokenizer
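Both tokenizer classes are callable objects, which is what allows an instance to be passed directly as the "tokenizer" value in the model configurations above. A minimal sketch of the difference between the two, assuming BaseTokenizer simply splits a document into word tokens (its implementation is not shown here) and that the NLTK WordNet data has been downloaded:

doc = "the cats were running"

# Porter stemming truncates tokens to their stems.
print(PorterTokenizer()(doc))   # e.g. ['the', 'cat', 'were', 'run']

# WordNet lemmatization maps tokens to dictionary lemmas (nouns by default),
# so verb forms such as "running" are left unchanged.
print(LemmaTokenizer()(doc))    # e.g. ['the', 'cat', 'were', 'running']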
Example 5
    def __init__(self):
        self.__wnl = PorterStemmer()
        self.__basetokenizer = BaseTokenizer()
Example 6
import random as rd

from app.core.main.classifier.LR import LR
from app.core.main.classifier.LSVC import LSVC
from app.core.main.classifier.Ensemble import Ensemble
from app.core.main.featurizer.Featurizer import Featurizer

from app.core.main.tokenizer.BaseTokenizer import BaseTokenizer
# ENGLISH_STOP_WORDS lives in sklearn.feature_extraction.text in current scikit-learn releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from app.core.main.Classifier import Classifier

lrModelConfiguration = {
    "type": "LOGISTIC_REGRESSION",
    "class_weight": "balanced",
    "tokenizer": BaseTokenizer(),
    "ngram_range": (1, 1),
    "sublinear_tf": True,
    "smooth_idf": True,
    "penalty": "l2",
    "multi_class": "ovr",
    "solver": "liblinear",
    "dual": True,
    "fit_intercept": True,
    'max_df': 1.,
    'min_df': 0.,
    'stopwords': ENGLISH_STOP_WORDS,
    'C': 1.,
    'max_iter': 1000,
}
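Following the constructor calls in Examples 1 and 2, this configuration would presumably be consumed like this:

lrClassifier = Classifier(model_configuration=lrModelConfiguration)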
Example 7
    def __init__(self):
        self.__wnl = WordNetLemmatizer()
        self.__basetokenizer = BaseTokenizer()