Example 1
def test_tfidf_vectorizer():
    '''
    Stop words are words that do not help with classification: they usually have a high
    term frequency (TF) but a very low IDF, so they carry no discriminative power.
    To save space and computation time we hand them to the vectorizer as stop words,
    telling it not to bother computing them.

    TfidfVectorizer
        stop_words     list of stop words
        token_pattern  filtering rule (a regular expression)
    After fit_transform:
        vocabulary_    the vocabulary, as a dict
        idf_           the learned IDF values
        stop_words_    the effective stop-word set
    :return:
    '''
    tfidf_vec = TfidfVectorizer()
    print(tfidf_vec)
    documents = [
        'this is the bayes document',
        'this is the second document',
        'and the third one',
        'is this the document'
    ]
    tfidf_matrix = tfidf_vec.fit_transform(documents)
    print(tfidf_vec.get_feature_names())
    print(tfidf_vec.get_stop_words())
    print(tfidf_vec.get_params())
    print(tfidf_vec.vocabulary_)
    print(tfidf_matrix.toarray())
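
A minimal follow-up sketch (not part of the original test) showing the stop_words and token_pattern options and the fitted idf_ / stop_words_ attributes mentioned in the docstring; it reuses the same document list.

from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
    'this is the bayes document',
    'this is the second document',
    'and the third one',
    'is this the document'
]

# explicit stop-word list and the (default) token pattern that keeps words of 2+ characters
tfidf_vec = TfidfVectorizer(stop_words=['is', 'the'], token_pattern=r'(?u)\b\w\w+\b')
tfidf_matrix = tfidf_vec.fit_transform(documents)

print(tfidf_vec.vocabulary_)   # term -> column index, e.g. {'this': ..., 'bayes': ...}
print(tfidf_vec.idf_)          # one IDF value per term in vocabulary_
print(tfidf_vec.stop_words_)   # terms dropped by max_df / min_df / max_features (empty with the defaults)
print(tfidf_matrix.shape)      # (4 documents, len(vocabulary_) terms)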
Example 2
def document_tfid_parser(documents):
    # So we want to parse one single document

    # It does not work with all documents
    # vectorizer = TfidfVectorizer()
    # X = vectorizer.fit_transform(documents)
    # print(vectorizer.get_feature_names())
    # print(vectorizer.get_params())
    # print(vectorizer.get_stop_words())
    # print(X)
    # print(X.shape)
    sumOfDocuments = []
    for document in documents:
        sumOfDocuments.append(str(document[0]).replace('_',' '))
        # if len(document[0]) > 4:
        #     vectoriser = TfidfVectorizer()
        #     X = vectoriser.fit_transform(document)
        #     print(vectoriser.get_feature_names())
        #     print(vectoriser.get_params())
        #     print(vectoriser.get_stop_words())
        #     print(X.shape)
        #     print(X)
        # else:
        #     pass
    if len(sumOfDocuments) > 4:
        vectoriser = TfidfVectorizer(max_df=0.7)
        X = vectoriser.fit_transform(sumOfDocuments)
        print(vectoriser.get_feature_names())
        print(vectoriser.get_params())
        print(vectoriser.get_stop_words())
        print(X.shape)
        print(X)
    else:
        pass
    def get_data_with_dandelion(self, relevance_threshold=0.75, min_df=2,
                              gamma=0.89, filter=False):
        only_text, ent, data = self.get_data_with_abstract_2(relevance_threshold)
        entities_sparse = sparse.csr_matrix(ent)

        tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                           max_features=200000,
                                           min_df=min_df,
                                           stop_words='english',
                                           strip_accents='unicode',
                                           use_idf=True,
                                           ngram_range=(1, 1),
                                           norm='l2',
                                           tokenizer=TextUtils.tokenize_and_stem)

        tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)

        print('tfidf matrix dimension: %s x %s' % (tfidf_matrix.shape[0],
                                                   tfidf_matrix.shape[1]))
        print('entities matrix dimension: %s x %s' % (entities_sparse.shape[0],
                                                      entities_sparse.shape[1]))
        print('non zero elements in entities matrix: %s'
              % len(entities_sparse.data))

        '''print tfidf_matrix[tfidf_matrix > 0].mean()
        print tfidf_matrix[tfidf_matrix > 0].max()

        print entities_sparse[entities_sparse > 0].mean()
        print entities_sparse[entities_sparse > 0].max()
        print '#' * 80'''
        #print 'after balancing'

        # weight the two feature blocks: the tf-idf block keeps weight 1,
        # while the entity block is scaled by (1 - gamma) before the blocks are stacked
        tfidf_matrix = tfidf_matrix * 1
        entities_sparse = entities_sparse * (1 - gamma)

        #print tfidf_matrix[tfidf_matrix > 0].mean()
        #print tfidf_matrix[tfidf_matrix > 0].max()

        #print entities_sparse[entities_sparse > 0].mean()
        #print entities_sparse[entities_sparse > 0].max()

        f_score_dict = self.labels_dict(data)
        params = tfidf_vectorizer.get_params()
        params['dandelion_entities'] = entities_sparse.shape[1]
        params['original_terms'] = tfidf_matrix.shape[0]
        params['gamma'] = gamma
        params['relevance_threshold'] = relevance_threshold
        params['classes'] = len(f_score_dict)
        params['tokenizer'] = 'TextUtils.tokenize_and_stem'
        del params['dtype']

        params['avg_nnz_row'] = (entities_sparse > 0).sum(1).mean()

        return sparse.hstack([tfidf_matrix, entities_sparse]), f_score_dict, params
class WrappedVectorizer:
    def __init__(self, sanitizer=None, sg_only=False, *args, **kwargs):
        self.sg_only = sg_only
        self.sanitizer = sanitizer
        self.vectorizer = TfidfVectorizer(*args, **kwargs)

    def fit(self, data, labels=None):
        if self.sg_only:
            if labels is None:
                raise Exception('fit: Labels cannot be None if sg_only=True')
            else:
                data = np.array(data)[np.array(labels) == 4]
                # print("fitting using %d data" % len(data))
        if self.sanitizer is not None:
            data = self.sanitizer(data)

        self.vectorizer.fit(data)

    def transform(self, data):
        if self.sanitizer is not None: data = self.sanitizer(data)
        return self.vectorizer.transform(data)

    def fit_transform(self, data, labels):
        self.fit(data, labels)
        return self.transform(data)

    def set_params(self, **parameters):
        # treat our params
        for key in ['sg_only', 'sanitizer']:
            if key in parameters:
                setattr(self, key, parameters[key])
                del parameters[key]
        # forward the remaining to the scikit vectorizer
        self.vectorizer.set_params(**parameters)
        # don't forget to return self
        # see https://stackoverflow.com/questions/28124366/can-gridsearchcv-be-used-with-a-custom-classifier
        return self

    def get_params(self, deep=True):
        if deep:
            return dict(**dict(sg_only=self.sg_only, sanitizer=self.sanitizer),
                        **self.vectorizer.get_params())
        else:
            return dict(sg_only=self.sg_only, sanitizer=self.sanitizer)

    def __repr__(self):
        return "WrappedVectorizer(%s)" % ", ".join(
            ["%s=%r" % t for t in self.get_params().items()])
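
A hedged usage sketch (not from the original source) of how this wrapper might sit inside a scikit-learn Pipeline and GridSearchCV; it relies on get_params/set_params above exposing both the wrapper's own options and the inner TfidfVectorizer parameters. The sanitizer, texts, and labels below are made up for illustration.

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

def lowercase_sanitizer(docs):
    # hypothetical sanitizer: just lower-cases every document
    return [d.lower() for d in docs]

texts = ["Good movie", "Bad movie", "Great plot", "Terrible acting"]
labels = [4, 0, 4, 0]   # 4 is the class that sg_only=True would filter on

pipe = Pipeline([
    ("vec", WrappedVectorizer(sanitizer=lowercase_sanitizer)),
    ("clf", LinearSVC()),
])

# inner TfidfVectorizer parameters are reachable through the wrapper's set_params
grid = GridSearchCV(pipe, {"vec__ngram_range": [(1, 1), (1, 2)]}, cv=2)
grid.fit(texts, labels)
print(grid.best_params_)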
Example 5
class ColNormedTfidf(TransformerMixin):
    """
    Model that derives tf-idf reweighted representations of utterances,
    which are normalized by column. Can be used in ConvoKit through the `ColNormedTfidfTransformer` transformer; see documentation of that transformer for further details.
    """
    def __init__(self, **kwargs):
        if 'token_pattern' in kwargs:
            self.tfidf_model = TfidfVectorizer(**kwargs)
        else:
            self.tfidf_model = TfidfVectorizer(token_pattern=r'(?u)(\S+)',
                                               **kwargs)

    def fit(self, X, y=None):
        tfidf_vects_raw = self.tfidf_model.fit_transform(X)
        self.col_norms = sparse.linalg.norm(tfidf_vects_raw, axis=0)
        return self

    def transform(self, X):
        tfidf_vects_raw = self.tfidf_model.transform(X)
        tfidf_vect = tfidf_vects_raw / self.col_norms
        return tfidf_vect

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)

    def get_feature_names(self):
        return self.tfidf_model.get_feature_names()

    def get_params(self, deep=True):
        return self.tfidf_model.get_params(deep=deep)

    def set_params(self, **params):
        return self.tfidf_model.set_params(**params)

    def load(self, dirname):
        self.tfidf_model = joblib.load(
            os.path.join(dirname, 'tfidf_model.joblib'))
        self.col_norms = np.load(os.path.join(dirname, 'tfidf_col_norms.npy'))

    def dump(self, dirname):
        try:
            os.mkdir(dirname)
        except FileExistsError:
            pass
        np.save(os.path.join(dirname, 'tfidf_col_norms.npy'), self.col_norms)
        joblib.dump(self.tfidf_model,
                    os.path.join(dirname, 'tfidf_model.joblib'))
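
A small hedged usage sketch (assumed, not from ConvoKit itself): fit the column-normalized model on a few utterances, inspect the output, and round-trip it through dump/load. The directory name is made up.

utterances = ["how are you", "i am fine thanks", "are you fine"]

model = ColNormedTfidf()
vects = model.fit_transform(utterances)      # tf-idf rows, each column divided by its norm
print(vects.shape, len(model.get_feature_names()))

model.dump('colnormed_tfidf_model')          # writes tfidf_model.joblib and tfidf_col_norms.npy
restored = ColNormedTfidf()
restored.load('colnormed_tfidf_model')
print(restored.transform(["how fine are you"]).shape)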
Example 6
def create_model(x_train, y_train, x_test, y_test):
    """ Create a trained model using the best parameters. """

    print("\nCREATING FINAL MODEL...")

    vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.9)
    print("vectorizer params:", vectorizer.get_params())

    linear_svc = svm.LinearSVC(C=1.0, dual=True, loss="hinge", penalty="l2")
    print("linear svc params", linear_svc.get_params())

    linear_svc_pipeline = Pipeline(
        steps=[("vectorizer", vectorizer), ("linear_svc", linear_svc)])

    print("\nTRAINING FINAL MODEL...")
    linear_svc_pipeline.fit(x_train, y_train)

    print("\nPICKLING MODEL...")
    list_pickle = open("final_model/trained_linear_svc.pkl", "wb")
    pickle.dump(linear_svc_pipeline, list_pickle)
    list_pickle.close()

    print("\nUNPICKLING MODEL...")
    list_unpickle = open("final_model/trained_linear_svc.pkl", "rb")
    model = pickle.load(list_unpickle)
    list_unpickle.close()

    print("Detailed classification report for final model:")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    y_true, y_pred = y_test, model.predict(x_test)

    print(model.score(x_test, y_test))
    print(model.get_params())

    print(
        classification_report(y_true,
                              y_pred,
                              target_names=["negative", "neutral",
                                            "positive"]))
    print()

    print("Confusion matrix for final model:")
    print(confusion_matrix(y_true, y_pred))
    print()
    print()
def tfidf(eventgroup_id, use_full, use_glove, overwrite):
    fname = f'data/representations/representation_tf-idf_{eventgroup_id}.pkl'
    if use_full:
        fname = f'data/representations/representation_tf-idf_{eventgroup_id}_full.pkl'

    path = Path(fname)
    if path.exists() and not overwrite:
        logger.info(f"file {path.as_posix()} exists")
        return

    logger.info(f"loading documents (full={use_full})")
    docs = docs_cache.get(eventgroup_id, use_full)
    total_docs = len(docs)

    token_sets = []
    doc_ids = []
    for doc_id, texts in tqdm(docs.items(),
                              total=total_docs,
                              desc="tokenizing docs"):
        doc = []

        for token in tokenizer(' '.join([d.text for d in texts])):
            doc.append(token)

        if doc:
            token_sets.append(doc)
            doc_ids.append(doc_id)

    logger.info("applying tf-idf")
    vectorizer = TfidfVectorizer(tokenizer=identity,
                                 preprocessor=identity,
                                 dtype=np.float32)

    m = vectorizer.fit_transform(token_sets)

    logger.info("saving matrix")
    params = vectorizer.get_params()
    params.pop('preprocessor')
    params.pop('tokenizer')
    params.pop('dtype')
    params['name'] = 'tf-idf'

    joblib.dump((m, doc_ids, params), fname)
Example 8
class TfIdfEncoder(Preprocessor):
    """Wrapper around tf-idf providing Preprocessor interface."""

    def __init__(self, params=None):
        """Initialize TfIdfTermEncoder."""
        if params is None:
            params = {}
        self.model = TfidfVectorizer(**params)

    def info(self):
        """Get model info."""
        return self.model.get_params()

    def fit(self, data):
        """Fit the model."""
        self.model.fit(data)

    def transform(self, data):
        """Transform the input data."""
        return self.model.transform(data).toarray()
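
A brief hedged usage sketch (not in the original) of the Preprocessor-style interface above; the parameter dict and documents are illustrative.

encoder = TfIdfEncoder({'ngram_range': (1, 2), 'min_df': 1})
encoder.fit(["spam spam eggs", "eggs and ham", "green eggs"])
dense = encoder.transform(["spam and eggs"])
print(dense.shape)     # (1, vocabulary size), dense ndarray
print(encoder.info())  # parameters of the underlying TfidfVectorizer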
    def get_data_only_with_abstract(self, relevance_threshold=0.75, min_df=0.01,
                              gamma=0.89, filter=False):
        only_text, ent, data = self.get_data_with_abstract_2(relevance_threshold)
        tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                           max_features=200000,
                                           min_df=min_df,
                                           stop_words='english',
                                           strip_accents='unicode',
                                           use_idf=True,
                                           ngram_range=(1, 1),
                                           norm='l2',
                                           tokenizer=TextUtils.tokenize_and_stem)

        tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)
        f_score_dict = self.labels_dict(data)
        params = tfidf_vectorizer.get_params()
        params['original_terms'] = tfidf_matrix.shape[0]
        params['gamma'] = gamma
        params['relevance_threshold'] = relevance_threshold
        params['classes'] = len(f_score_dict)
        params['tokenizer'] = 'TextUtils.tokenize_and_stem'

        return tfidf_matrix, f_score_dict, params
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

corpus[3]

tfidf_v= TfidfVectorizer(max_features=2500, ngram_range=(1,3))
X =tfidf_v.fit_transform(corpus).toarray()

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

tfidf_v.get_feature_names()[1:20]
tfidf_v.get_params()

data = pd.DataFrame(X_train,columns = tfidf_v.get_feature_names())

import numpy as np
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    See full source and example: 
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
Example 11
        
    #20 newsgroups (part of sklearn)
    print("loading 20 newsgroups dataset...")
    tic = time()
    dataset = fetch_20newsgroups(shuffle=True, random_state=0, remove=('headers','footers','quotes'))
    train_corpus = dataset.data  # a list of 11314 documents / entries
    toc = time()
    print("elapsed time: %.4f sec" % (toc - tic))
    
    #compute tf-idf (equivalent to count-vectorizer followed by tf-idf transformer)
    #count-vectorizer produces the term-document matrix; tf-idf scales the tf counts by log N/nt
    #(N: number of docs, nt: number of docs in which a word occurs)
    #(the exact smoothed formula scikit-learn uses is checked in the short sketch after this snippet)
    #min_df/max_df as floats are proportions of docs (min_df < nt/N < max_df); as ints they are raw counts, e.g. min_df = 2
    tfidf = TfidfVectorizer(max_features = num_features, max_df=0.95, min_df=2, stop_words = 'english')
    print("tfidf parameters:")
    print(tfidf.get_params())
        
    #generate tf-idf term-document matrix
    A_tfidf_sp = tfidf.fit_transform(train_corpus)  #size D x V
    
    print("number of docs: %d" % A_tfidf_sp.shape[0])
    print("dictionary size: %d" % A_tfidf_sp.shape[1])

    #tf-idf dictionary    
    tfidf_dict = tfidf.get_feature_names()
             
    #fit LDA model
    print("Fitting LDA model...")
    lda_vb = LatentDirichletAllocation(n_components=num_topics, max_iter=10, learning_method='online', batch_size=512, random_state=0, n_jobs=-1)

    tic = time()
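
The comments above describe idf loosely as log N/nt; here is a short hedged check (not part of the original script) of the formula scikit-learn actually applies with its default smooth_idf=True, namely idf(t) = ln((1 + N) / (1 + df(t))) + 1.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["apple banana", "banana cherry", "banana banana date"]
vec = TfidfVectorizer()              # defaults: smooth_idf=True, sublinear_tf=False, norm='l2'
vec.fit(docs)

N = len(docs)
df = np.array([sum(1 for d in docs if t in d.split()) for t in vec.get_feature_names()])
manual_idf = np.log((1 + N) / (1 + df)) + 1

print(vec.idf_)       # idf values learned by the vectorizer
print(manual_idf)     # the same values reproduced from the smoothed formula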
# In[25]:


y_test.shape


# In[25]:


tv.get_feature_names()[:20]  # Top 20 feature names for this data set, which shows 2-word and 3-word combinations together #


# In[26]:


tv.get_params()  # will give details for the count vectorizer applied # 


# In[28]:


# Data set after applying the count vectorizer #

df_count = pd.DataFrame(X,columns=tv.get_feature_names())
df_count.head(10)


# In[29]:


# Applying the Multinomial NB algorithm #
Example 13
                                 random_state=0,
                                 remove=('headers', 'footers', 'quotes'))
    train_corpus = dataset.data  # a list of 11314 documents / entries
    toc = time()
    print("elapsed time: %.4f sec" % (toc - tic))

    #compute tf-idf (equivalent to count-vectorizer followed by tf-idf transformer)
    #count-vectorizer produces the term-document matrix; tf-idf scales the tf counts by log N/nt
    #(N: number of docs, nt: number of docs in which a word occurs)
    #min_df/max_df as floats are proportions of docs (min_df < nt/N < max_df); as ints they are raw counts, e.g. min_df = 2
    tfidf = TfidfVectorizer(max_features=num_features,
                            max_df=0.95,
                            min_df=2,
                            stop_words='english')
    print("tfidf parameters:")
    print(tfidf.get_params())

    #generate tf-idf term-document matrix
    A_tfidf_sp = tfidf.fit_transform(train_corpus)  #size D x V

    print("number of docs: %d" % A_tfidf_sp.shape[0])
    print("dictionary size: %d" % A_tfidf_sp.shape[1])

    #tf-idf dictionary
    tfidf_dict = tfidf.get_feature_names()

    #fit LDA model
    print("Fitting LDA model...")
    lda_vb = LatentDirichletAllocation(n_components=num_topics,
                                       max_iter=10,
                                       learning_method='online',
Example 14
import importlib
import inspect
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from tinydb import TinyDB

my_module = importlib.import_module('sklearn.feature_extraction.text')
my_class = getattr(my_module, 'CountVectorizer')
cv = my_class()
print("Module name: {}".format(type(cv).__name__))
models = [MultinomialNB(), RandomForestClassifier(), SVC(), XGBClassifier()]
params = {}
tfidf = TfidfVectorizer()
tfidf_params = {type(tfidf).__name__ + '__' + k:v for k,v in tfidf.get_params().items() if not inspect.isclass(v)}

def dumper(obj):
    try: 
        return obj.toJSON()
    except:
        return
   
v0 = TinyDB('v0.json')
if len(v0.tables()) > 0:
    v0.purge_tables()
for model in models:
    table = v0.table(type(model).__name__)
    params = {}
    params['model_params'] = {}
    params['model_params'] = {type(model).__name__ + '__' + k:v for k,v in model.get_params().items() if not inspect.isclass(v)}
Example 15
# generate dict{'word': score} with tf-idf
corpus = [
    '無料 ごはん おかず ごはん おかず ディナー クーポン クーポン 食事',
    '無料 ごはん おかず ごはん おかず ランチ  クーポン クーポン 食事 昼飯', ''
]

# tf : computed within a single document (corpus[i])
# idf: computed across all documents (the whole corpus)
# In other words, if each element of corpus holds one user's utterances,
# then all documents together are all users' utterances,
# and terms that every user shares end up with a low score.

tfidf_vectorizer = TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b',
                                   max_features=3000)
feature_matrix = tfidf_vectorizer.fit_transform(corpus)
print('feature_matrix >> ')
print(feature_matrix)
feature_matrix_arr = feature_matrix.toarray()
print('feature_matrix_arr >> ')
print(feature_matrix_arr)

print('tfidf_vectorizer.vocabulary_ >> ')
print(tfidf_vectorizer.vocabulary_)

print('tfidf_vectorizer.get_feature_names() >> ')
print(tfidf_vectorizer.get_feature_names())

print('tfidf_vectorizer.get_params() >> ')
print(tfidf_vectorizer.get_params())
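
The comment at the top of this example promises a dict of {'word': score}, but the script stops at printing the matrix. Here is a hedged sketch (assumed, not from the source) of one way to build that dict for the first document from vocabulary_ and the dense matrix; sorting it also shows how terms shared across documents are pushed down.

# build {'word': tf-idf score} for the first document (corpus[0])
word_scores = {
    word: feature_matrix_arr[0][idx]
    for word, idx in tfidf_vectorizer.vocabulary_.items()
}
print(sorted(word_scores.items(), key=lambda kv: kv[1], reverse=True))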
class FeatureBased(object):
    def __init__(self,
                 train_path=None,
                 valid_path=None,
                 test_path=None,
                 model='multinomial',
                 feat='bow'):

        self.train_path = train_path

        self.valid_path = valid_path

        self.test_path = test_path

        logger.info('train_path: %s' % train_path)
        logger.info('valid_path: %s' % valid_path)
        logger.info('test_path: %s' % test_path)

        # define the model
        if model == 'multinomial':
            self.model = naive_bayes.MultinomialNB()

            self.model_name = 'multinomial'

        elif model == 'linreg':

            self.model = linear_model.LogisticRegression(
                random_state=1234,
                solver='lbfgs',
                multi_class='multinomial',
                penalty='l2')

            self.model_name = 'linear_model.LogisticRegression'

        elif model == 'linsvm':

            self.model = svm.LinearSVC(random_state=1234,
                                       tol=1e-5,
                                       penalty='l2')

            self.model_name = 'svm.LinearSVC'
        else:

            raise NotImplementedError()

        if feat == 'bow':

            self.vectorizer = CountVectorizer(tokenizer=self.tokenizeText,
                                              ngram_range=(1, 1))

            self.feat_name = 'bag_of_words'

        elif feat == 'uni-bi-gram':

            self.vectorizer = CountVectorizer(tokenizer=self.tokenizeText,
                                              ngram_range=(1, 2))

            self.feat_name = 'uni_bi_grams'

        elif feat == 'tf':

            self.vectorizer = TfidfVectorizer(use_idf=False)

            self.feat_name = 'tf'

        elif feat == 'tfidf':

            self.vectorizer = TfidfVectorizer()

            self.feat_name = 'tfidf'

        else:

            raise NotImplementedError()

        logger.info('feat name: %s' % self.feat_name)

        logger.info('feat vectorizer parameters: %s' %
                    self.vectorizer.get_params())

        logger.info('model name: %s' % self.model_name)

        logger.info('model parameters: %s' % self.model.get_params())

    def prepare_data(self):

        train_x, train_y = self.load_data(self.train_path)
        logger.info('train data is loaded. #samples: %d, #labels:%d' %
                    (len(train_x), len(train_y)))
        train_feat_vecs = self.text_to_features(train_x, is_trainset=True)
        logger.info('train_feat_vecs: %s' % str(train_feat_vecs.shape))
        self.features = self.vectorizer.vocabulary_  # voc={word:id (feature_index)}
        logger.info('number of features: %d' % len(self.features))
        train_label = self.text_to_label(train_y)
        self.train_data = (train_feat_vecs, train_label)

        valid_x, valid_y = self.load_data(self.valid_path)
        logger.info('valid data is loaded. #samples: %d, #labels:%d' %
                    (len(valid_x), len(valid_y)))
        valid_feat_vecs = self.text_to_features(valid_x)
        logger.info('valid_feat_vecs: %s' % str(valid_feat_vecs.shape))
        valid_label = self.text_to_label(valid_y)
        self.valid_data = (valid_feat_vecs, valid_label)

        test_x, test_y = self.load_data(self.test_path)
        logger.info('test data is loaded. #samples: %d, #labels:%d' %
                    (len(test_x), len(test_y)))
        test_feat_vecs = self.text_to_features(test_x)
        logger.info('test_feat_vecs: %s' % str(test_feat_vecs.shape))
        test_label = self.text_to_label(test_y)
        self.test_data = (test_feat_vecs, test_label)

    def load_data(self, data_path):

        with open(data_path, 'r') as f:

            lines = f.read().strip().split('\n')

        data_x = []

        data_y = []

        for line in lines:

            if len(line) > 0:

                act, utt = line.split(' ', 1)

                act = act.strip()

                utt = utt.strip()

                data_y.append(act)

                data_x.append(utt)

        return (data_x, data_y)

    def train(self):
        '''
		train the model on the training data
		'''
        self.model = self.model.fit(self.train_data[0], self.train_data[1])

    def eval(self):
        '''
		evaluate the model on the test data
		'''
        train_pred = self.model.predict(self.train_data[0])

        train_acc, train_f1 = self.metric(pred=train_pred,
                                          gold=self.train_data[1])

        valid_pred = self.model.predict(self.valid_data[0])

        valid_acc, valid_f1 = self.metric(pred=valid_pred,
                                          gold=self.valid_data[1])

        test_pred = self.model.predict(self.test_data[0])

        test_acc, test_f1 = self.metric(pred=test_pred, gold=self.test_data[1])

        logger.info(
            'train: (acc = %.2f%%, f1 = %.2f), valid: (acc = %.2f%%, f1 = %.2f) test: (acc = %.2f%%, f1 = %.2f)'
            % (train_acc, train_f1, valid_acc, valid_f1, test_acc, test_f1))

    def metric(self, pred, gold):

        acc = metrics.accuracy_score(gold, pred) * 100

        f1 = metrics.f1_score(gold, pred, average='weighted') * 100

        return acc, f1

    def predict(self, list_texts):

        feat_vecs = self.text_to_features(list_texts)

        label_pred = self.model.predict(feat_vecs)

        labels = {1: 'inform', 2: 'question', 3: 'directive', 4: 'commissive'}

        label_pred_string = [labels[l + 1] for l in label_pred]

        return label_pred, label_pred_string

    def text_to_label(self, data_y):

        labels = [int(label) - 1 for label in data_y]

        return labels

    def tokenizeText(self, sample):

        tokens = sample.split(' ')

        tokens = [token.lower().strip() for token in tokens if len(token) > 0]

        return tokens

    def text_to_features(self, data_x, is_trainset=False):
        '''
			data: is a list of texts  
		'''
        feature_vectors = []

        # bag of words
        if is_trainset:

            feature_vectors = self.vectorizer.fit_transform(data_x)

        else:

            feature_vectors = self.vectorizer.transform(data_x)

        return feature_vectors

    def save(self, model_path):

        model_path = model_path + '_' + self.model_name + '_' + self.feat_name

        model_vect_path = model_path + '_vectorizer'

        model_path += '.mdl'

        model_vect_path += '.mdl'

        with open(model_path, 'wb') as file:

            pickle.dump(self.model, file)

        with open(model_vect_path, 'wb') as file:

            pickle.dump(self.vectorizer, file)

        logger.info('model saved: %s' % model_path)
        logger.info('vectorizer saved: %s' % model_vect_path)

    def load(self, model_path):

        model_path = model_path + '_' + self.model_name + '_' + self.feat_name

        model_vect_path = model_path + '_vectorizer'

        model_path += '.mdl'

        model_vect_path += '.mdl'

        with open(model_path, 'rb') as file:

            self.model = pickle.load(file)

        with open(model_vect_path, 'rb') as file:

            self.vectorizer = pickle.load(file)

        logger.info('model loaded: %s' % model_path)
        logger.info('vectorizer loaded: %s' % model_vect_path)
Example 17
def extract_features_age(docs_train, docs_val, docs_test, lsa=True):
    """Extract features

    This is basically a duplicate of the *extract_features_gender()* function, except that it does not use the
    PAN18AP test corpus as a second test set.
    """

    # Build a vectorizer that splits strings into sequences of 1 to 3 words
    word_vectorizer = TfidfVectorizer(preprocessor=None,
                                      analyzer='word',
                                      ngram_range=(1, 3),
                                      max_features=10**5,
                                      min_df=2,
                                      use_idf=True,
                                      sublinear_tf=True)
    # Build a vectorizer that splits strings into sequences of 3 to 5 characters
    char_vectorizer = TfidfVectorizer(preprocessor=None,
                                      analyzer='char',
                                      ngram_range=(3, 5),
                                      max_features=10**5,
                                      min_df=2,
                                      use_idf=True,
                                      sublinear_tf=True)

    # Log the parameters of the word and character vectorizers
    logger.info('word_vectorizer: %s', word_vectorizer.get_params())
    logger.info('char_vectorizer: %s', char_vectorizer.get_params())

    # Build a transformer (vectorizer) pipeline using the previous analyzers
    # *FeatureUnion* concatenates results of multiple transformer objects
    ngrams_vectorizer = Pipeline([
        ('feats',
         FeatureUnion([('word_ngram', word_vectorizer),
                       ('char_ngram', char_vectorizer)]))
    ])

    # Fit (learn vocabulary and IDF) and transform (transform documents to the TF-IDF matrix) the training set
    x_train_ngrams_tfidf = ngrams_vectorizer.fit_transform(docs_train)
    '''
    ↳ Check the following attributes of each of the transformers (analyzers)—*word_vectorizer* and *char_vectorizer*:
    vocabulary_ : dict. A mapping of terms to feature indices.
    stop_words_ : set. Terms that were ignored
    '''
    logger.info(
        '@ %.2f seconds: Finished fit_transforming the training dataset',
        time.process_time())

    feature_names_ngrams = [
        word_vectorizer.vocabulary_, char_vectorizer.vocabulary_
    ]
    logger.info('Size of vocabulary: %s words | %s characters',
                format(len(word_vectorizer.vocabulary_), ',d'),
                format(len(char_vectorizer.vocabulary_), ',d'))

    # Vectorize each validation/test set
    # Extract the features of the validation/test sets (transform test documents to the TF-IDF matrix)
    # Only transform is called on the transformer (vectorizer), because it has already been fit to the training set.
    x_val_ngrams_tfidf = ngrams_vectorizer.transform(docs_val)
    logger.info('@ %.2f seconds: Finished transforming the validation set',
                time.process_time())
    x_test_ngrams_tfidf = ngrams_vectorizer.transform(docs_test)
    logger.info('@ %.2f seconds: Finished transforming the test set',
                time.process_time())

    logger.info(
        'Word & character ngrams .shape = {training: %s | validation: %s | test: %s}',
        x_train_ngrams_tfidf.shape, x_val_ngrams_tfidf.shape,
        x_test_ngrams_tfidf.shape)

    # • Dimensionality reduction using truncated SVD (aka LSA)
    if lsa:
        # Build a truncated SVD (LSA) transformer object
        svd = TruncatedSVD(n_components=300, random_state=43)
        # Fit the LSA model and perform dimensionality reduction on the training set
        x_train_ngrams_tfidf_reduced = svd.fit_transform(x_train_ngrams_tfidf)
        logger.info(
            '@ %.2f seconds: Finished dimensionality reduction (LSA) on the training set',
            time.process_time())
        # Perform dimensionality reduction on the validation and test sets
        # Note that the SVD (LSA) transformer is already fit on the training set
        x_val_ngrams_tfidf_reduced = svd.transform(x_val_ngrams_tfidf)
        logger.info(
            '@ %.2f seconds: Finished dimensionality reduction (LSA) on the validation set',
            time.process_time())
        x_test_ngrams_tfidf_reduced = svd.transform(x_test_ngrams_tfidf)
        logger.info(
            '@ %.2f seconds: Finished dimensionality reduction (LSA) on the test set',
            time.process_time())

        x_train = x_train_ngrams_tfidf_reduced
        x_val = x_val_ngrams_tfidf_reduced
        x_test = x_test_ngrams_tfidf_reduced
    else:
        x_train = x_train_ngrams_tfidf
        x_val = x_val_ngrams_tfidf
        x_test = x_test_ngrams_tfidf

    return x_train, x_val, x_test, feature_names_ngrams
    def get_data_fabio(self, gamma=0.89, rank_metric='r'):
        data = self.mongo.get_all(order_by='id_doc')

        data = [doc for doc in data]
        only_text = [doc['text'] for doc in data]

        entitySet = set()
        for d in data:
            if 'isa' in d:
                for e in d['isa']:
                    entitySet.add(e['entity'])

        current = np.zeros((len(data), len(entitySet)), dtype=np.float64)
        count = 0
        invIndex = {}
        countFeatures = 0
        for i,d in enumerate(data):
            if 'isa' in d:
                for f in d['isa']:
                    if f['entity'] not in invIndex:
                       invIndex[f['entity']] = countFeatures
                       countFeatures += 1
                    current[count, invIndex[f['entity']]] = f[rank_metric]
            count += 1
        current = np.nan_to_num(current)
        current_sparse = sparse.csr_matrix(current)

        tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                           max_features=200000,
                                           min_df=2,
                                           stop_words='english',
                                           strip_accents='unicode',
                                           use_idf=True,
                                           ngram_range=(1, 1),
                                           norm='l2',
                                           tokenizer=TextUtils.tokenize_and_stem)

        tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)

        print('tfidf matrix dimension: %s x %s' % (tfidf_matrix.shape[0],
                                                   tfidf_matrix.shape[1]))
        print('entities matrix dimension: %s x %s' % (current_sparse.shape[0],
                                                      current_sparse.shape[1]))
        print('non zero elements in entities matrix: %s'
              % len(current_sparse.data))

        tfidf_matrix = tfidf_matrix * 1
        entities_sparse = current_sparse * (1 - gamma)

        f_score_dict = self.labels_dict(data)
        params = tfidf_vectorizer.get_params()
        params['dandelion_entities'] = entities_sparse.shape[1]
        params['original_terms'] = tfidf_matrix.shape[0]
        params['gamma'] = gamma
        params['rank_metric'] = rank_metric
        params['classes'] = len(f_score_dict)
        params['tokenizer'] = 'TextUtils.tokenize_and_stem'
        del params['dtype']

        params['avg_nnz_row'] = (entities_sparse > 0).sum(1).mean()

        return sparse.hstack([tfidf_matrix, entities_sparse]), f_score_dict,\
               params
Example 19
class LDA(GenericModel):
    def __init__(self, **kwargs):
        self._corpus_matrix = None
        self._query_vector = None

        self.vectorizer = None
        self.lda_model = LatentDirichletAllocation(n_jobs=-1)

        super().__init__()

        self.similarity_measure = None
        self.set_basic_params(**kwargs)

        self.set_vectorizer(**kwargs)
        self.set_lda_model(**kwargs)

    def set_name(self, name):
        super().set_name(name)

    def set_model_gen_name(self, gen_name):
        super().set_model_gen_name(gen_name)

    def set_basic_params(self, **kwargs):
        self.set_name('LDA' if LDA_Model_Hyperp.NAME.value not in
                      kwargs.keys() else kwargs[LDA_Model_Hyperp.NAME.value])
        self.set_model_gen_name('lda')
        self.set_similarity_measure(
            sm.SimilarityMeasure.COSINE if LDA_Model_Hyperp.SIMILARITY_MEASURE.
            value not in kwargs.keys() else kwargs[LDA_Model_Hyperp.
                                                   SIMILARITY_MEASURE.value])

    def set_similarity_measure(self, sim_measure):
        self.similarity_measure = sim_measure

    def set_vectorizer(self, **kwargs):
        self.vectorizer = TfidfVectorizer(
            stop_words='english', use_idf=True, smooth_idf=True
        ) if LDA_Model_Hyperp.VECTORIZER.value not in kwargs.keys(
        ) else kwargs[LDA_Model_Hyperp.VECTORIZER.value]
        vec_params = {
            key.split('__')[2]: kwargs[key]
            for key, val in kwargs.items() if '__vectorizer__' in key
        }
        self.vectorizer.set_params(**vec_params)

    def set_lda_model(self, **kwargs):
        lda_model_params = {
            key.split('__')[2]: kwargs[key]
            for key, val in kwargs.items() if '__lda_model__' in key
        }
        self.lda_model.set_params(**lda_model_params)

    def recover_links(self, corpus, query, test_cases_names,
                      bug_reports_names):
        self._corpus_matrix = self.vectorizer.fit_transform(corpus)
        self._query_vector = self.vectorizer.transform(query)

        self.out_1 = self.lda_model.fit_transform(self._corpus_matrix)
        self.out_2 = self.lda_model.transform(self._query_vector)

        metric = self.similarity_measure
        if metric == sm.SimilarityMeasure.COSINE:
            self._sim_matrix = pairwise.cosine_similarity(X=self.out_1,
                                                          Y=self.out_2)
        elif metric == sm.SimilarityMeasure.JSD:
            self._sim_matrix = pairwise_distances(X=self.out_1,
                                                  Y=self.out_2,
                                                  metric=SimilarityMeasure.jsd)
        elif metric == sm.SimilarityMeasure.EUCLIDIAN_DISTANCE:
            self._sim_matrix = pairwise_distances(X=self.out_1,
                                                  Y=self.out_2,
                                                  metric='euclidean')

        #self._sim_matrix =  super().normalize_sim_matrix(self._sim_matrix)
        self._sim_matrix = pd.DataFrame(data=self._sim_matrix,
                                        index=test_cases_names,
                                        columns=bug_reports_names)

        self._record_docs_feats(corpus, query, test_cases_names,
                                bug_reports_names)

    def _record_docs_feats(self, corpus, query, test_cases_names,
                           bug_reports_names):
        self.mrw_tcs = self._recover_mrw_list(test_cases_names, corpus)
        self.mrw_brs = self._recover_mrw_list(bug_reports_names, query)

        self.dl_tcs = self._recover_dl_list(test_cases_names, corpus)
        self.dl_brs = self._recover_dl_list(bug_reports_names, query)

        index = list(test_cases_names) + list(bug_reports_names)
        self.docs_feats_df = pd.DataFrame(index=index, columns=['mrw', 'dl'])

        for tc_name, mrw in self.mrw_tcs:
            self.docs_feats_df.at[tc_name, 'mrw'] = mrw

        for tc_name, dl in self.dl_tcs:
            self.docs_feats_df.at[tc_name, 'dl'] = dl

        for br_name, mrw in self.mrw_brs:
            self.docs_feats_df.at[br_name, 'mrw'] = mrw

        for br_name, dl in self.dl_brs:
            self.docs_feats_df.at[br_name, 'dl'] = dl

    def _recover_dl_list(self, artf_names, artf_descs):
        tokenizer = PorterStemmerBased_Tokenizer()
        dl_list = []
        for artf_name, artf_desc in zip(artf_names, artf_descs):
            dl_list.append((artf_name, len(tokenizer.__call__(artf_desc))))
        return dl_list

    def _recover_mrw_list(self, artf_names, artf_descs):
        N_REL_WORDS = 6
        mrw_list = []  # list of tuples (artf_name, mrw_list={})

        for artf_name, artf_desc in zip(artf_names, artf_descs):
            X = self.vectorizer.transform([artf_desc])
            df1 = pd.DataFrame(X.T.toarray())
            df1['token'] = self.vectorizer.get_feature_names()
            df1.sort_values(by=0, ascending=False, inplace=True)
            mrw = list(df1.iloc[0:N_REL_WORDS, 1].values)
            mrw_list.append((artf_name, mrw))

        return mrw_list

    def model_setup(self):
        return {
            "Setup": [{
                "Name": self.get_name()
            }, {
                "Similarity Measure and Minimum Threshold":
                self.get_sim_measure_min_threshold()
            }, {
                "Top Value": self.get_top_value()
            }, {
                "LDA Model": self.lda_model.get_params()
            }, {
                "Vectorizer": self.vectorizer.get_params()
            }, {
                "Vectorizer Type": type(self.vectorizer)
            }]
        }

    def get_name(self):
        return super().get_name()

    def get_model_gen_name(self):
        return super().get_model_gen_name()

    def get_similarity_measure(self):
        return self.similarity_measure

    def get_sim_matrix(self):
        return super().get_sim_matrix()

    def get_tokenizer_type(self):
        return type(self.vectorizer.tokenizer)

    def save_sim_matrix(self):
        super().save_sim_matrix()

    def get_query_vector(self):
        return self._query_vector

    def get_corpus_matrix(self):
        return self._corpus_matrix

    def get_vectorizer_type(self):
        return type(self.vectorizer)

    def print_topics(self):
        feature_names = self.vectorizer.get_feature_names()
        n_top_words = 10

        for topic_idx, topic in enumerate(self.lda_model.components_):
            message = "Topic #%d: " % topic_idx
            message += " ".join([
                feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]
            ])
            print(message)
Example 20
class LSI(GenericModel):
    def __init__(self, **kwargs):
        self._svd_matrix = None
        self._query_vector = None
        
        self.vectorizer = None
        self.svd_model = None
        
        super().__init__()
        
        self.similarity_measure = None
        
        self.set_basic_params(**kwargs)
        self.set_vectorizer(**kwargs)
        self.set_svd_model(**kwargs)
    
    def set_name(self, name):
        super().set_name(name)
    
    def set_model_gen_name(self, gen_name):
        super().set_model_gen_name(gen_name)
       
    def set_basic_params(self, **kwargs):
        self.set_name('LSI' if LSI_Model_Hyperp.NAME.value not in kwargs.keys() else kwargs[LSI_Model_Hyperp.NAME.value])
        self.set_similarity_measure(SimilarityMeasure.COSINE)
        self.set_model_gen_name('lsi')
    
    def set_similarity_measure(self, sim_measure):
        self.similarity_measure = sim_measure
    
    def set_vectorizer(self, **kwargs):
        self.vectorizer = TfidfVectorizer(stop_words='english',
                                             use_idf=True, 
                                             smooth_idf=True) if LSI_Model_Hyperp.VECTORIZER.value not in kwargs.keys() else kwargs[LSI_Model_Hyperp.VECTORIZER.value]
        
        vec_params = {key.split('__')[2]:kwargs[key] for key,val in kwargs.items() if '__vectorizer__' in key}
        self.vectorizer.set_params(**vec_params)
    
    def set_svd_model(self, **kwargs):
        self.svd_model = TruncatedSVD(n_components = 100, 
                                         algorithm = 'randomized',
                                         n_iter = 10, 
                                         random_state = 42) if LSI_Model_Hyperp.SVD_MODEL.value not in kwargs.keys() else kwargs[LSI_Model_Hyperp.SVD_MODEL.value]
        
        svd_model_params = {key.split('__')[2]:kwargs[key] for key,val in kwargs.items() if '__svd_model__' in key}
        self.svd_model.set_params(**svd_model_params)
        
    
    def recover_links(self, corpus, query, test_cases_names, bug_reports_names):
        
        if self.similarity_measure == SimilarityMeasure.COSINE:
            self._recover_links_cosine(corpus, query, test_cases_names, bug_reports_names)
        
        elif self.similarity_measure == SimilarityMeasure.JACCARD_INDEX:
            self._recover_links_jaccard(corpus, query, test_cases_names, bug_reports_names)
        
        elif self.similarity_measure == SimilarityMeasure.EDIT_DISTANCE:
            self._recover_links_edit(corpus, query, test_cases_names, bug_reports_names)
    
        self._record_docs_feats(corpus, query, test_cases_names, bug_reports_names)
    
    
    def _record_docs_feats(self, corpus, query, test_cases_names, bug_reports_names):
        self.mrw_tcs = self._recover_mrw_list(test_cases_names, corpus)
        self.mrw_brs = self._recover_mrw_list(bug_reports_names, query)
        
        self.dl_tcs = self._recover_dl_list(test_cases_names, corpus)
        self.dl_brs = self._recover_dl_list(bug_reports_names, query)
        
        index = list(test_cases_names) + list(bug_reports_names)
        self.docs_feats_df = pd.DataFrame(index=index,
                                         columns=['mrw','dl'])
        
        for tc_name, mrw in self.mrw_tcs:
            self.docs_feats_df.at[tc_name, 'mrw'] = mrw

        for tc_name, dl in self.dl_tcs:
            self.docs_feats_df.at[tc_name, 'dl'] = dl
            
        for br_name, mrw in  self.mrw_brs:
            self.docs_feats_df.at[br_name, 'mrw'] = mrw
        
        for br_name, dl in self.dl_brs:
            self.docs_feats_df.at[br_name, 'dl'] = dl
    
    def _recover_dl_list(self, artf_names, artf_descs):
        tokenizer = WordNetBased_LemmaTokenizer()
        dl_list = []
        for artf_name, artf_desc in zip(artf_names, artf_descs):
            dl_list.append((artf_name, len(tokenizer.__call__(artf_desc))))
        return dl_list
    
    def _recover_mrw_list(self, artf_names, artf_descs):
        N_REL_WORDS = 6
        mrw_list = [] # list of tuples (artf_name, mrw_list={})
        
        for artf_name, artf_desc in zip(artf_names, artf_descs):
            X = self.vectorizer.transform([artf_desc])
            df1 = pd.DataFrame(X.T.toarray())
            df1['token'] = self.vectorizer.get_feature_names()
            df1.sort_values(by=0, ascending=False, inplace=True)
            mrw = list(df1.iloc[0:N_REL_WORDS,1].values)
            mrw_list.append((artf_name, mrw))
            
        return mrw_list
            
    def _recover_links_cosine(self, corpus, query, test_cases_names, bug_reports_names):
        svd_transformer = Pipeline([('vec', self.vectorizer), 
                            ('svd', self.svd_model)])

        self._svd_matrix = svd_transformer.fit_transform(corpus)
        self._query_vector = svd_transformer.transform(query)
        self._sim_matrix = pairwise.cosine_similarity(X=self._svd_matrix, Y=self._query_vector)
        
        #self._sim_matrix =  super().normalize_sim_matrix(self._sim_matrix)
        self._sim_matrix = pd.DataFrame(data=self._sim_matrix, index=test_cases_names, columns=bug_reports_names)

    
    def _recover_links_jaccard(self, corpus, query, test_cases_names, bug_reports_names):
        tokenizer = self.vectorizer.tokenizer
                
        corpus_tokens = [tokenizer.__call__(doc) for doc in corpus]        
        query_tokens = [tokenizer.__call__(doc) for doc in query]
        
        self._sim_matrix = pd.DataFrame(index = test_cases_names, 
                                       columns = bug_reports_names,
                                       data = np.zeros(shape=(len(test_cases_names), len(bug_reports_names)), dtype='int8'))
        
        for br_id, doc_query_tset in zip(bug_reports_names, query_tokens):
            for tc_id, doc_corpus_tset in zip(test_cases_names, corpus_tokens):
                self._sim_matrix.at[tc_id, br_id] = nltk.jaccard_distance(set(doc_corpus_tset), set(doc_query_tset))
                
    
    def _recover_links_edit(self, corpus, query, test_cases_names, bug_reports_names):
        self._sim_matrix = pd.DataFrame(index = test_cases_names, 
                                       columns = bug_reports_names,
                                       data = np.zeros(shape=(len(test_cases_names), len(bug_reports_names)), dtype='int8'))
                
        for br_id, doc_query in zip(bug_reports_names, query):
            for tc_id, doc_corpus in zip(test_cases_names, corpus):
                self._sim_matrix.at[tc_id, br_id] = nltk.edit_distance(doc_corpus, doc_query)
        
        normalizer = Normalizer(copy=False).fit(self._sim_matrix.values)
        self._sim_matrix = pd.DataFrame(data=normalizer.transform(self._sim_matrix.values), index=test_cases_names, columns=bug_reports_names)
        
    
    def model_setup(self):
        return {"Setup" : 
                  [
                      {"Name" : self.get_name()},
                      {"Similarity Measure" : self.get_similarity_measure()},
                      {"SVD Model" : self.svd_model.get_params()},
                      {"Vectorizer" : self.vectorizer.get_params()},
                      {"Vectorizer Type" : type(self.vectorizer)}
                  ]
               }
        
    def get_query_vector(self):
        return self._query_vector
    
    def get_svd_matrix(self):
        return self._svd_matrix
    
    def get_vectorizer_type(self):
        return type(self.vectorizer)
    
    def get_tokenizer_type(self):
        return type(self.vectorizer.tokenizer)    
        
    def get_name(self):
        return super().get_name()

    def get_model_gen_name(self):
        return super().get_model_gen_name()
    
    def get_similarity_measure(self):
        return self.similarity_measure
    
    def get_sim_matrix(self):
        return super().get_sim_matrix()
        
    def save_sim_matrix(self):
        super().save_sim_matrix()
Example 21
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
#' And document first Is one second third this',
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus)

print("get_feature_names:", vectorizer.get_feature_names())

print("len:", len(vectorizer.get_feature_names()))

print("get_stop_words:", vectorizer.get_stop_words())

print("get_params:", vectorizer.get_params())

print(x.shape)

print("vocabulary:", vectorizer.vocabulary_)

print("idf:", vectorizer.idf_)

print(x)
TF-IDF is often used in information retrieval and text mining. It's a statistical measure used to
evaluate how important a word is to a document in a collection or a corpus.

"""

#Create Vectorizer parameters and fit the vectorizer to synopses.
tfidf =  TfidfVectorizer(max_df = 0.8, max_features = 2000, min_df = 0.2, stop_words = 'english',  use_idf = True, 
						tokenizer = tokenization_and_stemming, ngram_range = (1,1))
tfidf_matrix = tfidf.fit_transform(synoposes)

#Save the terms identified by TF-IDF.
tf_selected_words = tfidf.get_feature_names()

#Print out the matrix, parameters and main features of the TF-IDF Vector.
print("In total, there are " + str(tfidf_matrix.shape[0]) + " synopses and " + str(tfidf_matrix.shape[1]) + " terms.")
print("The parameters of the TF-IDF vectorizer are: ", tfidf.get_params())
print()
print("<TFIDF-Matrix>")
print(tfidf_matrix)
print()
print("<Selected Feature Names>")
print(tf_selected_words) 
print()

#Calculate Document Similarity:
from sklearn.metrics.pairwise import cosine_similarity
cos_matrix = cosine_similarity(tfidf_matrix)
print(cos_matrix)


"""
Example 23
tfidf_matrix = tfidf_model.fit_transform(reviews) #fit the vectorizer to synopses

print("In total, there are " + str(tfidf_matrix.shape[0]) + " summaries and " + str(tfidf_matrix.shape[1]) + " terms.")


# In[13]:


tfidf_matrix[0]


# In[14]:


tfidf_model.get_params()


# Save the terms identified by TF-IDF.

# In[15]:


tf_selected_words = tfidf_model.get_feature_names()


# # (Optional) Calculate Document Similarity

# In[16]:

cv = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
X = cv.fit_transform(corpus).toarray()
y = messages.label

## training testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42)

## get the parameters and feature name
cv.get_feature_names()
cv.get_params()

## final dataframe
final_df = pd.DataFrame(X_train, columns=cv.get_feature_names())

## Creating machine learning model
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)

## measuring the performance of classifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

acc = accuracy_score(y_test, y_predict)
Example 25
class VSM(GenericModel):
    def __init__(self, **kwargs):
        self._terms_matrix = None
        self._query_vector = None

        self.vectorizer = None
        self.svd_model = None

        super().__init__()

        self.similarity_measure = None

        self.set_basic_params(**kwargs)
        self.set_vectorizer(**kwargs)

    def set_name(self, name):
        super().set_name(name)

    def set_model_gen_name(self, gen_name):
        super().set_model_gen_name(gen_name)

    def set_basic_params(self, **kwargs):
        self.set_name('VSM' if VSM_Model_Hyperp.NAME.value not in
                      kwargs.keys() else kwargs[VSM_Model_Hyperp.NAME.value])
        self.set_similarity_measure(SimilarityMeasure.COSINE)
        self.set_model_gen_name('vsm')

    def set_similarity_measure(self, sim_measure):
        self.similarity_measure = sim_measure

    def set_vectorizer(self, **kwargs):
        self.vectorizer = TfidfVectorizer(
            stop_words='english', use_idf=True, smooth_idf=True
        ) if VSM_Model_Hyperp.VECTORIZER.value not in kwargs.keys(
        ) else kwargs[VSM_Model_Hyperp.VECTORIZER.value]

        vec_params = {
            key.split('__')[2]: kwargs[key]
            for key, val in kwargs.items() if '__vectorizer__' in key
        }
        self.vectorizer.set_params(**vec_params)

    def recover_links(self, corpus, query, test_cases_names,
                      bug_reports_names):
        starttime = time.time()
        self._recover_links_cosine(corpus, query, test_cases_names,
                                   bug_reports_names)
        self._record_docs_feats(corpus, query, test_cases_names,
                                bug_reports_names)
        endtime = time.time()
        print(
            f' ..Total processing time: {round(endtime-starttime, 2)} seconds',
        )

    def _record_docs_feats(self, corpus, query, test_cases_names,
                           bug_reports_names):
        self.mrw_tcs = self._recover_mrw_list(test_cases_names, corpus)
        self.mrw_brs = self._recover_mrw_list(bug_reports_names, query)

        self.dl_tcs = self._recover_dl_list(test_cases_names, corpus)
        self.dl_brs = self._recover_dl_list(bug_reports_names, query)

        index = list(test_cases_names) + list(bug_reports_names)
        self.docs_feats_df = pd.DataFrame(index=index, columns=['mrw', 'dl'])

        for tc_name, mrw in self.mrw_tcs:
            self.docs_feats_df.at[tc_name, 'mrw'] = mrw

        for tc_name, dl in self.dl_tcs:
            self.docs_feats_df.at[tc_name, 'dl'] = dl

        for br_name, mrw in self.mrw_brs:
            self.docs_feats_df.at[br_name, 'mrw'] = mrw

        for br_name, dl in self.dl_brs:
            self.docs_feats_df.at[br_name, 'dl'] = dl

    def _recover_dl_list(self, artf_names, artf_descs):
        tokenizer = WordNetBased_LemmaTokenizer()
        dl_list = []
        for artf_name, artf_desc in zip(artf_names, artf_descs):
            dl_list.append((artf_name, len(tokenizer.__call__(artf_desc))))
        return dl_list

    def _recover_mrw_list(self, artf_names, artf_descs):
        N_REL_WORDS = 6
        mrw_list = []  # list of tuples (artf_name, mrw_list={})

        for artf_name, artf_desc in zip(artf_names, artf_descs):
            X = self.vectorizer.transform([artf_desc])
            df1 = pd.DataFrame(X.T.toarray())
            df1['token'] = self.vectorizer.get_feature_names()
            df1.sort_values(by=0, ascending=False, inplace=True)
            mrw = list(df1.iloc[0:N_REL_WORDS, 1].values)
            mrw_list.append((artf_name, mrw))

        return mrw_list

    def _recover_links_cosine(self, corpus, query, test_cases_names,
                              bug_reports_names):
        transformer = Pipeline([('vec', self.vectorizer)])

        self._terms_matrix = transformer.fit_transform(corpus)
        self._query_vector = transformer.transform(query)
        self._sim_matrix = pairwise.cosine_similarity(X=self._terms_matrix,
                                                      Y=self._query_vector)

        #self._sim_matrix =  super().normalize_sim_matrix(self._sim_matrix)
        self._sim_matrix = pd.DataFrame(data=self._sim_matrix,
                                        index=test_cases_names,
                                        columns=bug_reports_names)

    def model_setup(self):
        return {
            "Setup": [{
                "Name": self.get_name()
            }, {
                "Similarity Measure": self.get_similarity_measure()
            }, {
                "Vectorizer": self.vectorizer.get_params()
            }, {
                "Vectorizer Type": type(self.vectorizer)
            }]
        }

    def get_query_vector(self):
        return self._query_vector

    def get_terms_matrix(self):
        return self._terms_matrix

    def get_vectorizer_type(self):
        return type(self.vectorizer)

    def get_tokenizer_type(self):
        return type(self.vectorizer.tokenizer)

    def get_name(self):
        return super().get_name()

    def get_model_gen_name(self):
        return super().get_model_gen_name()

    def get_similarity_measure(self):
        return self.similarity_measure

    def get_sim_matrix(self):
        return super().get_sim_matrix()

    def save_sim_matrix(self):
        super().save_sim_matrix()
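For orientation, here is a minimal, self-contained sketch of the cosine-similarity recovery step used in the class above. The artifact names and texts (TC1/BR1 and so on) are made up for illustration only.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise

# Hypothetical artifacts: test cases form the corpus, bug reports form the query set.
test_cases = {'TC1': 'login button does not respond', 'TC2': 'export report to csv'}
bug_reports = {'BR1': 'crash when clicking login', 'BR2': 'csv export writes an empty file'}

vectorizer = TfidfVectorizer()
terms_matrix = vectorizer.fit_transform(test_cases.values())   # one row per test case
query_vector = vectorizer.transform(bug_reports.values())      # one row per bug report

sim = pairwise.cosine_similarity(X=terms_matrix, Y=query_vector)
sim_df = pd.DataFrame(sim, index=test_cases.keys(), columns=bug_reports.keys())
print(sim_df.round(3))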
Esempio n. 26
0
def train(x_train, y_train, x_test, y_test):

    ######################## INIT ###################################
    print("\nINITIALIZING CLASSIFIER...")

    # vectorizer = TfidfVectorizer(ngram_range=(1,2))
    vectorizer = TfidfVectorizer()
    print("vectorizer params:", vectorizer.get_params())

    linear_svc = svm.LinearSVC()
    print("linear svc params", linear_svc.get_params())

    linear_svc_pipeline = Pipeline(
        steps=[("vectorizer", vectorizer), ("linear_svc", linear_svc)])

    ################### CROSS VAL and GRID SEARCH ####################
    print("\nPERFORMING GRID SEARCH WITH CROSS VALIDATION...")
    k_fold = KFold(n_splits=20, shuffle=True, random_state=1)
    # k_fold = KFold(n_splits=5, shuffle=True)
    linear_svc_params = [
        {  # Dual optimization
            "linear_svc__penalty": ["l2"],  # if l1, you can't use hinge
            "linear_svc__loss": ["hinge", "squared_hinge"],
            "linear_svc__dual": [True],  # if false, you can't use l2 or hinge
            # "linear_svc__tol": [1e-4, 1e-5],
            # "linear_svc__C":[0.001, 0.01, 0.1, 1, 10, 100, 1000],
            "linear_svc__C": [0.1, 1],
            # "linear_svc__multi_class": ["ovr", "crammer_singer"],
            # "vectorizer__stop_words": [None, stop_words.ENGLISH_STOP_WORDS],
            "vectorizer__ngram_range": [(1, 2), (1, 3)],
            "vectorizer__max_df": [0.9, 1.0],
            # "vectorizer__use_idf": [True, False]
        },
        {  # Primal Optimization
            "linear_svc__penalty": ["l1", "l2"],
            "linear_svc__loss": ["squared_hinge"],
            "linear_svc__dual": [False],
            # "linear_svc__tol": [1e-4, 1e-5],
            # "linear_svc__C":[0.001, 0.01, 0.1, 1, 10, 100, 1000],
            "linear_svc__C": [0.1, 1],
            # "linear_svc__multi_class": ["ovr", "crammer_singer"],
            # "vectorizer__stop_words": [None, stop_words.ENGLISH_STOP_WORDS],
            "vectorizer__ngram_range": [(1, 2), (1, 3)],
            "vectorizer__max_df": [0.9, 1.0],
            # "vectorizer__use_idf": [True, False]
        }
        # ,
        # {
        # 	# "linear_svc__penalty": ["l2"],		# if l1, you can't use hinge
        # 	"linear_svc__loss": ["hinge", "squared_hinge"],
        # 	# "linear_svc__dual": [True],	# if false, you can't use l2 or hinge
        # 	"linear_svc__tol": [1e-4, 1e-5],
        # 	"linear_svc__C":[0.1, 1, 10],
        # 	# "vectorizer__stop_words": [None, stop_words.ENGLISH_STOP_WORDS]
        # 	"vectorizer__ngram_range": [(1,1), (1,2), (1,3)],
        # 	"vectorizer__max_df": [0.9, 1.0]
        # 	# "vectorizer__use_idf": [True, False]
        # 	# "linear_svc__multi_class": ["ovr", "crammer_singer"]
        # }
    ]

    scores = ["precision_micro", "recall_micro", "f1_micro", "accuracy", None]

    for score in scores:
        print("# Tuning hyper-parameters for {0}".format(score))
        print()

        grd = GridSearchCV(linear_svc_pipeline,
                           param_grid=linear_svc_params,
                           cv=k_fold,
                           scoring=score)
        grd.fit(x_train, y_train)

        print("\nBest score and parameters set found on development set:")
        print("Score:", grd.best_score_, "Params:", grd.best_params_)
        print()

        print("All grid scores on development set:")
        means = grd.cv_results_["mean_test_score"]
        stds = grd.cv_results_["std_test_score"]
        for mean, std, params in zip(means, stds, grd.cv_results_["params"]):
            print("{0:0.3f} (+/-{1:0.3f}) for {2}".format(
                mean, std * 2, params))
        print()

        print("Detaiiled classification report:")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        y_true, y_pred = y_test, grd.predict(x_test)
        print(
            classification_report(
                y_true,
                y_pred,
                target_names=["negative", "neutral", "positive"]))
        print()

        print("Confusion matrix:")
        print(confusion_matrix(y_true, y_pred))
        print()
        print()
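The double-underscore keys in the grids above route each value to the named pipeline step ("vectorizer" or "linear_svc"). A stripped-down sketch of the same wiring on tiny made-up data (texts and labels are hypothetical):

from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Hypothetical toy corpus; the real run uses x_train/y_train as above.
texts = ["good movie", "bad movie", "great film", "terrible film"] * 5
labels = ["positive", "negative", "positive", "negative"] * 5

pipe = Pipeline(steps=[("vectorizer", TfidfVectorizer()),
                       ("linear_svc", svm.LinearSVC())])
# "<step name>__<param name>" addresses the corresponding pipeline step.
param_grid = {"vectorizer__ngram_range": [(1, 1), (1, 2)],
              "linear_svc__C": [0.1, 1]}
grd = GridSearchCV(pipe, param_grid=param_grid, cv=2)
grd.fit(texts, labels)
print(grd.best_params_, grd.best_score_)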
Esempio n. 27
0
class CommentsAnalyzer(pmlutil.Configurable):
    
    def configTypes(self):
        return dict(amount=int, min_ngram=int, max_ngram=int, min_df=int, max_df=float, use_idf=int, alpha=readArray, l1_ratio=readArray, n_folds=int)

    def _loadData(self):
        logging.info("loading data")
        self.data = []
        count = 0
        for fn in os.listdir(self._datafolder):
            if self._amount >= 1 and count >= self._amount:
                break
            if fn.endswith(self._metaextension):
                mfn = self._datafolder + "/" + fn
                ddm = pml.Datum(mfn,None)
                if len(ddm.meta()['comments'])>0:
                    self.data.append(ddm)
                    count +=1
        logging.info("loaded %d data" % count)

    def __init__(self):
        self.data=[]

    def _aggregateComments(self, subset):
        allcomments = []
        for datum in subset:
            comments = []
            for comment in datum.meta()['comments']:
                comments.append(comment['text'])
            allcomments.append(" ".join(comments))
        return np.array(allcomments)

    def _buildDictionary(self, allcomments):
        print allcomments
        self.vectorizer = TfidfVectorizer(analyzer=self._analyzer, ngram_range=(self._min_ngram,self._max_ngram),
                                     min_df=self._min_df, max_df=self._max_df, norm='l2', smooth_idf=True, use_idf=bool(self._use_idf))
        self.vectorizer.fit(allcomments)

    def run(self):
        allcomments = self._aggregateComments(self.data)
        self._buildDictionary(allcomments)

        # create representation of documents
        tfidfArray = self.vectorizer.transform(allcomments)

        # create labelling
        labels = []
        for datum in self.data:
            labels.append(len(datum.meta()['favorites']))
        labels = np.array(labels)

        print self.vectorizer.get_params()
        print self.vectorizer.get_feature_names()

        # training
        self.elasticNet = ElasticNetCV(alphas=self._alpha, l1_ratio=self._l1_ratio, fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, copy_X=True, tol=0.0001, rho=None, cv=self._n_folds)
        self.elasticNet.fit(tfidfArray,labels)

        for i,l1_ratio in enumerate(self._l1_ratio):
            for j,alpha in enumerate(self._alpha):
                print "alpha: %f, l1_ratio: %f --> %f" % (alpha,l1_ratio,np.mean(self.elasticNet.mse_path_[i,j,:]))

        print self.vectorizer.inverse_transform(self.elasticNet.coef_)
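The snippet above is Python 2 and passes arguments that newer scikit-learn releases no longer accept (rho, normalize, precompute='auto'). A rough Python 3 sketch of the same TF-IDF + ElasticNetCV regression idea, with invented comment texts and favourite counts:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import ElasticNetCV

comments = ["nice shot great colours", "blurry and dark", "love the composition", "too noisy"]
favorites = np.array([12, 1, 20, 0])  # hypothetical regression target (favourite counts)

vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1, norm='l2', use_idf=True)
X = vectorizer.fit_transform(comments)

# l1_ratio replaces the old `rho` argument; `normalize` and `precompute='auto'` were removed.
enet = ElasticNetCV(alphas=[0.01, 0.1, 1.0], l1_ratio=[0.2, 0.8], cv=2, max_iter=1000)
enet.fit(X, favorites)
print(enet.alpha_, enet.l1_ratio_)
print(vectorizer.inverse_transform(enet.coef_.reshape(1, -1)))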
Esempio n. 28
0
class TextRegressor:
    param_defaults = {'min_df': 1, 'c_ngmin': 1, 'c_ngmax': 1,
                      'w_ngmax': 1, 'w_ngmin': 1, 'lowercase': 'word',
                      'alpha': 1.0, 'C': 1.0, 'mix': 1.0}
    def __init__(self, regressor='ridge', vectorizer='tf-idf'):
        if regressor == 'ridge':
            from sklearn.linear_model import Ridge
            self.reg = Ridge()
        elif regressor == 'SVR':
            from sklearn.svm import SVR
            self.reg = SVR()
        elif regressor == 'linearsvr':
            from sklearn.svm import LinearSVR
            self.reg = LinearSVR()
        if vectorizer == 'tf-idf':
            from sklearn.feature_extraction.text import TfidfVectorizer
            self.vec = TfidfVectorizer()
        self.vec_params_default = self.vec.get_params()
        self.reg_params_default = self.reg.get_params()
        self._reset()

    def _reset(self):
        self.par = dict(self.param_defaults)
        self.vec_params = self.vec_params_default
        self.vec.set_params(**self.vec_params)
        self.reg_params = self.reg_params_default
        self.reg.set_params(**self.reg_params)

    def set_params(self, **params):
        self._reset()
        self.par.update(params)
        ngram_analyzer = DocAnalyzer(
                    lowercase=self.par.get('lowercase'),
                    c_ngmin=self.par.get('c_ngmin'),
                    c_ngmax=self.par.get('c_ngmax'),
                    w_ngmin=self.par.get('w_ngmin'),
                    w_ngmax=self.par.get('w_ngmax'))
        self.vec_params.update(
            {k:self.par[k] for k in self.par.keys() & self.vec_params.keys()})
        self.vec.set_params(**self.vec_params)
        self.vec.set_params(analyzer=ngram_analyzer)
        self.reg_params.update(
            {k:self.par[k] for k in self.par.keys() & self.reg_params.keys()})
        self.reg.set_params(**self.reg_params)

    def get_params(self):
        return self.par

    def fit(self, text, outcome):
        num = None
        if len(text) == 2:
            text, num = text
        x = self.vec.fit_transform(text)
        if num is not None:
            x = hstack((x, self.par['mix'] * num), format='csr')
        self.reg.fit(x, outcome)

    def predict(self, text,
                gold=None, gold_rank=None, rank_dir=-1, return_score=False):
        num = None
        if len(text) == 2:
            text, num = text
        x = self.vec.transform(text)
        if num is not None:
            x = hstack((x, self.par['mix'] * num), format='csr')
        pred = self.reg.predict(x)
        if return_score:
            return pred, self._score(gold, pred, gold_rank, rank_dir)
        else:
            return pred

    def _score(self, gold, pred, gold_rank=None, rank_dir=-1,
            verbose=False):
        r2 = r2_score(gold, pred)
        rmse = np.sqrt(mean_squared_error(gold, pred))
        if gold_rank is None:
            gold_rank = rankdata(rank_dir * gold, method='ordinal')
        pred_rank = rankdata(rank_dir * pred, method='ordinal')
        corr, _ = pearsonr(gold, pred)
        rank_corr, _ = pearsonr(gold_rank, pred_rank)
        if verbose:
            fmt = ("{}: n={}, min={:.4f}, max={:.4f}, mean={:.4f}, "
                   "var={:.4f}, skew={:.4f}, kurtosis={:.4f}")
            gold_dsc = describe(gold)
            pred_dsc = describe(pred)
            print(fmt.format('gold',
                gold_dsc[0], *gold_dsc[1], *gold_dsc[2:]))
            print(fmt.format('pred',
                pred_dsc[0], *pred_dsc[1], *pred_dsc[2:]))
        return {'r2': r2, 'rmse': rmse, 'rank_corr': rank_corr, 'corr': corr}

    def score(self, text, gold, gold_rank=None, rank_dir=-1,
            verbose=False):
        pred = self.predict(text)
        return self._score(gold, pred, gold_rank, rank_dir,
                verbose=verbose)
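A minimal usage sketch of TextRegressor, assuming its module provides the helpers it references (DocAnalyzer, hstack, rankdata, r2_score, ...); the essays and scores below are invented. Note that passing exactly two documents would be misread as the (text, num) tuple form in fit/predict, which is why the sketch uses four.

texts = ["short and clear essay", "rambling text with many digressions",
         "well structured argument", "unstructured notes"]
scores = [4.5, 2.0, 4.8, 1.5]

model = TextRegressor(regressor='ridge', vectorizer='tf-idf')
model.fit(texts, scores)          # plain text input; the (text, num) tuple form is optional
print(model.predict(["another clear essay"]))
print(model.get_params())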
Esempio n. 29
0
nltk.download('stopwords')
limit = []
for i in range(0, len(stringTotal)):
    review = re.sub('[^a-zA-Z]', ' ', stringTotal['text'][i])
    review = review.lower()
    review = review.split()
    review = [
        singleStem.stem(word) for word in review
        if word not in stopwords.words('english')
    ]
    review = ' '.join(review)
    limit.append(review)

vector = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

vector.get_params()

xAxis = vector.fit_transform(limit).toarray()

yAxis = stringTotal['fact']

X_train, X_test, y_train, y_test = train_test_split(xAxis,
                                                    yAxis,
                                                    test_size=0.2,
                                                    random_state=0)

data_count = pd.DataFrame(X_train, columns=vector.get_feature_names())

data_count.head(3)

identifier = PassiveAggressiveClassifier(max_iter=1000)
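The example stops right after constructing the classifier; a plausible continuation, assuming the train/test split above is what gets used, might be:

from sklearn.metrics import accuracy_score, confusion_matrix

# Hypothetical continuation: fit on the TF-IDF features and evaluate on the held-out split.
identifier.fit(X_train, y_train)
y_pred = identifier.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))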
    def get_data_only_with_entities(self, relevance_threshold=0.75, gamma=0.89, filter=False):
        data = self.mongo.get_all(order_by='id_doc')

        data = [doc for doc in data]
        only_text = [doc['text'] for doc in data]

        ent_dict, ent_set = self.get_dandelion_entities(data)

        if filter:
            entities_set = set([k for k, v in ent_dict.iteritems()])
        else:
            entities_set = ent_set
        entities = {e: i for i, e in enumerate(entities_set)}
        dandelion_entities = np.zeros((len(data), len(entities_set)))

        for doc in data[:]:
            text = doc['text']
            if 'dandelion' in doc:
                for e in doc['dandelion']['annotations']:
                    rel = np.float64(e['confidence'])
                    name = e['title']
                    if rel > relevance_threshold:
                        dandelion_entities[doc['id_doc']][entities[name]] = rel

        entities_sparse = sparse.csr_matrix(dandelion_entities)

        tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                           max_features=200000,
                                           min_df=2,
                                           stop_words='english',
                                           strip_accents='unicode',
                                           use_idf=True,
                                           ngram_range=(1, 1),
                                           norm='l2',
                                           tokenizer=TextUtils.tokenize_and_stem)

        tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)

        print 'tfidf matrix dimension: %s x %s' %(tfidf_matrix.shape[0],
                                                  tfidf_matrix.shape[1])
        print 'entities matrix dimension: %s x %s ' %(entities_sparse.shape[0],
                                                     entities_sparse.shape[1])
        print 'non zero elements in entities matrix: %s' \
              % len(entities_sparse.data)

        '''print tfidf_matrix[tfidf_matrix > 0].mean()
        print tfidf_matrix[tfidf_matrix > 0].max()

        print entities_sparse[entities_sparse > 0].mean()
        print entities_sparse[entities_sparse > 0].max()
        print '#' * 80'''
        #print 'after balancing'

        # tf-idf weights stay unchanged; entity confidences are down-weighted by (1 - gamma)
        tfidf_matrix = tfidf_matrix * 1
        entities_sparse = entities_sparse * (1 - gamma)

        #print tfidf_matrix[tfidf_matrix > 0].mean()
        #print tfidf_matrix[tfidf_matrix > 0].max()

        #print entities_sparse[entities_sparse > 0].mean()
        #print entities_sparse[entities_sparse > 0].max()

        f_score_dict = self.labels_dict(data)
        params = tfidf_vectorizer.get_params()
        params['dandelion_entities'] = entities_sparse.shape[1]
        params['original_terms'] = tfidf_matrix.shape[1]  # number of tf-idf terms, not documents
        params['gamma'] = gamma
        params['relevance_threshold'] = relevance_threshold
        params['classes'] = len(f_score_dict)
        params['tokenizer'] = 'TextUtils.tokenize_and_stem'
        del params['dtype']

        params['avg_nnz_row'] = (entities_sparse > 0).sum(1).mean()

        return sparse.hstack([tfidf_matrix, entities_sparse]), f_score_dict,\
               params
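To make the gamma weighting and the final hstack above concrete, here is a tiny self-contained sketch with made-up matrices (two documents, two terms, two entities):

import numpy as np
from scipy import sparse

gamma = 0.89
tfidf_matrix = sparse.csr_matrix(np.array([[0.2, 0.8], [0.5, 0.5]]))      # docs x terms
entities_sparse = sparse.csr_matrix(np.array([[0.9, 0.0], [0.0, 0.76]]))  # docs x entities

# tf-idf block kept as-is, entity confidences scaled down by (1 - gamma) before stacking.
combined = sparse.hstack([tfidf_matrix * 1, entities_sparse * (1 - gamma)])
print(combined.toarray())
print('avg non-zero entities per row:', (entities_sparse > 0).sum(1).mean())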
class CharTfidfTagger(BaseEstimator):
    def __init__(self, **kwargs):
        """
        Character-based tfidf sequence tagger.

        Examples
        --------
        >>> model = CharTfidfTagger()
        >>> model.fit([["token1", "token2"]], [["A", "B"]])
        >>> model.predict([["token1", "token2"]])
        [['A', 'B']]
        """
        self.tagger = sklearn_crfsuite.CRF()
        self.tfidf = TfidfVectorizer(analyzer="char_wb", strip_accents="ascii")

        self.set_params(**kwargs)

    def set_params(self, **params):
        self.tfidf.set_params(
            **{
                k.split("__", 1)[-1]: v
                for k, v in params.items() if k.startswith("tfidf__")
            })
        self.tagger.set_params(
            **{
                k.split("__", 1)[-1]: v
                for k, v in params.items() if k.startswith("tagger__")
            })

    def get_params(self, deep=True):
        params = {
            "tfidf__" + k: v
            for k, v in self.tfidf.get_params(deep).items()
        }
        params.update({
            "tagger__" + k: v
            for k, v in self.tagger.get_params(deep).items()
        })
        return params

    def fit(self, X, y):
        corpus = [" ".join(example) for example in X]
        self.tfidf.fit(corpus)
        features = [self.featurize(example) for example in X]
        self.tagger.fit(features, y)

    def predict(self, X):
        features = [self.featurize(example) for example in X]
        return self.tagger.predict(features)

    def score(self, X, y):
        predictions = self.predict(X)
        return sklearn_crfsuite.metrics.flat_f1_score(y,
                                                      predictions,
                                                      average="macro")

    def featurize(self, tokens) -> List[Dict[str, Any]]:
        return [
            dict(
                zip(
                    self.tfidf.get_feature_names(),
                    self.tfidf.transform([token]).toarray().reshape(-1),
                )) for token in tokens
        ]
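A hypothetical end-to-end use of CharTfidfTagger, assuming sklearn_crfsuite is installed and a scikit-learn version where TfidfVectorizer.get_feature_names() still exists (newer releases renamed it to get_feature_names_out()):

# Prefixed keyword arguments are routed to the vectorizer and the CRF respectively.
model = CharTfidfTagger(tfidf__ngram_range=(2, 4), tagger__max_iterations=50)
model.fit([["hello", "world"], ["foo", "bar"]], [["A", "B"], ["B", "A"]])
print(model.predict([["hello", "bar"]]))
print(model.score([["hello", "world"]], [["A", "B"]]))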