Example #1
class Recommender(object):
    '''
    A class to house the text vectorizer and stacked Naive Bayes/Random Forest 
    Classifiers that form the heart of this wine recommender.
    '''
    def __init__(self):
        self.nb = ComplementNB()
        self.rf = RandomForestClassifier()
        self.vecto = TfidfVectorizer()

    def _fit(self, data):
        '''
        Takes in the data for the recommender to be trained and fit to.

        Parameters
        ----------
        data - The filepath to the data being fit.

        Returns
        ----------
        None
        '''

        wrangler = Data_Handler(data)
        df = wrangler.get_top_num(15)
        X = df['description']
        y = df['variety']

        X = self.vecto.fit_transform(X)
        self.nb.fit(X, y)
        X = self.nb.predict_proba(X)

        self.rf.fit(X, y)

    def predict(self, text):
        '''
        Takes in a single input of tasting notes and runs it through our
        vectorizer and ensemble method to return the top five predicted
        varieties.

        Parameters
        ----------
        text - str - The input tasting notes.

        Returns
        ----------
        top_five - list - The top five predicted varieties for recommendation.
        '''

        vect = self.vecto.transform([text])
        probs = self.nb.predict_proba(vect)
        probs = self.rf.predict_proba(probs)[0]
        idx = np.argsort(probs)
        top_five_idx = idx[-1:-6:-1]
        top_five = self.rf.classes_[top_five_idx]
        return top_five
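A minimal usage sketch for the Recommender above; the file path and tasting note are invented for illustration, and it assumes the project's Data_Handler plus the sklearn/numpy imports shown below are available.
# hypothetical usage; Data_Handler and the reviews CSV come from the original project
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

rec = Recommender()
rec._fit('wine_reviews.csv')  # hypothetical path to the wine review data
print(rec.predict('Aromas of ripe cherry and plum with firm tannins on the finish'))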
Example #2
def findBestFitCluster(orphanCorpus, corpusCluster={}):
    """
    Given a set of questions without a cluster and a set of existing clusters, find the best cluster in which to place the orphaned questions
    Parameters:
        orphanCorpus (tagged_question_corpus.TaggedQuestionCorpus): corpus of the questions without a cluster.
        corpusCluster ({tagged_question_corpus.TaggedQuestionCorpus}): Object containing different clusters and their corpuses

    Returns:
        xxx
    """

    # corpusCluster = {
    #     "questions": [ 'and the moon too guys', 'lets show some or a lot of love for the moon!!' ],
    #     "question_vectors": [[], []],
    #     "clusterIds": [ '4', '4' ]
    # }

    # orphanCorpus = [ {
    #         "id": 11, "question": 'Another one about the sun?', "question_vector": []
    #     },
    #     {
    #         "id": 33,
    #         "question": 'What is the distance from the sun though?', "question_vector": [] },
    #     {
    #         "id": 37,
    #         "question": 'what\'s the changing factors of the sun and moon together?', "question_vector": []
    # } ]

    # Fit the Naive bayes model on existing clusters
    clf = ComplementNB()
    clf.fit(corpusCluster["question_vectors"], corpusCluster["clusterIds"])

    predictions = clf.predict_proba(
        [doc["question_vector"] for doc in orphanCorpus])
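The excerpt stops right after predict_proba; a minimal sketch of one plausible continuation, assuming the data shapes shown in the comments above (string clusterIds, one probability row per orphan question):
# hypothetical continuation: assign each orphan question to its most likely cluster
import numpy as np

best = np.argmax(predictions, axis=1)
for doc, cluster_idx in zip(orphanCorpus, best):
    doc["clusterId"] = clf.classes_[cluster_idx]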
Example #3
class ComplementNBImpl():
    def __init__(self,
                 alpha=1.0,
                 fit_prior=True,
                 class_prior=None,
                 norm=False):
        self._hyperparams = {
            'alpha': alpha,
            'fit_prior': fit_prior,
            'class_prior': class_prior,
            'norm': norm
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
def complement_bayes(x_train, x_test, y_train, y_test, X, fl, amostra_paci3, fl_a3, nome):
    Complement = ComplementNB()
    Complement.fit(x_train, y_train)
    pred = Complement.predict_proba(x_train)
    amostra_ = Complement.predict_proba(amostra_paci3)
    amostra_2 = Complement.predict(amostra_paci3)
    amostra_paci3['result'] = 0
    amostra_paci3['probls'] = 0
    # keep only the positive-class probability; predict_proba returns one column per class
    amostra_paci3['probls'] = amostra_[:, 1]
    amostra_paci3['result'] = amostra_2
    amostra_paci3['fl_severidade'] = fl_a3
    amostra_paci3.to_csv('modelo_complement_bayes.csv')
    print('Treinamento AUC-ROC:{}'.format(roc_auc_score(y_train, pred[:, 1])))
    pred_2 = Complement.predict_proba(x_test)
    print('Validacao AUC-ROC:{}'.format(roc_auc_score(y_test, pred_2[:, 1])))
    # print(Complement.predict_proba(X))
    yhat = Complement.predict_proba(X)
    yhat = yhat[:, 1]
    print(pd.crosstab(fl, Complement.predict(X)))
    print(classification_report(fl, Complement.predict(X)))
    print('AUC: %0.2f' % roc_auc_score(fl, yhat))
    plot_roc_curve(fl, yhat, nome)
Example #5
class _ComplementNBImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)
def cluster_by_keyword(keyword=None,
                       questions=[],
                       questionIds=[],
                       analyzer="word"):
    """
    Finds the keyword question pairs using complement naive bayes algorithm by first tfidf vectorizing the list of questions (corpus) and trying to find the questions with the highest likelihood of belonging to the keyword cluster.

    Parameters:
        keyword (string): A string title of the cluster we are trying to look for
        questions (list(string)): The list of the question string text
        questionIds (list(number)): A list containing the ids of the questions passed as the            questions parameter. These Ids are what is returned per identified cluster.
        analyser (string): Either "word", "char", or "char_wb"
            - Use word for identifying whole words and drop potential support for spelling errors, but support stop words and finding exact words.
            - Use char to identify ngrams based on a sequence of characters, does not support stop words and can return less than ideal clusters compositions, but supports spelling mistakes and incomplete keywords
            - Use char_wb to have character level ngrams that do not go beyond word boundaries. Extra padding is added to last characters of words when making ngrams.

    Returns:
        list of cluster tupples (list[(qnId, probability)]): List of tuples containing question Ids 
            and the probability of the question belonging to this cluster
        threshold (number): The dynamic threshold that was used as a cut off point.
    """
    keyword_len = len(keyword)
    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.4,
                                 smooth_idf=True,
                                 stop_words="english",
                                 ngram_range=(1, 3),
                                 analyzer=analyzer,
                                 use_idf=True,
                                 lowercase=True)
    vectorized_questions = vectorizer.fit_transform(questions)

    cnb_clf = ComplementNB(alpha=1, fit_prior=False)
    cnb_clf.fit(vectorized_questions, questionIds)

    keyword = vectorizer.transform([keyword])
    probabilities = cnb_clf.predict_proba(keyword)
    # threshold = np.var(probabilities) + np.median(probabilities)
    threshold = np.std(probabilities) + np.median(probabilities) + np.var(
        probabilities)
    return [(questionIds[ix], p) for ix, p in enumerate(probabilities[0])
            if p > threshold], threshold
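A small usage sketch for cluster_by_keyword; the toy questions and ids are invented, and the imports mirror what the function relies on.
# hypothetical call with toy data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
import numpy as np

questions = [
    "What is the distance from the earth to the sun?",
    "How hot is the surface of the sun?",
    "When was the moon landing?",
]
matches, threshold = cluster_by_keyword(keyword="moon",
                                        questions=questions,
                                        questionIds=[11, 12, 13])
print(matches, threshold)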
Example #7
def code_from_feedbacks(inputs, clsf, lng, level):
    # does the same stuff as in 'code'
    inputs = prepare_input(inputs, lng)
    tf = TfidfVectorizer(analyzer="word",
                         ngram_range=(1, 2),
                         min_df=0,
                         sublinear_tf=True)

    # filter data
    train = Feedback.objects.filter(classification=clsf,
                                    level=level,
                                    language=lng)

    # if no feedbacks...return empty array of arrays
    if len(train) < 2:
        return [[] for i in inputs]

    train_text = [unidecode(t.text) for t in train]
    train_codes = [t.code for t in train]
    X = tf.fit_transform(train_text)

    # finally, model
    # complement naive bayes
    model = ComplementNB()
    model.fit(X, train_codes)
    inputs = [unidecode(i) for i in inputs]
    inputs_tf = tf.transform(inputs)
    output = model.predict(inputs_tf)
    probs = model.predict_proba(inputs_tf)

    output2 = [[out] for out in output]
    for i in range(0, len(inputs)):
        dif = max(probs[i]) - min(probs[i])
        if (dif == 0):
            output2[i] = []

    return output2
#text = 'The movie has a pleasant start, and that is all to it. It has such an ordinary story line that you can predict the next scene. It just getting more silly and stupid with time. By the end of the watch, you will realise what a complete waste of time it was.It is such a dumb watch. Well, it was a yawn-inducing, bland and senseless movie. Not recommended. One-star from me.'#negative
#text = 'The movie had a very original start and it was comparative slow. But in my opinion, it had a thought-provoking idea. It gets uninteresting and tiresome in the middle, and a little predictable. But on the whole, it was unpretentious and tender. 3-star from me.'#neutral

text = text.lower()
text = word_tokenize(text)
stop_words = set(stopwords.words("english"))
wordnet_lemmatizer = WordNetLemmatizer()
text = [
    wordnet_lemmatizer.lemmatize(word, pos="v") for word in text
    if not word in stop_words and not word in string.punctuation
]
print(text)
x_test = vectorizer.transform(text)
y_pred = model.predict(x_test)
label = model.classes_
prob = model.predict_proba(x_test)
prob = [True if np.amax(ele) > 0.60 else False for ele in prob]
y_pred1 = [y_pred[i] if ele else '' for i, ele in enumerate(prob)]
print(y_pred1)
y_pred = [y_pred[i] for i, ele in enumerate(prob) if ele]
y_pred = [y_pred.count(ele) for ele in label]
print(label)
print(y_pred)
y_pred = y_pred - np.amax(y_pred)
y_pred = [True if ele >= 0 else False for ele in y_pred]
print(y_pred)
if sum(y_pred) == 1:
    result = [label[i] for i, ele in enumerate(y_pred) if ele]
else:
    result = 'Neutral'
print(result)
Example #9
class Classifier:
    def __init__(self, max_df=0.80, max_features=6500):
        self.count_vect = TfidfVectorizer(max_df=max_df,
                                          stop_words='english',
                                          max_features=max_features,
                                          use_idf=True)
        self.cnb = ComplementNB()
        np.random.seed(2222)

    def __fit(self):
        self.cnb.fit(self.x_train, self.train_set['category'])

    # Calling this method just after object creation is required in order to set up data
    # Attribute test_size specifies the magnitude of the test set
    def set_data(self, dataset: pd.DataFrame, labels: list, test_size=0.25):
        self.train_set, self.test_set = train_test_split(dataset,
                                                         test_size=test_size)
        self.x_train = self.count_vect.fit_transform(self.train_set['text'])
        self.labels = labels
        self.__fit()

    # This method returns the predicted label for the text provided
    def predict(self, text: str):
        txt = TextTools()
        text = txt.preprocess(text)
        feats = self.count_vect.transform([text])
        return self.cnb.predict(feats)

    # This method returns a matrix of probabilities computed by Complement Naive Bayes
    def get_predict_proba(self, text: str):
        feats = self.count_vect.transform([text])
        predictions = {
            'label': (self.cnb.predict(feats))[0],
            'features': self.cnb.predict_proba(feats)
        }
        return predictions

    # This method returns the f1-score
    def get_score(self):
        x_test = self.count_vect.transform(self.test_set['text'])
        y_test_pred = self.cnb.predict(x_test)
        return f1_score(self.test_set['category'],
                        y_test_pred,
                        average=None,
                        labels=self.labels).mean()

    # This method plots the confusion matrix
    def get_cmatrix(self):
        x_test = self.count_vect.transform(self.test_set['text'])
        y_test_pred = self.cnb.predict(x_test)
        disp = plot_confusion_matrix(self.cnb,
                                     x_test,
                                     self.test_set['category'],
                                     display_labels=self.labels,
                                     cmap=plt.cm.Blues,
                                     normalize='true')
        plt.show()

    # This method computes the cosine similarity between item1 and item2
    # item[1,2] must be array-like
    def similarity(self, item1, item2):
        return cosine(item1, item2)
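A rough usage sketch for the Classifier above; the toy DataFrame is invented, it assumes the example's own sklearn imports are in scope, and TextTools is only needed for predict.
# hypothetical usage with a tiny toy dataset
import pandas as pd

df = pd.DataFrame({
    'text': ['the game went to overtime last night',
             'the election results are finally in',
             'a new graphics card was released today',
             'parliament votes on the budget bill'],
    'category': ['sport', 'politics', 'tech', 'politics'],
})
clf = Classifier()
clf.set_data(df, labels=['sport', 'politics', 'tech'], test_size=0.25)
print(clf.get_score())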
Example #10
class ProbabilisticValidator():
    """
    # The probabilistic validator is a quick to train model used for validating the predictions of our main model
    # It is fit to the results our model gets on the validation set
    """
    _smoothing_factor = 0.5  # TODO: Autodetermine smoothing factor depending on the info we know about the dataset
    _probabilistic_model = None
    _X_buff = None
    _Y_buff = None

    def __init__(self, col_stats, data_type=None):
        """
        Choose the algorithm to use for the rest of the model
        As of right now we go with ComplementNB
        """
        self._X_buff = []
        self._Y_buff = []
        self._predicted_buckets_buff = []
        self._real_buckets_buff = []
        self._original_real_buckets_buff = []
        self._original_predicted_buckets_buff = []

        self.col_stats = col_stats

        if 'percentage_buckets' in col_stats:
            self._probabilistic_model = MultinomialNB(
                alpha=self._smoothing_factor)

            self.buckets = col_stats['percentage_buckets']
            self.bucket_keys = [i for i in range(len(self.buckets))]

            if len(self.buckets) < 3:
                self._probabilistic_model = ComplementNB(
                    alpha=self._smoothing_factor)
        else:
            self._probabilistic_model = ComplementNB(
                alpha=self._smoothing_factor)

            self.buckets = None

        self.data_type = col_stats['data_type']

        self.bucket_accuracy = {}

    def register_observation(self,
                             features_existence,
                             real_value,
                             predicted_value,
                             is_original_data=False,
                             hmd=None):
        """
        # Register an observation in the validator's internal buffers

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param real_value: The real value/label for this prediction
        :param predicted_value: The predicted value/label
        :param histogram: The histogram for the predicted column, which allows us to bucketize the `predicted_value` and `real_value`
        """
        try:
            predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(
                predicted_value)
        except:
            predicted_value = None

        try:
            real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(
                str(real_value).replace(',', '.'))
        except:
            real_value = None

        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets,
                                                 self.col_stats, hmd)
            real_value_b = get_value_bucket(real_value, self.buckets,
                                            self.col_stats, hmd)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = X + features_existence

            self._X_buff.append(X)
            self._Y_buff.append(real_value_b)
            self._real_buckets_buff = self._Y_buff
            self._predicted_buckets_buff.append(predicted_value_b)

            if is_original_data:
                self._original_real_buckets_buff.append(real_value_b)
                self._original_predicted_buckets_buff.append(predicted_value_b)

            # If no column is ignored, compute the accuracy for this bucket
            nr_missing_features = len(
                [x for x in features_existence if x in (False, 0)])
            if nr_missing_features == 0:
                if real_value_b not in self.bucket_accuracy:
                    self.bucket_accuracy[real_value_b] = []
                self.bucket_accuracy[real_value_b].append(
                    int(real_value_b == predicted_value_b))
        else:
            predicted_value_b = predicted_value
            real_value_b = real_value
            self._X_buff.append(features_existence)
            self._Y_buff.append(real_value_b == predicted_value_b)
            self._real_buckets_buff.append(real_value_b)
            self._predicted_buckets_buff.append(predicted_value_b)

            if is_original_data:
                self._original_real_buckets_buff.append(real_value_b)
                self._original_predicted_buckets_buff.append(predicted_value_b)

    def get_accuracy_histogram(self):
        x = []
        y = []

        total_correct = 0
        total_vals = 0

        buckets_with_no_observations = []
        for bucket in range(len(self.buckets)):
            try:
                total_correct += sum(self.bucket_accuracy[bucket])
                total_vals += len(self.bucket_accuracy[bucket])
                y.append(
                    sum(self.bucket_accuracy[bucket]) /
                    len(self.bucket_accuracy[bucket]))
            except:
                # If no observations were made for this bucket
                buckets_with_no_observations.append(bucket)
                y.append(None)

            x.append(bucket)

        validation_set_accuracy = total_correct / total_vals
        for bucket in buckets_with_no_observations:
            y[x.index(bucket)] = validation_set_accuracy

        return {'buckets': x, 'accuracies': y}, validation_set_accuracy

    def partial_fit(self):
        """
        # Fit the probabilistic validator on all observations recorded that haven't been taken into account yet
        """
        log_types = np.seterr()
        np.seterr(divide='ignore')

        if self.buckets is not None:
            self._probabilistic_model.partial_fit(self._X_buff,
                                                  self._Y_buff,
                                                  classes=self.bucket_keys)
        else:
            self._probabilistic_model.partial_fit(self._X_buff,
                                                  self._Y_buff,
                                                  classes=[True, False])

        np.seterr(divide=log_types['divide'])

        self._X_buff = []
        self._Y_buff = []

    def fit(self):
        """
        # Fit the probabilistic validator on all observations recorded that haven't been taken into account yet
        """
        log_types = np.seterr()
        np.seterr(divide='ignore')
        self._probabilistic_model.fit(self._X_buff, self._Y_buff)
        np.seterr(divide=log_types['divide'])

        self._X_buff = []
        self._Y_buff = []

    def get_confusion_matrix(self):
        # The rows represent predicted values
        # The "columns" represent real values
        labels = list(set(self._original_real_buckets_buff))

        matrix = confusion_matrix(self._original_real_buckets_buff,
                                  self._original_predicted_buckets_buff,
                                  labels=labels)

        value_labels = []
        for label in labels:
            try:
                value_labels.append(str(self.buckets[label]))
            except:
                value_labels.append('UNKNOWN')

        confusion_matrix_obj = {
            'matrix': [[int(y) for y in x] for x in matrix],
            'predicted': value_labels,
            'real': value_labels
        }
        return confusion_matrix_obj

    def evaluate_prediction_accuracy(self, features_existence,
                                     predicted_value):
        """
        # Fit the probabilistic validator on an observation
        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param predicted_value: The predicted value/label
        :return: The probability (from 0 to 1) of our prediction being accurate (within the same histogram bucket as the real value)
        """
        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets,
                                                 self.col_stats)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = [X + features_existence]
        else:
            X = [features_existence]

        distribution = self._probabilistic_model.predict_proba(np.array(X))[0]
        distribution = distribution.tolist()

        if len([x for x in distribution if x > 0.01]) > 4:
            # @HACK
            mean = np.mean(distribution)
            std = np.std(distribution)

            distribution = [x if x > (mean - std) else 0 for x in distribution]

            sum_dist = sum(distribution)
            # Avoid division by zero in certain edge cases
            sum_dist = 0.00001 if sum_dist == 0 else sum_dist
            distribution = [x / sum_dist for x in distribution]

            min_val = min([x for x in distribution if x > 0.001])
            distribution = [
                x - min_val if x > min_val else 0 for x in distribution
            ]

            sum_dist = sum(distribution)
            # Avoid division by zero in certain edge cases
            sum_dist = 0.00001 if sum_dist == 0 else sum_dist
            distribution = [x / sum_dist for x in distribution]
            # @HACK
        else:
            pass

        return ProbabilityEvaluation(self.buckets, distribution,
                                     predicted_value)
        r_train = encoded_r[train_index]
        module_train = df.MODULE.to_numpy()[train_index].reshape(-1, 1)

        desc_test = vectored_desc[test_index]
        r_test = encoded_r[test_index]
        module_test = df.MODULE.to_numpy()[test_index].reshape(-1, 1)

        # Train the NB and DT classifiers:
        cnb = ComplementNB()
        cnb.fit(desc_train, module_train)

        dt = DecisionTreeClassifier()
        dt.fit(r_train, module_train)

        # Compute the probabilities:
        desc_proba = cnb.predict_proba(desc_test)
        r_proba = dt.predict_proba(r_test)

        # To keep the frequency of each module's tickets from biasing the recommendation, convert to a 0-1 vector so it acts only as a hard filter:
        r_proba[r_proba > 0.000001] = 1
        joint_proba = desc_proba * r_proba

        # Recommend the top 3:
        item_tree = predict_top_k(joint_proba, cnb.classes_)
        item_none_tree = predict_top_k(desc_proba, cnb.classes_)

        # Compute accuracy:
        acc1 = accuracy(module_test, item_tree)
        acc2 = accuracy(module_test, item_none_tree)
        accs_with_tree.append(acc1)
        accs_without_tree.append(acc2)
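# predict_top_k and accuracy are not shown in this excerpt; a minimal, hypothetical
# sketch of a top-k helper with the shape used above (probability rows aligned with
# cnb.classes_) could look like this:
import numpy as np

def predict_top_k(proba, classes, k=3):
    # indices of the k largest probabilities per row, highest first
    top_idx = np.argsort(proba, axis=1)[:, ::-1][:, :k]
    return classes[top_idx]  # (n_samples, k) array of class labels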
def evaluate_partitions(keep_bin_edges, df_processed):
    """ This function evaluates a lightweight classifier according to the thresholds.
        Inputs are a list of bin-edges for the continuous target and the processed df.
    """
    # initialize the empty lists
    accs = []
    aucs = []
    mccs = []
    apcs = []

    accs_control = []
    aucs_control = []
    mccs_control = []
    apcs_control = []

    threshs = []
    bin_pct = []

    # starting data percentile
    pct = 0.0
    # binning parameters fixed - DO NOT CHANGE
    num_bins = 10
    num_trials = 10
    # sweep through all bin edges
    for bin_edge in keep_bin_edges:

        threshold = bin_edge
        # obtain the X,y matrices
        X, X_control, y = partition_data(df_processed, threshold)
        # starting data percentile
        pct += 1 / num_bins
        for trial in range(num_trials):
            # get the training, testing, and control data-sets
            x_train_idf, y_train, x_test_idf, y_test, x_control_idf = split_transform_data(
                X, X_control, y)
            # fit the classifier
            clf = ComplementNB(alpha=0.1,
                               class_prior=None,
                               fit_prior=True,
                               norm=False)
            clf.fit(x_train_idf, y_train)

            # evaluate on test and control sets
            accs.append(clf.score(x_test_idf, y_test))
            accs_control.append(clf.score(x_control_idf, y))

            y_pred = clf.predict(x_test_idf)
            y_pred_cont = clf.predict(x_control_idf)

            mccs.append(mcc(y_test, y_pred))
            mccs_control.append(mcc(y, y_pred_cont))

            y_proba = clf.predict_proba(x_test_idf)
            y_cont_proba = clf.predict_proba(x_control_idf)

            aucs.append(roc_auc_score(y_test, y_proba[:, 1]))
            aucs_control.append(roc_auc_score(y, y_cont_proba[:, 1]))

            apcs.append(apscore(y_test, y_proba[:, 1]))
            apcs_control.append(apscore(y, y_cont_proba[:, 1]))

            threshs.append(threshold)
            bin_pct.append(pct)

    # populate into a df for downstream analysis
    df_eval = pd.DataFrame()
    df_eval['data percentile'] = bin_pct  # data percentile
    df_eval['threshold'] = threshs  # bin edge
    df_eval['test accuracy'] = accs  # accuracy
    df_eval['test mcc'] = mccs  # matthews correlation coefficient
    df_eval['test auc'] = aucs  # roc-auc
    df_eval['test ap'] = apcs  # average precision
    df_eval['control accuracy'] = accs_control
    df_eval['control mcc'] = mccs_control
    df_eval['control auc'] = aucs_control
    df_eval['control ap'] = apcs_control

    return df_eval
            return (np.nan)

    Confusion['F1'] = Confusion.apply(f1, axis=1)
    Scores = Confusion.loc[:, 'Recall':'F1'].mean(axis=0)
    RecallList.append(Scores[0])
    PrecisionList.append(Scores[1])
    F1List.append(Scores[2])
    """Initiate NB classifier to generate cheap points"""
    model = ComplementNB()
    featureTrain = permuData.loc[InitialIndices, 'X1':'X12'].values
    labelTrain = permuData.loc[InitialIndices, 'Label'].values
    model.fit(featureTrain, labelTrain)
    """Generate Cheap points"""
    ExcludedIndices = np.setdiff1d(np.array(permuData.index), InitialIndices)
    CheapPoint = model.predict(permuData.loc[ExcludedIndices, 'X1':'X12'])
    Prob = model.predict_proba(permuData.loc[ExcludedIndices, 'X1':'X12'])
    permuData.loc[ExcludedIndices, 'PredictLabel'] = CheapPoint
    permuData.loc[ExcludedIndices, 'Plow':'Phigh'] = Prob

    Indicator.append(np.nan)
    """Looping repeat until meet the termination criteria"""
    for i in range(79):

        # Binning cheap points into three categories: HHpoint, Uncertain point, LHpoint
        Uncertain = permuData[(permuData.Plow >= 0.4)
                              & (permuData.Plow <= 0.6)]
        UncertainList.append(len(Uncertain))
        HclassH = permuData[(permuData.PredictLabel == 1)
                            & (permuData.Phigh > 0.6)]
        HclassList.append(len(HclassH))
        LclassH = permuData[(permuData.PredictLabel == 0)
Example #14
class ProbabilisticValidator():
    """
    # The probabilistic validator is a quick to train model used for validating the predictions of our main model
    # It is fit to the results our model gets on the validation set
    """
    _smoothing_factor = 0.5 # TODO: Autodetermine smoothing factor depending on the info we know about the dataset
    _value_bucket_probabilities = {}
    _probabilistic_model = None
    X_buff = None
    Y_buff = None


    def __init__(self, col_stats, data_type=None):
        """
        Choose the algorithm to use for the rest of the model
        As of right now we go with ComplementNB
        """
        # <--- Pick one of the 3
        self._probabilistic_model = ComplementNB(alpha=self._smoothing_factor)
        #, class_prior=[0.5,0.5]
        #self._probabilistic_model = GaussianNB(var_smoothing=1)
        #self._probabilistic_model = MultinomialNB(alpha=self._smoothing_factor)
        self.X_buff = []
        self.Y_buff = []

        self.col_stats = col_stats

        if 'percentage_buckets' in col_stats:
            self.buckets = col_stats['percentage_buckets']
            self.bucket_keys = [i for i in range(len(self.buckets))]
        else:
            self.buckets = None

        self.data_type = col_stats['data_type']

        self.bucket_accuracy = {}

    def register_observation(self, features_existence, real_value, predicted_value):
        """
        # Register an observation in the validator's internal buffers

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param real_value: The real value/label for this prediction
        :param predicted_value: The predicted value/label
        :param histogram: The histogram for the predicted column, which allows us to bucketize the `predicted_value` and `real_value`
        """
        nr_missing_features = len([x for x in features_existence if x in (False, 0)])

        predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(predicted_value)
        try:
            real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(str(real_value).replace(',','.'))
        except:
            real_value = None

        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
            real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = X + features_existence
            self.X_buff.append(X)
            self.Y_buff.append(real_value_b)

            # If no column is ignored, compute the accuracy for this bucket
            if nr_missing_features == 0:
                if predicted_value_b not in self.bucket_accuracy:
                    self.bucket_accuracy[predicted_value_b] = []
                self.bucket_accuracy[predicted_value_b].append(int(real_value_b == predicted_value_b))
        else:
            predicted_value_b = predicted_value
            real_value_b = real_value
            self.X_buff.append(features_existence)
            self.Y_buff.append(real_value_b == predicted_value_b)

    def get_accuracy_histogram(self):
        x = []
        y = []

        total_correct = 0
        total_vals = 0

        for bucket in self.bucket_accuracy:
            total_correct += sum(self.bucket_accuracy[bucket])
            total_vals += len(self.bucket_accuracy[bucket])
            x.append(bucket)
            y.append(sum(self.bucket_accuracy[bucket])/len(self.bucket_accuracy[bucket]))

        validation_set_accuracy = total_correct/total_vals
        return {
            'buckets': x
            ,'accuracies': y
        }, validation_set_accuracy


    def partial_fit(self):
        """
        # Fit the probabilistic validator on all observations recorded that haven't been taken into account yet
        """
        log_types = np.seterr()
        np.seterr(divide='ignore')

        if self.buckets is not None:
            self._probabilistic_model.partial_fit(self.X_buff, self.Y_buff, classes=self.bucket_keys)
        else:
            self._probabilistic_model.partial_fit(self.X_buff, self.Y_buff, classes=[True, False])

        np.seterr(divide=log_types['divide'])

        self.X_buff= []
        self.Y_buff= []

    def fit(self):
        """
        # Fit the probabilistic validator on all observations recorded that haven't been taken into account yet
        """
        log_types = np.seterr()
        np.seterr(divide='ignore')
        self._probabilistic_model.fit(self.X_buff, self.Y_buff)
        np.seterr(divide=log_types['divide'])

        self.X_buff= []
        self.Y_buff= []

    def evaluate_prediction_accuracy(self, features_existence, predicted_value):
        """
        # Fit the probabilistic validator on an observation
        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param predicted_value: The predicted value/label
        :return: The probability (from 0 to 1) of our prediction being accurate (within the same histogram bucket as the real value)
        """
        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = [X + features_existence]
        else:
            X = [features_existence]

        #X = [[predicted_value_b, *features_existence]]
        log_types = np.seterr()
        np.seterr(divide='ignore')
        distribution = self._probabilistic_model.predict_proba(np.array(X))
        np.seterr(divide=log_types['divide'])

        if self.buckets is not None:
            return ProbabilityEvaluation(self.buckets, distribution[0].tolist(), predicted_value).most_likely_probability
        else:
            return distribution[0][1]
Example #15
def code(inputs, clsf, lng, level):

    # UPDATE: 08 September 2020 --------------------------------------------
    # Irina demanded that feedbacks should have priority in coding
    # to allow "faster learning" (which is basically not true technically)
    # this PATCH is just to avoid further explanations
    feedback_outputs = code_from_feedbacks(inputs, clsf, lng, level)

    # ---------------------------------------------------------------------

    # keep original entry without modifications
    # needed for nltk dict fun later
    inputs_original_lng = inputs

    # training data file and training data
    # detect in which language the data should be

    try:
        rules = CodingRules.objects.get(classification=clsf)
        max_level = rules.max_level

        # language of training data
        languages = json.loads(rules.languages)

        if 'any' in languages:
            td_file_lng = languages['any']
        else:
            td_file_lng = languages[lng]

        # transcode from another classification after coding
        # or use training data available for original
        if rules.recode_from != "this":
            later_trans_to = clsf
            clsf = rules.recode_from

        # cannot go deeper than max_level
        if level > max_level:
            level = max_level

        classification = Classification.objects.get(reference=clsf)
    except:
        # no given rule defined
        # this will result in an error
        return []

    # list of codes corresponding to classification (in try)
    codes = Code.objects.filter(parent=classification)

    # loading only data that is in the requested language
    tdf = TrainingDataFile.objects.filter(classification=clsf,
                                          language=td_file_lng)

    # well, it's possible that there is no data
    # for the selected classification scheme :)
    if len(tdf) == 0:
        return []

    # well, this is the funny part
    # if the lng of the inputs is not equal to the td_file_lng
    # defined in Coding_Rules, the inputs must be
    # translated to td_file_lng
    # this is how we avoid coding for some language
    # failing because of a lack of data
    if lng != td_file_lng:

        from_lng = lng
        if lng == 'ge':
            from_lng = 'de'

        translator = translate.Translator(from_lang=from_lng,
                                          to_lang=td_file_lng)

        for i in range(0, len(inputs)):
            inputs[i] = translator.translate(inputs[i])

    # 1. tokenization
    # 2. clean stop words
    # 3. lemmatize
    # all that defined in function prepare_input
    # that takes inputs and lng args provided here
    inputs = prepare_input(inputs, td_file_lng)

    # vectorizer to transform data into tf-idf
    tf = TfidfVectorizer(analyzer="word",
                         ngram_range=(1, 2),
                         min_df=0,
                         sublinear_tf=True)

    # filter data
    train = TrainingData.objects.filter(parent__in=tdf, level=level)
    train_text = [unidecode(t.text) for t in train]
    train_codes = [t.code for t in train]

    # training data collected through feedbacks
    feedbacks = Feedback.objects.filter(classification=clsf,
                                        language=td_file_lng,
                                        level=level)
    train_text = train_text + [fb.text for fb in feedbacks]
    train_codes = train_codes + [fb.code for fb in feedbacks]

    X = tf.fit_transform(train_text)

    # finally, model
    # complement naive bayes
    model = ComplementNB()
    model.fit(X, train_codes)
    inputs = [unidecode(i) for i in inputs]
    inputs_tf = tf.transform(inputs)
    output = model.predict(inputs_tf)

    # get probabilities of input belonging to any class
    # append other likely predictions
    # then if necessary run dictionaries
    # dictionary only if max(prob)-min(prob) == 0
    probs = model.predict_proba(inputs_tf)
    classes = model.classes_

    inputs_retrial = extend_inputs_dict(inputs_original_lng, probs, lng,
                                        td_file_lng)

    if len(inputs_retrial) > 0:
        inputs_retrial = [unidecode(i) for i in inputs_retrial]
        inputs_retrial_tf = tf.transform(inputs_retrial)

        # now return predictions for those for which prob == 0
        outputs_retrial = model.predict(inputs_retrial_tf)
        outputs_retrial = outputs_retrial.tolist()

        # here we remove those with probs == 0
        # if even dictionary has zero prob then no need to keep false code
        probs_retrial = model.predict_proba(inputs_retrial_tf)
        for i in range(0, len(probs_retrial)):
            dif = max(probs_retrial[i]) - min(probs_retrial[i])

            if dif == 0:
                outputs_retrial[i] = None

        for i in range(0, len(probs)):
            dif = max(probs[i]) - min(probs[i])

            if dif == 0:
                output[i] = outputs_retrial.pop(0)

    # now if the training dataset was not
    # available for the classification against
    # which the data was coded
    # we must transcode it to that classification
    if rules.recode_from != 'this':
        try:
            crosswalk_file = CrosswalkFile.objects.get(
                classification_1=clsf, classification_2=later_trans_to)
            crosswalk = Crosswalk.objects.filter(parent=crosswalk_file)
        except:
            return []

        # recode
        re_outputs = []

        for code in output:
            recodes = crosswalk.filter(code_1=code)
            recodes = [recode.code_2 for recode in recodes]
            re_outputs.append(recodes)

        # add results obtained through coding from feedbacks (only! -> PATCH)
        for i in range(0, len(re_outputs)):
            re_outputs[i] = feedback_outputs[i] + re_outputs[i]
        return re_outputs

    output = [[out] for out in output]
    for i in range(0, len(output)):
        output[i] = feedback_outputs[i] + output[i]

    return output
def bayesClassify(trainX, trainY, testX):
    nbCLF = ComplementNB(alpha=44.5, norm=False)
    nbCLF.fit(np.absolute(trainX), trainY)
    return nbCLF.predict(testX), nbCLF.predict_proba(testX)
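A toy call to bayesClassify; the random non-negative counts are invented purely to illustrate the shapes involved.
# hypothetical toy data; ComplementNB expects non-negative features
import numpy as np
from sklearn.naive_bayes import ComplementNB

rng = np.random.default_rng(0)
trainX = rng.integers(0, 5, size=(20, 6))
trainY = rng.integers(0, 2, size=20)
testX = rng.integers(0, 5, size=(5, 6))
preds, probas = bayesClassify(trainX, trainY, testX)
print(preds, probas.shape)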
Example #17
def complement_bayes(x_train, x_test, y_train, y_test, x_train_1, y_train_1, X,
                     fl, amostra_paci3, fl_a3, nome):
    amostra_paci3_n = amostra_paci3.copy()
    x_train = preprocessing.normalize(x_train)
    x_train_1 = preprocessing.normalize(x_train_1)
    x_test = preprocessing.normalize(x_test)

    Complement = ComplementNB()
    Complement.fit(x_train, y_train)
    print(Complement.fit(x_train, y_train).feature_log_prob_)
    pred = Complement.predict_proba(x_train)
    amostra_ = Complement.predict_proba(amostra_paci3)
    amostra_2 = Complement.predict(amostra_paci3)
    amostra_paci3['result'] = 0
    amostra_paci3['probls'] = 0
    # keep only the positive-class probability; predict_proba returns one column per class
    amostra_paci3['probls'] = amostra_[:, 1]
    amostra_paci3['result'] = amostra_2
    amostra_paci3['fl_severidade'] = fl_a3
    amostra_paci3.to_csv('modelo_complement_bayes.csv')
    print('Treinamento AUC-ROC:{}'.format(roc_auc_score(y_train, pred[:, 1])))
    pred_2 = Complement.predict_proba(x_test)
    print('Validacao AUC-ROC:{}'.format(roc_auc_score(y_test, pred_2[:, 1])))
    yhat = Complement.predict_proba(x_train)
    yhat = yhat[:, 1]
    print(pd.crosstab(y_train, Complement.predict(x_train)))
    print(classification_report(y_train, Complement.predict(x_train)))
    print('AUC: %0.2f' % roc_auc_score(y_train, yhat))
    plot_roc_curve(y_train, yhat, 'naive_train')
    yhat = Complement.predict_proba(x_test)
    yhat = yhat[:, 1]
    print(pd.crosstab(y_test, Complement.predict(x_test)))
    print(classification_report(y_test, Complement.predict(x_test)))
    print('AUC: %0.2f' % roc_auc_score(y_test, yhat))
    plot_roc_curve(y_test, yhat, 'naive_test')

    print('verificação com down em treino')
    print()
    Complement = ComplementNB()
    Complement.fit(x_train_1, y_train_1)
    print(Complement.fit(x_train_1, y_train_1).feature_log_prob_)
    pred = Complement.predict_proba(x_train_1)
    amostra_ = Complement.predict_proba(amostra_paci3_n)
    amostra_2 = Complement.predict(amostra_paci3_n)
    amostra_paci3_n['result'] = 0
    amostra_paci3_n['probls'] = 0
    # keep only the positive-class probability
    amostra_paci3_n['probls'] = amostra_[:, 1]
    amostra_paci3_n['result'] = amostra_2
    amostra_paci3_n['fl_severidade'] = fl_a3
    amostra_paci3_n.to_csv('modelo_complement_bayes_1.csv')
    print('Treinamento AUC-ROC:{}'.format(roc_auc_score(y_train_1, pred[:,
                                                                        1])))
    pred_2 = Complement.predict_proba(x_test)
    print('Validacao AUC-ROC:{}'.format(roc_auc_score(y_test, pred_2[:, 1])))
    yhat = Complement.predict_proba(x_train_1)
    yhat = yhat[:, 1]
    print(pd.crosstab(y_train_1, Complement.predict(x_train_1)))
    print(classification_report(y_train_1, Complement.predict(x_train_1)))
    print('AUC: %0.2f' % roc_auc_score(y_train_1, yhat))
    plot_roc_curve(y_train_1, yhat, 'naive_train_1')
    yhat = Complement.predict_proba(x_test)
    yhat = yhat[:, 1]
    print(pd.crosstab(y_test, Complement.predict(x_test)))
    print(classification_report(y_test, Complement.predict(x_test)))
    print('AUC: %0.2f' % roc_auc_score(y_test, yhat))
    plot_roc_curve(y_test, yhat, 'naive_test_1')
# describes info about train and test set
print("Number of rows/columns in X_test dataset: ", X_test.shape)
print("Number of rows/columns in y_test dataset: ", y_test.shape)
print("Number of rows/columns in X_train dataset: ", X_train.shape)
print("Number of rows/columns in y_train dataset: ", y_train.shape)

# ## Fit the model

# In[22]:

# class sklearn.naive_bayes.ComplementNB(alpha=1.0, fit_prior=True, class_prior=None, norm=False)

NB = ComplementNB()
NB.fit(X_train, y_train)
y_pred = NB.predict(X_test)
probs = NB.predict_proba(X_test)
probs = probs[:, 1]

# ## Print the accuracy reports and confusion matrix

# In[23]:

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
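# probs[:, 1] is computed above but not used in this excerpt; a small, hedged sketch of
# the ROC-AUC it would typically feed (assuming binary labels in y_test):
from sklearn.metrics import roc_auc_score

print('ROC-AUC:', roc_auc_score(y_test, probs))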

# # Dealing with unbalanced data:
#
# ## The data is unbalanced, indicated by two things:
accuracy_MNB_14 = metrics.accuracy_score(
    Y_test, BNB_model_ti_14.predict(x_test_vec_tfidf_14))

MNB_model_ti_16 = MultinomialNB().fit(x_train_vec_tfidf_16, Y_train)
preds_MNB_tfidf_16 = MNB_model_ti_16.predict_proba(x_test_vec_tfidf_16)
accuracy_MNB_16 = metrics.accuracy_score(
    Y_test, BNB_model_ti_16.predict(x_test_vec_tfidf_16))

MNB_model_ti_610 = MultinomialNB().fit(x_train_vec_tfidf_610, Y_train)
preds_MNB_tfidf_610 = MNB_model_ti_610.predict_proba(x_test_vec_tfidf_610)
accuracy_MNB_610 = metrics.accuracy_score(
    Y_test, BNB_model_ti_610.predict(x_test_vec_tfidf_610))

# Complement Naive Bayes models
CNB_model_ti_12 = ComplementNB().fit(x_train_vec_tfidf_12, Y_train)
preds_CNB_tfidf_12 = CNB_model_ti_12.predict_proba(x_test_vec_tfidf_12)
accuracy_CNB_12 = metrics.accuracy_score(
    Y_test, CNB_model_ti_12.predict(x_test_vec_tfidf_12))

CNB_model_ti_14 = ComplementNB().fit(x_train_vec_tfidf_14, Y_train)
preds_CNB_tfidf_14 = CNB_model_ti_14.predict_proba(x_test_vec_tfidf_14)
accuracy_CNB_14 = metrics.accuracy_score(
    Y_test, CNB_model_ti_14.predict(x_test_vec_tfidf_14))

CNB_model_ti_16 = ComplementNB().fit(x_train_vec_tfidf_16, Y_train)
preds_CNB_tfidf_16 = CNB_model_ti_16.predict_proba(x_test_vec_tfidf_16)
accuracy_CNB_16 = metrics.accuracy_score(
    Y_test, CNB_model_ti_16.predict(x_test_vec_tfidf_16))

CNB_model_ti_610 = ComplementNB().fit(x_train_vec_tfidf_610, Y_train)
preds_CNB_tfidf_610 = CNB_model_ti_610.predict_proba(x_test_vec_tfidf_610)