コード例 #1
0
ファイル: logreg.py プロジェクト: tiffen/kaggle-insults
class LogReg(object):
    '''
        Logistic-regression text classifier whose scores are mapped onto
        the range [0, 1].
    '''
    def __init__(self, texts, classes, nlpdict=None, scale=1, C=1.0):
        '''
            Prepare the feature dictionary and fit the model.

            texts   -- training texts, turned into vectors by the dictionary
            classes -- labels handed to ``fit`` together with the vectors
            nlpdict -- dictionary to reuse; a falsy value means a new
                       NLPDict is built from ``texts``
            scale   -- factor applied to the scores in ``classify``
            C       -- forwarded to LogisticRegression
        '''
        self.scale = scale
        self.l = LogisticRegression(
            penalty='l2', dual=True, C=C, class_weight='auto')
        self.dictionary = nlpdict or NLPDict(texts=texts)
        self.l.fit(self.dictionary.feature_vectors(texts), classes)

    def classify(self, texts):
        '''
            Score a list of texts.

            For every row ``p`` of ``predict_proba`` the score is
            ``(p[1] - p[0] + 1) / 2``, multiplied by ``self.scale`` and
            then limited to [0, 1].  Returns a numpy array.
        '''
        features = self.dictionary.feature_vectors(texts)
        probabilities = self.l.predict_proba(features)
        margins = np.array([row[1] - row[0] for row in probabilities])
        scores = (margins + 1) / 2
        scores *= self.scale
        # Scaling can push values outside the unit interval.
        return np.clip(scores, 0, 1)
コード例 #2
0
ファイル: logreg.py プロジェクト: ANB2/kaggle-insults
class LogReg(object):
    '''
        Text classifier built on logistic regression; ``classify`` yields
        scores between 0 and 1.
    '''

    def __init__(self, texts, classes, nlpdict=None, scale=1, C=1.0):
        '''
            Fit the model on ``texts``/``classes``.

            nlpdict -- dictionary to reuse when truthy, otherwise an
                       NLPDict is created from ``texts``
            scale   -- multiplier used later by ``classify``
            C       -- passed through to LogisticRegression
        '''
        self.scale = scale
        self.l = LogisticRegression(
            penalty='l2', dual=True, C=C, class_weight='auto')
        self.dictionary = nlpdict if nlpdict else NLPDict(texts=texts)
        training_vectors = self.dictionary.feature_vectors(texts)
        self.l.fit(training_vectors, classes)

    def classify(self, texts):
        '''
            Classify a list of texts.

            Takes the difference between the second and first column of
            ``predict_proba``, shifts it by one and halves it, applies
            ``self.scale`` and clamps the result to [0, 1].
        '''
        proba = self.l.predict_proba(self.dictionary.feature_vectors(texts))
        result = np.array([second_minus_first[1] - second_minus_first[0]
                           for second_minus_first in proba])
        result = (result + 1) / 2
        result *= self.scale
        return np.clip(result, 0, 1)
コード例 #3
0
ファイル: naive.py プロジェクト: tiffen/kaggle-insults
 def __init__(self, texts, classes, nlpdict=None):
     '''
         Fit a MultinomialNB model on the training texts.

         texts   -- training texts, vectorised by the dictionary
         classes -- labels passed to ``fit``
         nlpdict -- dictionary to reuse when truthy; otherwise an NLPDict
                    is built from ``texts``
     '''
     self.dictionary = nlpdict or NLPDict(texts=texts)
     training_vectors = self.dictionary.feature_vectors(texts)
     self.nb = MultinomialNB()
     self.nb.fit(training_vectors, classes)
コード例 #4
0
ファイル: svm.py プロジェクト: tiffen/kaggle-insults
 def __init__(self, texts, classes, nlpdict=None):
     '''
         Create the LinearSVC model, pick the dictionary and train.

         nlpdict -- dictionary to reuse when truthy; otherwise an NLPDict
                    is built from ``texts``
     '''
     # TODO: add list of smileys to texts/classes
     self.svm = svm.LinearSVC(C=1000, class_weight='auto')
     self.dictionary = nlpdict if nlpdict else NLPDict(texts=texts)
     self._train(texts, classes)
コード例 #5
0
 def __init__(self, texts, classes, nlpdict):
     '''
         Create the SGD classifier, pick the dictionary and train.

         nlpdict -- dictionary to reuse when truthy; otherwise an NLPDict
                    is built from ``texts``
     '''
     # TODO: add list of smileys to texts/classes
     self.s = SGDClassifier(
         loss='hinge', penalty='l1', shuffle=True, class_weight='auto')
     self.dictionary = nlpdict or NLPDict(texts=texts)
     self._train(texts, classes)
コード例 #6
0
ファイル: dictknn.py プロジェクト: tiffen/kaggle-insults
 def __init__(self, texts, classes):
     '''
         Fit a k-nearest-neighbours model on hand-built feature vectors.

         texts   -- training texts; ``self._build_vector`` is called once
                    per text
         classes -- labels passed to ``fit`` together with the vectors
     '''
     # TODO: add list of smileys to texts/classes
     self.knn = KNeighborsClassifier(n_neighbors=7, weights='distance')
     self.dictionary = NLPDict(texts=texts)

     # Stack all rows in a single call: growing the matrix with one
     # vstack per text re-copies it on every iteration, and ``xrange``
     # only exists on Python 2.
     vectors = np.vstack([self._build_vector(text) for text in texts])

     self.knn.fit(vectors, classes)
コード例 #7
0
ファイル: logreg.py プロジェクト: tiffen/kaggle-insults
 def __init__(self, texts, classes, nlpdict=None, scale=1, C=1.0):
     '''
         Prepare the feature dictionary and fit the logistic regression.

         nlpdict -- dictionary to reuse when truthy; otherwise an NLPDict
                    is built from ``texts``
         scale   -- stored on the instance as ``self.scale``
         C       -- forwarded to LogisticRegression
     '''
     self.scale = scale
     self.l = LogisticRegression(
         penalty='l2', dual=True, C=C, class_weight='auto')
     self.dictionary = nlpdict or NLPDict(texts=texts)
     self.l.fit(self.dictionary.feature_vectors(texts), classes)
コード例 #8
0
    def __init__(self, texts, classes, params, models):
        '''
            Train the base models on cross-validation folds and fit a
            LinearSVC on their held-out scores plus hand-built features.

            texts   -- training texts
            classes -- training labels
            params  -- stored as ``self.params``; not used otherwise here
            models  -- iterable of (model_class, extra_args) pairs; every
                       class is called as
                       ``model_class(texts, classes, *extra_args)`` and
                       its ``classify(texts)`` result is used as a feature
        '''
        self.models = models
        self.params = params

        self.texts = texts
        self.classes = classes
        self.dictionary = NLPDict(texts=texts)

        classes = np.array(classes)
        texts = np.array(texts)

        fold_inputs = []
        # Starting from an empty float array keeps the dtype promotion of
        # an incremental concatenation that begins with ``np.array([])``.
        fold_labels = [np.array([])]

        # Parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print('get training data')
        # get some training data
        for train, test in cross_validation.StratifiedKFold(classes, 3):
            texts_train = texts[train]
            classes_train = classes[train]
            texts_test = texts[test]
            classes_test = classes[test]

            # One row of scores per base model; the loop variable is not
            # called ``params`` so the argument is no longer shadowed.
            score_rows = []
            for model, model_args in self.models:
                m = model(texts_train, classes_train, *model_args)
                score_rows.append(
                    np.array(m.classify(texts_test))[np.newaxis])
            net_inputs_batch = np.vstack(score_rows).T

            # Single vstack instead of one per text (avoids repeated
            # copying and the Python-2-only ``xrange``).
            vectors = np.vstack(
                [self._build_vector(text) for text in texts_test])
            fold_inputs.append(np.hstack((net_inputs_batch, vectors)))
            fold_labels.append(classes_test)

        net_inputs = np.vstack(fold_inputs)
        expected_outputs = np.concatenate(fold_labels, axis=0)

        self.svm = svm.LinearSVC(C=1, class_weight='auto')

        print('train network')
        # train network
        self.svm.fit(net_inputs, expected_outputs)
コード例 #9
0
    def __init__(self, texts, classes, nlpdict=None):
        '''
            Fit a random forest on hand-built feature vectors.

            texts   -- training texts; ``self._build_vector`` is called
                       once per text
            classes -- labels passed to ``fit`` together with the vectors
            nlpdict -- dictionary to reuse when truthy; otherwise an
                       NLPDict is built from ``texts``
        '''
        # TODO: add list of smileys to texts/classes
        self.rfc = RandomForestClassifier(n_estimators=50)
        if nlpdict:
            self.dictionary = nlpdict
        else:
            self.dictionary = NLPDict(texts=texts)

        # Stack all rows in a single call: one vstack per text re-copies
        # the matrix every iteration, and ``xrange`` is Python-2-only.
        vectors = np.vstack([self._build_vector(text) for text in texts])

        self.rfc.fit(vectors, classes)
コード例 #10
0
ファイル: dictgbc.py プロジェクト: tiffen/kaggle-insults
    def __init__(self, texts, classes, nlpdict=None):
        '''
            Fit a gradient-boosting model on hand-built feature vectors.

            texts   -- training texts; ``self._build_vector`` is called
                       once per text
            classes -- labels passed to ``fit`` together with the vectors
            nlpdict -- dictionary to reuse when truthy; otherwise an
                       NLPDict is built from ``texts``
        '''
        # TODO: add list of smileys to texts/classes
        self.gbc = GradientBoostingClassifier(max_depth=3, learn_rate=0.1)
        if nlpdict:
            self.dictionary = nlpdict
        else:
            self.dictionary = NLPDict(texts=texts)

        # Stack all rows in a single call: one vstack per text re-copies
        # the matrix every iteration, and ``xrange`` is Python-2-only.
        vectors = np.vstack([self._build_vector(text) for text in texts])

        self.gbc.fit(vectors, classes)
コード例 #11
0
ファイル: svm.py プロジェクト: shobhit6993/kaggle-insults
 def __init__(self, texts, classes, nlpdict=None):
     '''
         Set up the LinearSVC model and dictionary, then train.

         nlpdict -- dictionary to reuse when truthy; otherwise an NLPDict
                    is built from ``texts``
     '''
     # TODO: add list of smileys to texts/classes
     self.svm = svm.LinearSVC(C=1000, class_weight='auto')
     self.dictionary = nlpdict or NLPDict(texts=texts)
     self._train(texts, classes)
コード例 #12
0
ファイル: naive.py プロジェクト: ANB2/kaggle-insults
 def __init__(self, texts, classes, nlpdict=None):
     '''
         Vectorise the training texts and fit a MultinomialNB model.

         nlpdict -- dictionary to reuse when truthy; otherwise an NLPDict
                    is built from ``texts``
     '''
     self.dictionary = nlpdict if nlpdict else NLPDict(texts=texts)
     features = self.dictionary.feature_vectors(texts)
     self.nb = MultinomialNB()
     self.nb.fit(features, classes)
コード例 #13
0
ファイル: sgdc.py プロジェクト: alabarga/kaggle-insults
 def __init__(self, texts, classes, nlpdict):
     '''
         Set up the SGD classifier and dictionary, then train.

         nlpdict -- dictionary to reuse when truthy; otherwise an NLPDict
                    is built from ``texts``
     '''
     # TODO: add list of smileys to texts/classes
     self.s = SGDClassifier(
         loss="hinge", penalty="l1", shuffle=True, class_weight="auto")
     self.dictionary = nlpdict or NLPDict(texts=texts)
     self._train(texts, classes)
コード例 #14
0
ファイル: logreg.py プロジェクト: ANB2/kaggle-insults
 def __init__(self, texts, classes, nlpdict=None, scale=1, C=1.0):
     '''
         Fit the logistic regression on the vectorised training texts.

         nlpdict -- dictionary to reuse when truthy; otherwise an NLPDict
                    is built from ``texts``
         scale   -- kept as ``self.scale``
         C       -- passed through to LogisticRegression
     '''
     self.scale = scale
     self.l = LogisticRegression(
         penalty='l2', dual=True, C=C, class_weight='auto')
     self.dictionary = nlpdict if nlpdict else NLPDict(texts=texts)
     training_vectors = self.dictionary.feature_vectors(texts)
     self.l.fit(training_vectors, classes)
コード例 #15
0
class RFC(object):
    '''
        Text classifier returning scores in [0, 1].

        NOTE(review): despite the class name, the estimator created here
        is MultinomialNB (stored as ``self.nb``) -- confirm this is
        intended.
    '''
    def __init__(self, texts, classes):
        '''
            Build an NLPDict from ``texts`` and fit the model on the
            resulting feature vectors and ``classes``.
        '''
        self.dictionary = NLPDict(texts=texts)
        training_vectors = self.dictionary.feature_vectors(texts)
        self.nb = MultinomialNB()
        self.nb.fit(training_vectors, classes)

    def classify(self, texts):
        '''
            Score a list of texts.

            For every row ``p`` of ``predict_proba`` the score is
            ``(p[1] - p[0] + 1) / 2``, limited to [0, 1]; no additional
            scaling is applied.  Returns a numpy array.
        '''
        probabilities = self.nb.predict_proba(
            self.dictionary.feature_vectors(texts))
        margins = np.array([row[1] - row[0] for row in probabilities])
        return np.clip((margins + 1) / 2, 0, 1)
コード例 #16
0
ファイル: rfc.py プロジェクト: ANB2/kaggle-insults
class RFC(object):
    '''
        Classifier producing scores between 0 and 1 for texts.

        NOTE(review): the class is called RFC but the model it fits is a
        MultinomialNB kept in ``self.nb`` -- confirm this is intended.
    '''

    def __init__(self, texts, classes):
        '''
            Create the dictionary from ``texts`` and fit the model.
        '''
        self.dictionary = NLPDict(texts=texts)
        features = self.dictionary.feature_vectors(texts)
        self.nb = MultinomialNB()
        self.nb.fit(features, classes)

    def classify(self, texts):
        '''
            Return one score per row of ``predict_proba``: the difference
            between the second and first column, shifted by one, halved
            and clamped to [0, 1].
        '''
        features = self.dictionary.feature_vectors(texts)
        proba = self.nb.predict_proba(features)
        scores = np.array([p[1] - p[0] for p in proba])
        scores = (scores + 1) / 2
        return np.clip(scores, 0, 1)
コード例 #17
0
ファイル: sgdc.py プロジェクト: alabarga/kaggle-insults
class SGDC(object):
    '''
        SGD-trained linear classifier whose decision values are mapped
        onto [0, 1].
    '''
    def __init__(self, texts, classes, nlpdict):
        '''
            Create the classifier, pick the dictionary and train.

            nlpdict -- dictionary to reuse when truthy; otherwise an
                       NLPDict is built from ``texts``
        '''
        # TODO: add list of smileys to texts/classes
        self.s = SGDClassifier(
            loss="hinge", penalty="l1", shuffle=True, class_weight="auto")
        self.dictionary = nlpdict or NLPDict(texts=texts)
        self._train(texts, classes)

    def _train(self, texts, classes):
        '''
            Fit the classifier on the feature vectors of ``texts``.
        '''
        self.s.fit(self.dictionary.feature_vectors(texts), classes)

    def classify(self, texts):
        '''
            Return ``decision_function / 20 + 0.5`` for the given texts,
            clamped to [0, 1].
        '''
        features = self.dictionary.feature_vectors(texts)
        scores = self.s.decision_function(features) / 20 + 0.5
        return np.clip(scores, 0, 1)
コード例 #18
0
class SGDC(object):
    '''
        Linear classifier trained with SGD; ``classify`` returns scores
        between 0 and 1.
    '''
    def __init__(self, texts, classes, nlpdict):
        '''
            Set up the classifier and dictionary, then train.

            nlpdict -- dictionary to reuse when truthy; otherwise an
                       NLPDict is built from ``texts``
        '''
        # TODO: add list of smileys to texts/classes
        self.s = SGDClassifier(
            loss='hinge', penalty='l1', shuffle=True, class_weight='auto')
        self.dictionary = nlpdict if nlpdict else NLPDict(texts=texts)
        self._train(texts, classes)

    def _train(self, texts, classes):
        '''
            Vectorise ``texts`` and fit the classifier.
        '''
        training_vectors = self.dictionary.feature_vectors(texts)
        self.s.fit(training_vectors, classes)

    def classify(self, texts):
        '''
            Divide the decision values by 20, add 0.5 and clamp the result
            to [0, 1].
        '''
        decision = self.s.decision_function(
            self.dictionary.feature_vectors(texts))
        rescaled = decision / 20 + 0.5
        return np.clip(rescaled, 0, 1)
コード例 #19
0
ファイル: svm.py プロジェクト: alabarga/kaggle-insults
class SVM(object):
    '''
        Linear SVM text classifier whose decision values are mapped onto
        [0, 1].
    '''
    def __init__(self, texts, classes, nlpdict=None):
        '''
            Create the LinearSVC model, pick the dictionary and train.

            nlpdict -- dictionary to reuse when truthy; otherwise an
                       NLPDict is built from ``texts``
        '''
        # TODO: add list of smileys to texts/classes
        self.svm = svm.LinearSVC(C=1000, class_weight="auto")
        if nlpdict:
            self.dictionary = nlpdict
        else:
            self.dictionary = NLPDict(texts=texts)
        self._train(texts, classes)

    def _train(self, texts, classes):
        '''
            Fit the model on the feature vectors of ``texts``.
        '''
        vectors = self.dictionary.feature_vectors(texts)
        self.svm.fit(vectors, classes)

    def classify(self, texts):
        '''
            Return ``decision_function / 2 + 0.5`` per text, clamped to
            [0, 1], as a numpy array.

            Accepts both a column-shaped (n, 1) and a flat (n,) result
            from ``decision_function``.
        '''
        vectors = self.dictionary.feature_vectors(texts)
        predictions = np.asarray(self.svm.decision_function(vectors))
        # Only a 2-D result needs its first column extracted; indexing a
        # flat result with [0] would keep just the first sample's value.
        if predictions.ndim > 1:
            predictions = np.transpose(predictions)[0]
        return np.clip(predictions / 2.0 + 0.5, 0, 1)
コード例 #20
0
ファイル: svm.py プロジェクト: shobhit6993/kaggle-insults
class SVM(object):
    '''
        Linear SVM text classifier whose decision values are mapped onto
        [0, 1].
    '''

    def __init__(self, texts, classes, nlpdict=None):
        '''
            Create the LinearSVC model, pick the dictionary and train.

            nlpdict -- dictionary to reuse when truthy; otherwise an
                       NLPDict is built from ``texts``
        '''
        # TODO: add list of smileys to texts/classes
        self.svm = svm.LinearSVC(C=1000, class_weight='auto')
        if nlpdict:
            self.dictionary = nlpdict
        else:
            self.dictionary = NLPDict(texts=texts)
        self._train(texts, classes)

    def _train(self, texts, classes):
        '''
            Fit the model on the feature vectors of ``texts``.
        '''
        vectors = self.dictionary.feature_vectors(texts)
        self.svm.fit(vectors, classes)

    def classify(self, texts):
        '''
            Return a list with ``decision_function / 2 + 0.5`` per text,
            where values above 1 become 1 and values below 0 become 0.
        '''
        vectors = self.dictionary.feature_vectors(texts)
        predictions = self.svm.decision_function(vectors)
        predictions = np.transpose(predictions)
        predictions = predictions / 2 + 0.5
        # A comprehension yields a real list on Python 2 and Python 3
        # alike; ``map`` would return a lazy iterator on Python 3.
        return [1 if x > 1 else (0 if x < 0 else x) for x in predictions]
コード例 #21
0
ファイル: svm.py プロジェクト: tiffen/kaggle-insults
class SVM(object):
    '''
        Linear SVM text classifier whose decision values are mapped onto
        [0, 1].
    '''
    def __init__(self, texts, classes, nlpdict=None):
        '''
            Create the LinearSVC model, pick the dictionary and train.

            nlpdict -- dictionary to reuse when truthy; otherwise an
                       NLPDict is built from ``texts``
        '''
        # TODO: add list of smileys to texts/classes
        self.svm = svm.LinearSVC(C=1000, class_weight='auto')
        if nlpdict:
            self.dictionary = nlpdict
        else:
            self.dictionary = NLPDict(texts=texts)
        self._train(texts, classes)

    def _train(self, texts, classes):
        '''
            Fit the model on the feature vectors of ``texts``.
        '''
        vectors = self.dictionary.feature_vectors(texts)
        self.svm.fit(vectors, classes)

    def classify(self, texts):
        '''
            Return ``decision_function / 2 + 0.5`` per text, clamped to
            [0, 1], as a numpy array.

            Accepts both a column-shaped (n, 1) and a flat (n,) result
            from ``decision_function``.
        '''
        vectors = self.dictionary.feature_vectors(texts)
        predictions = np.asarray(self.svm.decision_function(vectors))
        # Only a 2-D result needs its first column extracted; indexing a
        # flat result with [0] would keep just the first sample's value.
        if predictions.ndim > 1:
            predictions = np.transpose(predictions)[0]
        return np.clip(predictions / 2.0 + 0.5, 0, 1)
コード例 #22
0
ファイル: dictgbc.py プロジェクト: ANB2/kaggle-insults
 def __init__(self, texts, classes, nlpdict=None):
     '''
         Fit a gradient-boosting model on hand-built feature vectors.

         texts   -- training texts; ``self._build_vector`` is called once
                    per text
         classes -- labels passed to ``fit`` together with the vectors
         nlpdict -- dictionary to reuse when truthy; otherwise an NLPDict
                    is built from ``texts``
     '''
     # TODO: add list of smileys to texts/classes
     self.gbc = GradientBoostingClassifier(max_depth=3, learn_rate=0.1)
     if nlpdict:
         self.dictionary = nlpdict
     else:
         self.dictionary = NLPDict(texts=texts)

     # Stack all rows in a single call: one vstack per text re-copies
     # the matrix every iteration, and ``xrange`` is Python-2-only.
     vectors = np.vstack([self._build_vector(text) for text in texts])

     self.gbc.fit(vectors, classes)
コード例 #23
0
ファイル: dictrfc.py プロジェクト: ANB2/kaggle-insults
 def __init__(self, texts, classes, nlpdict=None):
     '''
         Fit a random forest on hand-built feature vectors.

         texts   -- training texts; ``self._build_vector`` is called once
                    per text
         classes -- labels passed to ``fit`` together with the vectors
         nlpdict -- dictionary to reuse when truthy; otherwise an NLPDict
                    is built from ``texts``
     '''
     # TODO: add list of smileys to texts/classes
     self.rfc = RandomForestClassifier(n_estimators=50)
     if nlpdict:
         self.dictionary = nlpdict
     else:
         self.dictionary = NLPDict(texts=texts)

     # Stack all rows in a single call: one vstack per text re-copies
     # the matrix every iteration, and ``xrange`` is Python-2-only.
     vectors = np.vstack([self._build_vector(text) for text in texts])

     self.rfc.fit(vectors, classes)
コード例 #24
0
ファイル: ensemblesvm.py プロジェクト: ANB2/kaggle-insults
 def __init__(self, texts, classes, params, models):
     '''
         Train the base models on cross-validation folds and fit a
         LinearSVC on their held-out scores plus hand-built features.

         texts   -- training texts
         classes -- training labels
         params  -- stored as ``self.params``; not used otherwise here
         models  -- iterable of (model_class, extra_args) pairs; every
                    class is called as
                    ``model_class(texts, classes, *extra_args)`` and its
                    ``classify(texts)`` result is used as a feature
     '''
     self.models = models
     self.params = params

     self.texts = texts
     self.classes = classes
     self.dictionary = NLPDict(texts=texts)

     classes = np.array(classes)
     texts = np.array(texts)

     fold_inputs = []
     # Starting from an empty float array keeps the dtype promotion of
     # an incremental concatenation that begins with ``np.array([])``.
     fold_labels = [np.array([])]

     # Parenthesised single-argument print behaves the same on Python 2
     # and Python 3.
     print('get training data')
     # get some training data
     for train, test in cross_validation.StratifiedKFold(classes, 3):
         texts_train = texts[train]
         classes_train = classes[train]
         texts_test = texts[test]
         classes_test = classes[test]

         # One row of scores per base model; the loop variable is not
         # called ``params`` so the argument is no longer shadowed.
         score_rows = []
         for model, model_args in self.models:
             m = model(texts_train, classes_train, *model_args)
             score_rows.append(
                 np.array(m.classify(texts_test))[np.newaxis])
         net_inputs_batch = np.vstack(score_rows).T

         # Single vstack instead of one per text (avoids repeated
         # copying and the Python-2-only ``xrange``).
         vectors = np.vstack(
             [self._build_vector(text) for text in texts_test])
         fold_inputs.append(np.hstack((net_inputs_batch, vectors)))
         fold_labels.append(classes_test)

     net_inputs = np.vstack(fold_inputs)
     expected_outputs = np.concatenate(fold_labels, axis=0)

     self.svm = svm.LinearSVC(C=1, class_weight='auto')

     print('train network')
     # train network
     self.svm.fit(net_inputs, expected_outputs)
コード例 #25
0
ファイル: dictgbc.py プロジェクト: ANB2/kaggle-insults
class DictGBC(object):
    '''
        Gradient-boosting classifier over hand-built word-count and
        punctuation features; ``classify`` returns scores in [0, 1].
    '''

    def __init__(self, texts, classes, nlpdict=None):
        '''
            Fit the model on one feature vector per training text.

            nlpdict -- dictionary to reuse when truthy; otherwise an
                       NLPDict is built from ``texts``
        '''
        # TODO: add list of smileys to texts/classes
        self.gbc = GradientBoostingClassifier(max_depth=3, learn_rate=0.1)
        if nlpdict:
            self.dictionary = nlpdict
        else:
            self.dictionary = NLPDict(texts=texts)

        self.gbc.fit(self._build_matrix(texts), classes)

    def _build_matrix(self, texts):
        '''
            Return the feature vectors of ``texts`` stacked as rows.
        '''
        # A single vstack avoids re-copying the matrix for every text
        # and does not need the Python-2-only ``xrange``.
        return np.vstack([self._build_vector(text) for text in texts])

    def _build_vector(self, text):
        '''
            Build the feature vector of one text.

            Layout: one occurrence count per word of the sorted
            curse-word and "you"-word lists, then the tokenizer's
            ``ratio`` value, the length of the original text, and the
            counts of '!', '?' and '*'.
        '''
        item = self.dictionary.tokenize(text)
        stemmed = item['stemmed']
        original = item['original']
        words = sorted(list(stemmed_curse_words) + list(you_words))
        vector = [sum(1 for token in stemmed if word == token)
                  for word in words]
        vector.append(item['ratio'])
        vector.append(len(original))
        for mark in ('!', '?', '*'):
            vector.append(sum(1 for ch in original if ch == mark))
        return np.array(vector)

    def classify(self, texts):
        '''
            Score a list of texts.

            For every row ``p`` of ``predict_proba`` the score is
            ``(p[1] - p[0] + 1) / 2`` limited to [0, 1].  Returns a numpy
            array.
        '''
        pred_prob = self.gbc.predict_proba(self._build_matrix(texts))
        predictions = np.array([pair[1] - pair[0] for pair in pred_prob])
        return np.clip((predictions + 1) / 2.0, 0, 1)
コード例 #26
0
class DictRFC(object):
    '''
        Random-forest classifier over hand-built word-count and
        punctuation features; ``classify`` returns scores in [0, 1].
    '''
    def __init__(self, texts, classes, nlpdict=None):
        '''
            Fit the model on one feature vector per training text.

            nlpdict -- dictionary to reuse when truthy; otherwise an
                       NLPDict is built from ``texts``
        '''
        # TODO: add list of smileys to texts/classes
        self.rfc = RandomForestClassifier(n_estimators=50)
        if nlpdict:
            self.dictionary = nlpdict
        else:
            self.dictionary = NLPDict(texts=texts)

        self.rfc.fit(self._build_matrix(texts), classes)

    def _build_matrix(self, texts):
        '''
            Return the feature vectors of ``texts`` stacked as rows.
        '''
        # A single vstack avoids re-copying the matrix for every text
        # and does not need the Python-2-only ``xrange``.
        return np.vstack([self._build_vector(text) for text in texts])

    def _build_vector(self, text):
        '''
            Build the feature vector of one text.

            Layout: one occurrence count per word of the sorted
            curse-word and "you"-word lists, then the tokenizer's
            ``ratio`` value, the length of the original text, and the
            counts of '!', '?' and '*'.
        '''
        item = self.dictionary.tokenize(text)
        stemmed = item['stemmed']
        original = item['original']
        words = sorted(list(stemmed_curse_words) + list(you_words))
        vector = [sum(1 for token in stemmed if word == token)
                  for word in words]
        vector.append(item['ratio'])
        vector.append(len(original))
        for mark in ('!', '?', '*'):
            vector.append(sum(1 for ch in original if ch == mark))
        return np.array(vector)

    def classify(self, texts):
        '''
            Score a list of texts.

            For every row ``p`` of ``predict_proba`` the score is
            ``(p[1] - p[0] + 1) / 2`` limited to [0, 1].  Returns a numpy
            array.
        '''
        pred_prob = self.rfc.predict_proba(self._build_matrix(texts))
        predictions = np.array([pair[1] - pair[0] for pair in pred_prob])
        return np.clip((predictions + 1) / 2.0, 0, 1)
コード例 #27
0
ファイル: dictknn.py プロジェクト: ANB2/kaggle-insults
class DictKNN(object):
    '''
        k-nearest-neighbours classifier over hand-built word-count and
        punctuation features; ``classify`` returns scores in [0, 1].
    '''

    def __init__(self, texts, classes):
        '''
            Build an NLPDict from ``texts`` and fit the model on one
            feature vector per training text.
        '''
        # TODO: add list of smileys to texts/classes
        self.knn = KNeighborsClassifier(n_neighbors=7, weights='distance')
        self.dictionary = NLPDict(texts=texts)

        self.knn.fit(self._build_matrix(texts), classes)

    def _build_matrix(self, texts):
        '''
            Return the feature vectors of ``texts`` stacked as rows.
        '''
        # A single vstack avoids re-copying the matrix for every text
        # and does not need the Python-2-only ``xrange``.
        return np.vstack([self._build_vector(text) for text in texts])

    def _build_vector(self, text):
        '''
            Build the feature vector of one text.

            Layout: one occurrence count per word of the sorted
            curse-word and "you"-word lists, then the tokenizer's
            ``ratio`` value, the length of the original text, and the
            counts of '!', '?' and '*'.
        '''
        item = self.dictionary.tokenize(text)
        stemmed = item['stemmed']
        original = item['original']
        words = sorted(list(stemmed_curse_words) + list(you_words))
        vector = [sum(1 for token in stemmed if word == token)
                  for word in words]
        vector.append(item['ratio'])
        vector.append(len(original))
        for mark in ('!', '?', '*'):
            vector.append(sum(1 for ch in original if ch == mark))
        return np.array(vector)

    def classify(self, texts):
        '''
            Score a list of texts.

            For every row ``p`` of ``predict_proba`` the score is
            ``(p[1] - p[0] + 1) / 2`` limited to [0, 1].  Returns a numpy
            array.
        '''
        pred_prob = self.knn.predict_proba(self._build_matrix(texts))
        predictions = np.array([pair[1] - pair[0] for pair in pred_prob])
        return np.clip((predictions + 1) / 2.0, 0, 1)
コード例 #28
0
ファイル: dictionary.py プロジェクト: tiffen/kaggle-insults
 def classify(self, texts):
     '''
         Score texts by how close "you" words stand to curse words.

         A text whose stemmed tokens contain no "you" word or no stemmed
         curse word scores 0.  Otherwise the score starts at 0.5 and
         grows by ``0.13 / distance`` for every pair of a "you" position
         and a curse position, capped at 1.

         Returns a list with one score per entry of ``NLPDict(texts).data``.
     '''
     n = NLPDict(texts)
     results = []
     for item in n.data:
         stemmed = item['stemmed']
         text_words = set(stemmed)
         if not (text_words.intersection(you_words)
                 and text_words.intersection(stemmed_curse_words)):
             results.append(0)
             continue

         you_pos = [i for i, w in enumerate(stemmed) if w in you_words]
         # A token counts as a curse if it is a known stem, if it is one
         # after stripping non-word characters, or if it contains one of
         # the frequent curse words.
         curse_pos = [
             i for i, w in enumerate(stemmed)
             if w in stemmed_curse_words
             or re.sub(r'[^\w]', '', w) in stemmed_curse_words
             or any(f in w for f in freq_curse_words)]

         rez = 0.5
         for p1 in you_pos:
             for p2 in curse_pos:
                 # Both lists index the same token sequence, so a token
                 # matching both filters gives a zero distance; skip it
                 # instead of dividing by zero.
                 if p1 != p2:
                     rez += 0.13 / abs(p1 - p2)
         results.append(min(rez, 1))
     return results
コード例 #29
0
ファイル: ensemblesvm.py プロジェクト: ANB2/kaggle-insults
class EnsembleSVM(object):
    '''
        LinearSVC stacked on top of several base classifiers plus a few
        hand-built text features.
    '''

    def __init__(self, texts, classes, params, models):
        '''
            Train the base models on cross-validation folds and fit the
            LinearSVC on their held-out scores plus hand-built features.

            texts   -- training texts (also kept for ``classify``)
            classes -- training labels (also kept for ``classify``)
            params  -- stored as ``self.params``; not used otherwise here
            models  -- iterable of (model_class, extra_args) pairs; every
                       class is called as
                       ``model_class(texts, classes, *extra_args)`` and
                       its ``classify(texts)`` result is used as a feature
        '''
        self.models = models
        self.params = params

        self.texts = texts
        self.classes = classes
        self.dictionary = NLPDict(texts=texts)

        classes = np.array(classes)
        texts = np.array(texts)

        fold_inputs = []
        # Starting from an empty float array keeps the dtype promotion of
        # an incremental concatenation that begins with ``np.array([])``.
        fold_labels = [np.array([])]

        # Parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print('get training data')
        # get some training data
        for train, test in cross_validation.StratifiedKFold(classes, 3):
            texts_test = texts[test]
            model_scores = self._model_scores(
                texts[train], classes[train], texts_test)
            fold_inputs.append(
                np.hstack((model_scores, self._build_matrix(texts_test))))
            fold_labels.append(classes[test])

        net_inputs = np.vstack(fold_inputs)
        expected_outputs = np.concatenate(fold_labels, axis=0)

        self.svm = svm.LinearSVC(C=1, class_weight='auto')

        print('train network')
        # train network
        self.svm.fit(net_inputs, expected_outputs)

    def _model_scores(self, train_texts, train_classes, texts):
        '''
            Train every base model on the given data and return their
            ``classify(texts)`` results as columns (one column per model,
            assuming each result holds one score per text).
        '''
        rows = []
        for model, model_args in self.models:
            m = model(train_texts, train_classes, *model_args)
            rows.append(np.array(m.classify(texts))[np.newaxis])
        return np.vstack(rows).T

    def _build_matrix(self, texts):
        '''
            Return the feature vectors of ``texts`` stacked as rows.
        '''
        # A single vstack avoids re-copying the matrix for every text
        # and does not need the Python-2-only ``xrange``.
        return np.vstack([self._build_vector(text) for text in texts])

    def _build_vector(self, text):
        '''
            Build six features for one text, each capped at 1 except the
            tokenizer's ``ratio`` value, which is used as is: total count
            of curse/"you" words / 10, ``ratio``, text length / 500, and
            the counts of '!', '?' and '*' / 10.
        '''
        item = self.dictionary.tokenize(text)
        stemmed = item['stemmed']
        original = item['original']
        words = list(stemmed_curse_words) + list(you_words)
        freq = sum(1 for word in words for token in stemmed if word == token)
        vector = [
            min(freq / 10.0, 1),
            item['ratio'],
            min(len(original) / 500.0, 1),
        ]
        for mark in ('!', '?', '*'):
            count = sum(1 for ch in original if ch == mark)
            vector.append(min(1, count / 10.0))
        return np.array(vector)

    def classify(self, texts):
        '''
            Score a list of texts.

            Every base model is trained again on the stored training data,
            its scores are combined with the hand-built features, and the
            LinearSVC decision value is mapped to ``value / 2 + 0.5``
            limited to [0, 1].  Returns a numpy array.
        '''
        print('classify')
        net_inputs = np.hstack((
            self._model_scores(self.texts, self.classes, texts),
            self._build_matrix(texts)))

        predictions = np.asarray(self.svm.decision_function(net_inputs))
        # Only a 2-D result needs its first column extracted; indexing a
        # flat result with [0] would keep just the first sample's value.
        if predictions.ndim > 1:
            predictions = np.transpose(predictions)[0]
        return np.clip(predictions / 2.0 + 0.5, 0, 1)
コード例 #30
0
class EnsembleSVM(object):
    '''
        LinearSVC stacked on top of several base classifiers plus a few
        hand-built text features.
    '''
    def __init__(self, texts, classes, params, models):
        '''
            Train the base models on cross-validation folds and fit the
            LinearSVC on their held-out scores plus hand-built features.

            texts   -- training texts (also kept for ``classify``)
            classes -- training labels (also kept for ``classify``)
            params  -- stored as ``self.params``; not used otherwise here
            models  -- iterable of (model_class, extra_args) pairs; every
                       class is called as
                       ``model_class(texts, classes, *extra_args)`` and
                       its ``classify(texts)`` result is used as a feature
        '''
        self.models = models
        self.params = params

        self.texts = texts
        self.classes = classes
        self.dictionary = NLPDict(texts=texts)

        classes = np.array(classes)
        texts = np.array(texts)

        fold_inputs = []
        # Starting from an empty float array keeps the dtype promotion of
        # an incremental concatenation that begins with ``np.array([])``.
        fold_labels = [np.array([])]

        # Parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print('get training data')
        # get some training data
        for train, test in cross_validation.StratifiedKFold(classes, 3):
            texts_test = texts[test]
            model_scores = self._model_scores(
                texts[train], classes[train], texts_test)
            fold_inputs.append(
                np.hstack((model_scores, self._build_matrix(texts_test))))
            fold_labels.append(classes[test])

        net_inputs = np.vstack(fold_inputs)
        expected_outputs = np.concatenate(fold_labels, axis=0)

        self.svm = svm.LinearSVC(C=1, class_weight='auto')

        print('train network')
        # train network
        self.svm.fit(net_inputs, expected_outputs)

    def _model_scores(self, train_texts, train_classes, texts):
        '''
            Train every base model on the given data and return their
            ``classify(texts)`` results as columns (one column per model,
            assuming each result holds one score per text).
        '''
        rows = []
        for model, model_args in self.models:
            m = model(train_texts, train_classes, *model_args)
            rows.append(np.array(m.classify(texts))[np.newaxis])
        return np.vstack(rows).T

    def _build_matrix(self, texts):
        '''
            Return the feature vectors of ``texts`` stacked as rows.
        '''
        # A single vstack avoids re-copying the matrix for every text
        # and does not need the Python-2-only ``xrange``.
        return np.vstack([self._build_vector(text) for text in texts])

    def _build_vector(self, text):
        '''
            Build six features for one text, each capped at 1 except the
            tokenizer's ``ratio`` value, which is used as is: total count
            of curse/"you" words / 10, ``ratio``, text length / 500, and
            the counts of '!', '?' and '*' / 10.
        '''
        item = self.dictionary.tokenize(text)
        stemmed = item['stemmed']
        original = item['original']
        words = list(stemmed_curse_words) + list(you_words)
        freq = sum(1 for word in words for token in stemmed if word == token)
        vector = [
            min(freq / 10.0, 1),
            item['ratio'],
            min(len(original) / 500.0, 1),
        ]
        for mark in ('!', '?', '*'):
            count = sum(1 for ch in original if ch == mark)
            vector.append(min(1, count / 10.0))
        return np.array(vector)

    def classify(self, texts):
        '''
            Score a list of texts.

            Every base model is trained again on the stored training data,
            its scores are combined with the hand-built features, and the
            LinearSVC decision value is mapped to ``value / 2 + 0.5``
            limited to [0, 1].  Returns a numpy array.
        '''
        print('classify')
        net_inputs = np.hstack((
            self._model_scores(self.texts, self.classes, texts),
            self._build_matrix(texts)))

        predictions = np.asarray(self.svm.decision_function(net_inputs))
        # Only a 2-D result needs its first column extracted; indexing a
        # flat result with [0] would keep just the first sample's value.
        if predictions.ndim > 1:
            predictions = np.transpose(predictions)[0]
        return np.clip(predictions / 2.0 + 0.5, 0, 1)
コード例 #31
0
class Ensemble(object):
    '''
        neural network using several classifiers as input

        The outputs of the base models listed in ``models`` are combined
        with the hand-crafted features from ``_build_vector`` and fed to a
        network created with ``nl.net.newff`` and trained with
        ``nl.train.train_rprop``.
    '''

    def __init__(self, texts, classes, params, models):
        '''
            Collect out-of-fold base-model outputs and train the network.

            texts   -- training texts
            classes -- training labels, one per text
            params  -- dict read for the keys 'structure' (passed to
                       ``nl.net.newff``) and 'epochs' (passed to
                       ``nl.train.train_rprop``)
            models  -- iterable of (model_class, extra_args) pairs; each
                       class is called as
                       ``model_class(train_texts, train_classes, *extra_args)``
                       and must offer ``classify(texts)``
        '''
        self.models = models
        self.params = params

        self.texts = texts
        self.classes = classes
        self.dictionary = NLPDict(texts=texts)

        # numpy arrays so the fold index arrays can be used for selection
        classes = np.array(classes)
        texts = np.array(texts)

        fold_inputs = []
        # Seeded with an empty float array so the concatenated targets get
        # the same dtype promotion as appending to np.array([]) would give.
        fold_targets = [np.array([])]

        print('get training data')
        # get some training data: for every split the base models are
        # fitted on the training part and evaluated on the held-out part
        for train, test in cross_validation.StratifiedKFold(classes, 5):
            fold_inputs.append(
                self._stack_inputs(texts[train], classes[train], texts[test]))
            fold_targets.append(classes[test])

        net_inputs = np.vstack(fold_inputs)
        expected_outputs = np.concatenate(fold_targets, axis=0)

        # init network: one [0, 1] input range per base model plus one per
        # value returned by _build_vector (six of them)
        self.net = nl.net.newff(
            [[0, 1] for _ in range(len(self.models) + 6)],
            self.params['structure']
        )

        print('train network')
        # train network; targets are reshaped into a single column
        expected_outputs = expected_outputs[np.newaxis].T
        nl.train.train_rprop(
            self.net,
            net_inputs,
            expected_outputs,
            epochs=self.params['epochs']
        )

    def _stack_inputs(self, train_texts, train_classes, texts):
        '''
            Return the network input matrix for ``texts``.

            Every base model is instantiated with the given training data
            and asked to classify ``texts``; each model contributes one
            column, followed by the columns from ``_build_vector``.
        '''
        # Rows are collected first and stacked once, rather than calling
        # vstack inside the loop and re-copying the accumulated rows.
        model_rows = []
        for model, model_args in self.models:
            fitted = model(train_texts, train_classes, *model_args)
            model_rows.append(np.array(fitted.classify(texts))[np.newaxis])
        model_columns = np.vstack(model_rows).T

        features = np.vstack([self._build_vector(text) for text in texts])
        return np.hstack((model_columns, features))

    def _build_vector(self, text):
        '''
            Build the hand-crafted feature vector for a single text.

            Returns a 1-D numpy array with six values: the capped count of
            curse / "you" lexicon hits among the stemmed tokens,
            ``item['ratio']``, the capped length of ``item['original']``,
            and the capped counts of '!', '?' and '*' in it.
        '''
        # Function-scope import: keeps the method independent of whatever
        # the module imports at the top.
        from collections import Counter

        item = self.dictionary.tokenize(text)
        original = item['original']

        # Count the stemmed tokens once and look each lexicon word up,
        # instead of rescanning the token list for every lexicon word.
        # A word present in both lexicons still contributes twice.
        token_counts = Counter(item['stemmed'])
        lexicon = list(stemmed_curse_words) + list(you_words)
        freq = sum(token_counts[word] for word in lexicon)

        vector = [
            min(freq / 10.0, 1),
            item['ratio'],
            min(len(original) / 500.0, 1),
        ]
        # Punctuation counts, in the fixed order '!', '?', '*'.
        for mark in ('!', '?', '*'):
            hits = sum(1 for ch in original if ch == mark)
            vector.append(min(1, hits / 10.0))
        return np.array(vector)

    def classify(self, texts):
        '''
            Classify a list of texts

            The base models are instantiated with the full stored training
            data, their outputs and the hand-crafted features are stacked,
            and the network is simulated on the result.  Returns the first
            output column of ``self.net.sim`` as a 1-D array.
        '''
        print('classify')
        net_inputs = self._stack_inputs(self.texts, self.classes, texts)

        results = self.net.sim(net_inputs)
        return results.T[0]
コード例 #32
0
 def __init__(self, texts, classes, params, models):
     '''
         Collect out-of-fold base-model outputs and train the network.

         texts   -- training texts
         classes -- training labels, one per text
         params  -- dict read for the keys 'structure' (passed to
                    ``nl.net.newff``) and 'epochs' (passed to
                    ``nl.train.train_rprop``)
         models  -- iterable of (model_class, extra_args) pairs; each class
                    is called as
                    ``model_class(train_texts, train_classes, *extra_args)``
                    and must offer ``classify(texts)``
     '''
     self.models = models
     self.params = params

     self.texts = texts
     self.classes = classes
     self.dictionary = NLPDict(texts=texts)

     # numpy arrays so the fold index arrays can be used for selection
     classes = np.array(classes)
     texts = np.array(texts)

     fold_inputs = []
     # Seeded with an empty float array so the concatenated targets get the
     # same dtype promotion as appending to np.array([]) would give.
     fold_targets = [np.array([])]

     print('get training data')
     # get some training data: for every split the base models are fitted
     # on the training part and evaluated on the held-out part
     for train, test in cross_validation.StratifiedKFold(classes, 5):
         texts_train = texts[train]
         classes_train = classes[train]
         texts_test = texts[test]

         # One row per base model, stacked once after the loop.  The loop
         # variable is not called `params`, so the constructor argument of
         # that name is no longer shadowed.
         model_rows = []
         for model, model_args in self.models:
             fitted = model(texts_train, classes_train, *model_args)
             model_rows.append(
                 np.array(fitted.classify(texts_test))[np.newaxis])
         model_columns = np.vstack(model_rows).T

         # One row of hand-crafted features per held-out text.
         features = np.vstack(
             [self._build_vector(text) for text in texts_test])

         fold_inputs.append(np.hstack((model_columns, features)))
         fold_targets.append(classes[test])

     net_inputs = np.vstack(fold_inputs)
     expected_outputs = np.concatenate(fold_targets, axis=0)

     # init network: one [0, 1] input range per base model plus six more
     # (presumably the length of the _build_vector output -- confirm
     # against the class this method belongs to)
     self.net = nl.net.newff(
         [[0, 1] for _ in range(len(self.models) + 6)],
         self.params['structure']
     )

     print('train network')
     # train network; targets are reshaped into a single column
     expected_outputs = expected_outputs[np.newaxis].T
     nl.train.train_rprop(
         self.net,
         net_inputs,
         expected_outputs,
         epochs=self.params['epochs']
     )
コード例 #33
0
 def __init__(self, texts, classes):
     '''
         Build an NLPDict from the training texts and fit a MultinomialNB
         on the feature vectors that dictionary produces for them.

         texts   -- training texts
         classes -- labels handed to ``MultinomialNB.fit``
     '''
     nlp_dict = NLPDict(texts=texts)
     self.dictionary = nlp_dict
     features = nlp_dict.feature_vectors(texts)
     self.nb = MultinomialNB()
     self.nb.fit(features, classes)
コード例 #34
0
ファイル: rfc.py プロジェクト: ANB2/kaggle-insults
 def __init__(self, texts, classes):
     '''
         Fit the classifier on the given training data.

         texts   -- training texts; used both to build the NLPDict and to
                    produce the feature vectors
         classes -- labels passed to ``MultinomialNB.fit`` with the vectors
     '''
     self.dictionary = NLPDict(texts=texts)
     vectors = self.dictionary.feature_vectors(texts)
     self.nb = MultinomialNB()
     self.nb.fit(vectors, classes)