Example #1
def online_pe_pipeline():
    """ Online vectorizer with feature hashing
    :return:
    """
    return Pipeline([
        ('vectorize',
         FeatureUnion(transformer_list=[
             ('signatures',
              Pipeline([
                  ('vectorizer',
                   SignatureDictVectorizer(vectorizer=FeatureHasher(2048))),
              ])),
             ('header',
              Pipeline([
                  ('vectorizer', HeaderVectorizer(FeatureHasher(4096))),
              ])),
             ('sym_imports',
              Pipeline([
                  ('vectorizer',
                   SymImportsDictVectorizer(FeatureHasher(1024))),
              ])),
             ('sym_exports',
              Pipeline([
                  ('vectorizer',
                   SymExportsDictVectorizer(FeatureHasher(1024))),
              ])),
         ], )),
        ('projection',
         SparseRandomProjection(n_components=256, dense_output=True)),
    ])
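A minimal usage sketch for the pipeline above (illustrative only: `samples` and the custom vectorizers such as SignatureDictVectorizer are assumed to be defined elsewhere; the pipeline itself only needs fit_transform):

# Hypothetical usage: `samples` is an iterable of parsed PE records in whatever
# format the custom vectorizers expect.
pipeline = online_pe_pipeline()
X = pipeline.fit_transform(samples)  # dense array of shape (n_samples, 256)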
Example #2
import re

from sklearn.feature_extraction.text import CountVectorizer, FeatureHasher


class FeatFunctions(object):
    """String feature extractors combined through a FeatureHasher."""

    def __init__(self, n_features=None):
        # Default size of the hashing space:
        if not n_features:
            n_features = 100000

        # Initialize the hasher (note: `non_negative` was deprecated in
        # scikit-learn 0.19 and removed in 0.21; recent versions use
        # `alternate_sign=False` for a similar effect):
        self.hasher = FeatureHasher(n_features=n_features, input_type="string", non_negative=True)

        # Initialize the n-gram vectorizer:
        self.vectorizer = CountVectorizer(binary=True)

        # Feature name -> feature function dictionary:
        self.featName_function = {"url": self.url, "all_caps": self.all_caps, "ngrams": self.ngrams}

    def all_caps(self, x):
        pat = re.compile(r"^[A-Z\d]+$")
        groups = pat.match(x)
        if groups:
            return ["f_all_caps"]

    def url(self, x):
        pat = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
        groups = pat.findall(x)
        if groups:
            return ["f_url"]

    def ngrams(self, x):
        ngram_feats = self.vectorizer.fit_transform([x])
        return self.vectorizer.inverse_transform(ngram_feats)[0].tolist()

    def getObsFeatures(self, x, feat_list):
        # An observation function that extracts string features from the raw text x.
        str_feats = []
        for feat in feat_list:
            feat = feat(x)
            if feat:
                str_feats += feat

        return str_feats

    def getYXFeatures(self, y_name, y_idx, obs_feat_list):
        # Conjoin the label (name + index) with every observation feature,
        # then hash the resulting strings into one sparse row.
        xy_feat = [y_name + str(y_idx) + "_" + xfeat for xfeat in obs_feat_list]
        hashed_feats = self.hasher.transform([xy_feat])
        return hashed_feats
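A small illustrative driver for the class above (the sample text and label are made up; the shape follows from the default n_features=100000):

# Illustrative only: extract string features from one text, then hash them
# together with a label.
ff = FeatFunctions()
feats = ff.getObsFeatures("CHECK https://example.com now",
                          [ff.url, ff.all_caps, ff.ngrams])
vec = ff.getYXFeatures("label", 1, feats)  # scipy.sparse matrix, shape (1, 100000)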
Example #5
def trainClassifier(batchSize, dataFolder, clfFolderName, tagsSplitSize):
    startTime = time()
    if not os.path.exists(clfFolderName):
        os.makedirs(clfFolderName)
    if not os.path.exists(clfFolderName + 'Temp'):
        os.makedirs(clfFolderName + 'Temp')
    tags = list(USED_TAGS.keys())
    totalRows = getTotalRows('data/' + dataFolder + '/TrainIds')

    hasher = FeatureHasher()
    batchGen = batchGenerator(batchSize, dataFolder, 'Train', totalRows)
    hashInd = 1
    print 'number of tags : ' + str(len(tags))
    extractor = FeatureExtractor()
    for _, X, _ in batchGen:
        batchTime = time()
        print 'computing batch : ' + str(hashInd)
        X_batch = hasher.transform(extractor.extract(sample) for sample in X)
        print 'saving batch : ' + str(hashInd)
        with open(clfFolderName + 'Temp/' + str(hashInd) + '.pkl',
                  'wb') as fid:
            cPickle.dump(X_batch, fid)
        print 'batch time : ' + str(time() - batchTime)
        hashInd += 1
    with open(clfFolderName + '/hasher.pkl', 'wb') as fid:
        cPickle.dump(hasher, fid)
    with open(clfFolderName + '/extractor.pkl', 'wb') as fid:
        cPickle.dump(extractor, fid)
    print 'hashing time : ' + str(time() - startTime)

    tagIndDic = {}
    tagInd = 1
    loop = 1
    for currTags in [
            tags[i:i + tagsSplitSize]
            for i in range(0, len(tags), tagsSplitSize)
    ]:
        iterStartTime = time()
        print 'tags iteration : ' + str(loop)
        clfDic = {}
        for tag in currTags:
            # One binary Perceptron per tag (newer scikit-learn renames
            # `n_iter` to `max_iter`).
            clfDic[tag] = Perceptron(alpha=ALPHA, n_iter=N_ITER)
        batchGen = batchGenerator(batchSize, dataFolder, 'Train', totalRows)
        batchInd = 1
        for _, _, targets_in_batch in batchGen:
            batchTime = time()
            print 'batch number : ' + str(batchInd)
            with open(clfFolderName + 'Temp/' + str(batchInd) + '.pkl',
                      'rb') as fp:
                X_batch = cPickle.load(fp)
            for tag in currTags:
                Y_batch_binary = toBinary(tag, targets_in_batch)
                clfDic[tag].partial_fit(X_batch,
                                        Y_batch_binary,
                                        classes=[0, 1])
            batchInd += 1
            print 'batch time : ' + str(time() - batchTime)
        for tag in clfDic:
            clfDic[tag].sparsify()
            tagIndDic[tag] = tagInd
            with open(clfFolderName + '/' + str(tagInd) + '.pkl', 'wb') as fid:
                cPickle.dump(clfDic[tag], fid)
            tagInd += 1
        loop += 1
        print 'iter time : ' + str(time() - iterStartTime)
        print
    print 'saving model...'
    with open(clfFolderName + '/tagIndDic.pkl', 'wb') as fid:
        cPickle.dump(tagIndDic, fid)

    print 'total time : ' + str(time() - startTime)
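A sketch of how the artifacts written above could be loaded back for prediction (Python 2, to match the snippet; the file layout mirrors what trainClassifier pickles, everything else is an assumption):

import cPickle


def predictTag(clfFolderName, samples, tag):
    # Load the hasher, feature extractor and the per-tag classifier saved by
    # trainClassifier, then score new samples for a single tag.
    with open(clfFolderName + '/hasher.pkl', 'rb') as fid:
        hasher = cPickle.load(fid)
    with open(clfFolderName + '/extractor.pkl', 'rb') as fid:
        extractor = cPickle.load(fid)
    with open(clfFolderName + '/tagIndDic.pkl', 'rb') as fid:
        tagIndDic = cPickle.load(fid)
    with open(clfFolderName + '/' + str(tagIndDic[tag]) + '.pkl', 'rb') as fid:
        clf = cPickle.load(fid)
    X = hasher.transform(extractor.extract(sample) for sample in samples)
    return clf.predict(X)  # 1 where `tag` is predicted, 0 otherwise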
Example #6
class IssueClassification(object):
    """
    Init for complain classification
    """
    def __init__(self):
        # self.gm_worker = gearman.GearmanWorker(['localhost:4730'])
        # self.gm_worker.register_task('test_svm_rumour_classifier', self.testClassifier)
        self.root_dir = os.getcwd()
        self.trainClassifier()
        
    """
    Function to fetch the data from cache
    @cache  <dict>  consist of training data
    """
    def fetch_data(self, cache, data_home=None, subset='train', categories=None,
                       shuffle=True, random_state=42):
        if subset in ('train', 'test'):
            data = cache[subset]
        else:
            raise ValueError(
                "subset can only be 'train' or 'test', got '%s'" % subset)
        if shuffle:
            random_state = check_random_state(random_state)
            indices = np.arange(data.target.shape[0])
            random_state.shuffle(indices)
            data.filenames = data.filenames[indices]
            data.target = data.target[indices]
            # Use an object array to shuffle: avoids memory copy
            data_lst = np.array(data.data, dtype=object)
            data_lst = data_lst[indices]
            data.data = data_lst.tolist()
        return data
    
    """
    For custom tokenizing the text, removed stop words from text
    @text   <type 'str'>    text which needs to get tokenized
    @return <type 'str'>    tokens
    """
    def token_ques(self, text):
        things_to_replace = ['?']
        things_to_replace += stopwords.words('english')
        for tok in text.split('\n'):
            tok = tok.lower().strip()
            for word in word_tokenize(tok):
                # Skip stop words and '?' so only content words reach the hasher
                if word not in things_to_replace:
                    yield word
    
    """
    Train classifier
    """
    def trainClassifier(self):
        try:
            t1 = time()
            start_time = time()
            self.hasher = FeatureHasher(input_type='string')
            self.clf = SVC(probability=True, C=5., gamma=0.001)
            
            data_folder = self.root_dir + "/training_data_issue"
            train_dataset = load_files(data_folder)
                   
            cache = dict(train=train_dataset)
            self.data_train = self.fetch_data(cache, subset='train')
            try:
                self.clf = pickle.load(open("model_issue.pickle", "rb"))
            except:
                import traceback
                print traceback.format_exc()
                print "Generating pickles"
                training_data = []
                for text in self.data_train.data:
                    text = text.decode('utf-8','ignore')
                    training_data.append(text)
                raw_X = (self.token_ques(text) for text in training_data)  #Type of raw_X  <type 'generator'>
                X_train = self.hasher.fit_transform(raw_X)
                y_train = self.data_train.target      
                self.clf.fit(X_train, y_train)
                readselfclf = open('model_issue.pickle', 'wb')
                pickle.dump(self.clf, readselfclf)
                readselfclf.close()
                print "Training ended"
                print("Classifier trained ...")
                print("time taken=>", time()-t1)
                
        except Exception:
            import traceback
            print traceback.format_exc()
            
    """
    Function to test classifier
    """
    def testClassifier(self, record):
        try:
            query = json.loads(record)
            # return json.dumps(lookup_result)
            query = query['complain']
            result = {}
            test_data = [query]
            raw_X = (self.token_ques(text) for text in test_data)
            X_test = self.hasher.fit_transform(raw_X)
            pred = self.clf.predict(X_test)
            #print("pred=>", pred)
            self.categories = self.data_train.target_names
            index = 1
            predict_prob = self.clf.predict_proba(X_test)
            for doc, category_list in zip(test_data, predict_prob):
                # print('\n\n')
                category_list = sorted(enumerate(category_list), key=lambda x:x[1], reverse=True)
                i = 0
                for val in category_list:
                    #print('%r => %s => %0.2f' % (doc, self.categories[val[0]], (float(val[1]) * 100)))
                    result[self.categories[val[0]]] = val[1] * 100
        except Exception:
            import traceback
            print traceback.format_exc()
        return result	
        
    def testSingleRecord(self):
        while True:
            result = {}
            print "\n Enter description"
            description = raw_input()
            result['complain'] = description
            rec_result = self.testClassifier(json.dumps(result))
            print rec_result
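A quick aside on why calling fit_transform on the hasher at prediction time (as testClassifier does above) is safe: FeatureHasher is stateless, so fitting is a no-op and identical tokens always land in identical columns. A minimal check (not from the source):

from sklearn.feature_extraction.text import FeatureHasher

# Hashing is deterministic and stateless: the same input maps to the same
# sparse vector whether transform or fit_transform is called.
h = FeatureHasher(input_type='string')
a = h.transform([['hello', 'world']])
b = h.fit_transform([['hello', 'world']])
assert abs(a - b).sum() == 0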
class ArticleClassification(object):
    """
    Init for complain classification
    """
    def __init__(self):
        # self.gm_worker = gearman.GearmanWorker(['localhost:4730'])
        # self.gm_worker.register_task('test_svm_rumour_classifier', self.testClassifier)
        self.root_dir = os.getcwd()
        self.trainClassifier()

    """
    Function to fetch the data from cache
    @cache  <dict>  consist of training data
    """

    def fetch_data(self,
                   cache,
                   data_home=None,
                   subset='train',
                   categories=None,
                   shuffle=True,
                   random_state=42):
        if subset in ('train', 'test'):
            data = cache[subset]
        else:
            raise ValueError(
                "subset can only be 'train' or 'test', got '%s'" % subset)
        if shuffle:
            random_state = check_random_state(random_state)
            indices = np.arange(data.target.shape[0])
            random_state.shuffle(indices)
            data.filenames = data.filenames[indices]
            data.target = data.target[indices]
            # Use an object array to shuffle: avoids memory copy
            data_lst = np.array(data.data, dtype=object)
            data_lst = data_lst[indices]
            data.data = data_lst.tolist()
        return data

    """
    For custom tokenizing the text, removed stop words from text
    @text   <type 'str'>    text which needs to get tokenized
    @return <type 'str'>    tokens
    """

    def token_ques(self, text):
        things_to_replace = ['?']
        things_to_replace += stopwords.words('english')
        for tok in text.split('\n'):
            tok = tok.lower().strip()
            for word in word_tokenize(tok):
                # Skip stop words and '?' so only content words reach the hasher
                if word not in things_to_replace:
                    yield word

    """
    Train classifier
    """

    def trainClassifier(self):
        try:
            t1 = time()
            start_time = time()
            self.hasher = FeatureHasher(input_type='string', non_negative=True)
            self.clf = SVC(probability=True, C=5., gamma=0.001)

            data_folder = self.root_dir + "/training_data"
            train_dataset = load_files(data_folder)

            print("Time taken to load the data=>", time() - start_time)
            cache = dict(train=train_dataset)
            self.data_train = self.fetch_data(cache, subset='train')
            try:
                # print data_train.target_names
                print "Loading pickle"
                self.clf = pickle.load(open("model.pickle", "rb"))
                print "trained and ready ..."
            except:
                import traceback
                print traceback.format_exc()
                print "Generating pickles"
                training_data = []
                for text in self.data_train.data:
                    text = text.decode('utf-8', 'ignore')
                    training_data.append(text)
                raw_X = (self.token_ques(text) for text in training_data
                         )  #Type of raw_X  <type 'generator'>
                X_train = self.hasher.fit_transform(raw_X)
                y_train = self.data_train.target
                self.clf.fit(X_train, y_train)
                readselfclf = open('model.pickle', 'wb')
                pickle.dump(self.clf, readselfclf)
                readselfclf.close()
                print "Training ended"
                print("Classifier trained ...")
                print("time taken=>", time() - t1)

        except Exception:
            import traceback
            print traceback.format_exc()

    """
    Function to test classifier
    """

    def testClassifier(self, record):
        try:
            query = json.loads(record)
            # return json.dumps(lookup_result)
            query = query['complain']
            result = {}
            test_data = [query]
            raw_X = (self.token_ques(text) for text in test_data)
            X_test = self.hasher.fit_transform(raw_X)
            pred = self.clf.predict(X_test)
            #print("pred=>", pred)
            self.categories = self.data_train.target_names
            index = 1
            predict_prob = self.clf.predict_proba(X_test)
            for doc, category_list in zip(test_data, predict_prob):
                # print('\n\n')
                category_list = sorted(enumerate(category_list),
                                       key=lambda x: x[1],
                                       reverse=True)
                i = 0
                for val in category_list:
                    #print('%r => %s => %0.2f' % (doc, self.categories[val[0]], (float(val[1]) * 100)))
                    result[self.categories[val[0]]] = val[1] * 100
        except Exception:
            import traceback
            print traceback.format_exc()
        print result
        print "process ends here"
        return result

    def readAndCall(self):
        t1 = time()
        start_time = time()
        data_file = open('June30_data.txt')
        input_data = data_file.read().split("\n")
        total = 0
        sus_count = 0
        sus_count_clone = 0
        sus_count_dist = 0
        sus_count_dup = 0
        unsus_count = 0
        processed_title = []
        print "Started reading mongo data"
        for i in range(0, (len(input_data) - 1)):
            print i
            result = {}
            mydata = input_data[i].split("\t(")
            result['title'] = mydata[0]
            result['description'] = mydata[1]
            # completed_job_request = self.gm_client.submit_job('test_svm_rumour_classifier', json.dumps( result )  ,wait_until_complete=True)
            rec_result = self.testClassifier(json.dumps(result))
            # print rec_result
            rec_result = json.loads(rec_result)
            total += 1

            if rec_result['result']['suspected'] > rec_result['result'][
                    'unsuspected']:
                print "suspected :", sus_count
                sus_count += 1
                dup_flag = 0
                for title in processed_title:
                    if fuzz.ratio(rec_result['title'], title) > 80:
                        sus_count_dup += 1
                        dup_flag = 1
                        print "duplicate : ", rec_result['title']
                        break
                    # if dup_flag == 0:
                    #     sus_count_dist += 1
                processed_title.append(rec_result['title'])

            else:
                unsus_count += 1
        print "Processed title : ", processed_title
        print "Result for June 30"
        print "Total :", total
        print "Suspected : ", sus_count
        print "duplicates : ", sus_count_dup
        print "Unsuspected : ", unsus_count

    def testSingleRecord(self):
        while True:
            result = {}
            print "\n Enter description"
            description = raw_input()
            result['complain'] = description
            rec_result = self.testClassifier(json.dumps(result))
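An illustrative driver for this example (not part of the source; assumes a ./training_data folder with one sub-folder per category, which is what load_files expects):

# Illustrative only: the constructor trains (or unpickles) the SVM, then
# descriptions typed on stdin are classified in a loop.
if __name__ == '__main__':
    ArticleClassification().testSingleRecord()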
Example #8
    # X = crfutils.get_features(feature_extractor, fields=fields, sep=separator)
    # doc = []
    Y = set()
    for x in X:
        for entry in x:
            Y.add(entry['y'])
            # doc.append(entry['F'])
    # return X, hf.transform(doc)
    return X, list(Y)


if __name__ == '__main__':
    # pass
    # crfutils.main(feature_extractor, fields=fields, sep=separator)
    X = crfutils.get_features(feature_extractor, fields=fields, sep=separator)

    # Apply the hashing trick
    hf = FeatureHasher(input_type='string', non_negative=True)
    # # List of dictionaries:
    # x_set = set()
    # # Iterate over each of the tokens features:

    doc = []
    for x in X:
        for entry in x:
            doc += entry['F']
        # Hash the accumulated feature strings as one sample: input_type='string'
        # expects an iterable of iterables of strings.
        print hf.transform([doc])
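For reference, with input_type='string' the hasher expects an iterable of iterables of strings, which is why the accumulated doc list is wrapped in a list above; a minimal illustration with made-up feature names:

from sklearn.feature_extraction.text import FeatureHasher

# Each inner iterable of strings becomes one hashed row.
hf2 = FeatureHasher(n_features=16, input_type='string')
rows = hf2.transform([['w=dog', 'pos=NN'], ['w=runs', 'pos=VBZ']])
print rows.shape  # (2, 16)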