from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import FeatureHasher
from sklearn.random_projection import SparseRandomProjection

# SignatureDictVectorizer, HeaderVectorizer, SymImportsDictVectorizer and
# SymExportsDictVectorizer are project-specific PE feature extractors.


def online_pe_pipeline():
    """
    Online vectorizer with feature hashing.
    :return: Pipeline that hashes PE features and projects them to 256 dimensions
    """
    return Pipeline([
        ('vectorize', FeatureUnion(transformer_list=[
            ('signatures', Pipeline([
                ('vectorizer', SignatureDictVectorizer(vectorizer=FeatureHasher(2048))),
            ])),
            ('header', Pipeline([
                ('vectorizer', HeaderVectorizer(FeatureHasher(4096))),
            ])),
            ('sym_imports', Pipeline([
                ('vectorizer', SymImportsDictVectorizer(FeatureHasher(1024))),
            ])),
            ('sym_exports', Pipeline([
                ('vectorizer', SymExportsDictVectorizer(FeatureHasher(1024))),
            ])),
        ])),
        ('projection', SparseRandomProjection(n_components=256, dense_output=True)),
    ])
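# --- Hedged usage sketch (not part of the original source) ---
# A minimal, self-contained illustration of the same pattern as
# online_pe_pipeline(): hashed dict features followed by a sparse random
# projection, using only stock scikit-learn parts. The dict keys below are
# made-up stand-ins for the output of the project-specific PE vectorizers.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import FeatureHasher
from sklearn.random_projection import SparseRandomProjection

toy_samples = [
    {"section:.text": 1, "import:kernel32.CreateFileA": 1},
    {"section:.rsrc": 1, "import:user32.MessageBoxA": 1},
]

sketch_pipeline = Pipeline([
    ("hash", FeatureHasher(n_features=2048)),                  # input_type='dict' is the default
    ("project", SparseRandomProjection(n_components=2, dense_output=True)),
])
print(sketch_pipeline.fit_transform(toy_samples).shape)        # -> (2, 2)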
import re

from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer


class FeatFunctions(object):
    """docstring for featFunctions"""

    def __init__(self, n_features=None):
        # Define some parameters:
        if not n_features:
            n_features = 100000
        # Initialize the hasher (note: non_negative was removed in newer
        # scikit-learn releases; drop it there):
        self.hasher = FeatureHasher(n_features=n_features, input_type="string",
                                    non_negative=True)
        # Initialize the ngram vectorizer:
        self.vectorizer = CountVectorizer(binary=True)
        # Feature name-function dictionary:
        self.featName_function = {"url": self.url,
                                  "all_caps": self.all_caps,
                                  "ngrams": self.ngrams}

    def all_caps(self, x):
        pat = re.compile(r"^[A-Z\d]+$")
        groups = pat.match(x)
        if groups:
            return ["f_all_caps"]

    def url(self, x):
        pat = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
        groups = pat.findall(x)
        if groups:
            return ["f_url"]

    def ngrams(self, x):
        ngram_feats = self.vectorizer.fit_transform([x])
        return self.vectorizer.inverse_transform(ngram_feats)[0].tolist()

    # An observation function that extracts features. x is a raw text.
    def getObsFeatures(self, x, feat_list):
        str_feats = []
        for feat in feat_list:
            feat = feat(x)
            if feat:
                str_feats += feat
        return str_feats

    def getYXFeatures(self, y_name, y_idx, obs_feat_list):
        # Prefix each observation feature with the label name/index,
        # then hash the combined strings.
        xy_feat = [y_name + str(y_idx) + "_" + xfeat for xfeat in obs_feat_list]
        hashed_feats = self.hasher.transform([xy_feat])
        # return hashed_feats.nonzero()[1]
        return hashed_feats
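# --- Hedged usage sketch (not part of the original source) ---
# How the string-input FeatureHasher used by FeatFunctions maps lists of
# feature strings into a fixed-width sparse matrix: no vocabulary is stored,
# so it works in an online setting. (non_negative is omitted here because it
# was removed from recent scikit-learn releases.)
from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher(n_features=16, input_type="string")
samples = [
    ["f_url", "f_all_caps"],      # feature strings for one observation
    ["f_all_caps", "B0_f_url"],   # e.g. a label-prefixed feature from getYXFeatures
]
X = hasher.transform(samples)
print(X.shape)                    # -> (2, 16)
print(X.nonzero()[1])             # columns the features hashed into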
import cPickle
import os
from time import time

from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import Perceptron

# USED_TAGS, ALPHA, N_ITER, getTotalRows, batchGenerator, FeatureExtractor
# and toBinary come from the surrounding project.


def trainClassifier(batchSize, dataFolder, clfFolderName, tagsSplitSize):
    startTime = time()
    if not os.path.exists(clfFolderName):
        os.makedirs(clfFolderName)
    if not os.path.exists(clfFolderName + 'Temp'):
        os.makedirs(clfFolderName + 'Temp')
    tags = list(USED_TAGS.keys())
    totalRows = getTotalRows('data/' + dataFolder + '/TrainIds')
    hasher = FeatureHasher()
    batchGen = batchGenerator(batchSize, dataFolder, 'Train', totalRows)
    hashInd = 1
    print 'number of tags : ' + str(len(tags))
    extractor = FeatureExtractor()

    # Hash every training batch once and cache the sparse matrices on disk.
    for _, X, _ in batchGen:
        batchTime = time()
        print 'computing batch : ' + str(hashInd)
        X_batch = hasher.transform(extractor.extract(sample) for sample in X)
        print 'saving batch : ' + str(hashInd)
        with open(clfFolderName + 'Temp/' + str(hashInd) + '.pkl', 'wb') as fid:
            cPickle.dump(X_batch, fid)
        print 'batch time : ' + str(time() - batchTime)
        hashInd += 1

    with open(clfFolderName + '/hasher.pkl', 'wb') as fid:
        cPickle.dump(hasher, fid)
    with open(clfFolderName + '/extractor.pkl', 'wb') as fid:
        cPickle.dump(extractor, fid)
    print 'hashing time : ' + str(time() - startTime)

    tagIndDic = {}
    tagInd = 1
    loop = 1
    # Train one binary Perceptron per tag, a few tags at a time, by streaming
    # the cached hashed batches through partial_fit.
    for currTags in [tags[i:i + tagsSplitSize] for i in range(0, len(tags), tagsSplitSize)]:
        iterStartTime = time()
        print 'tags iteration : ' + str(loop)
        clfDic = {}
        for tag in currTags:
            clfDic[tag] = Perceptron(alpha=ALPHA, n_iter=N_ITER)
        batchGen = batchGenerator(batchSize, dataFolder, 'Train', totalRows)
        batchInd = 1
        for _, _, targets_in_batch in batchGen:
            batchTime = time()
            print 'batch number : ' + str(batchInd)
            with open(clfFolderName + 'Temp/' + str(batchInd) + '.pkl', 'rb') as fp:
                X_batch = cPickle.load(fp)
            for tag in currTags:
                Y_batch_binary = toBinary(tag, targets_in_batch)
                clfDic[tag].partial_fit(X_batch, Y_batch_binary, classes=[0, 1])
            batchInd += 1
            print 'batch time : ' + str(time() - batchTime)
        for tag in clfDic:
            clfDic[tag].sparsify()
            tagIndDic[tag] = tagInd
            with open(clfFolderName + '/' + str(tagInd) + '.pkl', 'wb') as fid:
                cPickle.dump(clfDic[tag], fid)
            tagInd += 1
        loop += 1
        print 'iter time : ' + str(time() - iterStartTime)
        print

    print 'saving model...'
    with open(clfFolderName + '/tagIndDic.pkl', 'wb') as fid:
        cPickle.dump(tagIndDic, fid)
    print 'total time : ' + str(time() - startTime)
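# --- Hedged usage sketch (not part of the original source) ---
# The core of the batch loop above: hash each batch of feature dicts once,
# then update a per-tag linear model incrementally with partial_fit. The
# dicts and labels below are stand-ins for the project-specific
# batchGenerator/FeatureExtractor output.
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import Perceptron

hasher = FeatureHasher(n_features=2 ** 10)   # input_type='dict' is the default
clf = Perceptron()

batches = [
    ([{"tok_python": 2, "tok_list": 1}, {"tok_java": 3}], [1, 0]),
    ([{"tok_python": 1}, {"tok_cpp": 2, "tok_pointer": 1}], [1, 0]),
]
for samples, labels in batches:
    X_batch = hasher.transform(samples)
    clf.partial_fit(X_batch, labels, classes=[0, 1])

clf.sparsify()                               # store learned coefficients sparsely
print(clf.predict(hasher.transform([{"tok_python": 1}])))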
import json
import os
import pickle
from time import time

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.datasets import load_files
from sklearn.feature_extraction import FeatureHasher
from sklearn.svm import SVC
from sklearn.utils import check_random_state


class IssueClassification(object):
    """
    Init for complain classification
    """

    def __init__(self):
        # self.gm_worker = gearman.GearmanWorker(['localhost:4730'])
        # self.gm_worker.register_task('test_svm_rumour_classifier', self.testClassifier)
        self.root_dir = os.getcwd()
        self.trainClassifier()

    """
    Function to fetch the data from cache
    @cache <dict> consisting of training data
    """
    def fetch_data(self, cache, data_home=None, subset='train', categories=None,
                   shuffle=True, random_state=42):
        if subset in ('train', 'test'):
            data = cache[subset]
        else:
            raise ValueError(
                "subset can only be 'train', 'test' or 'all', got '%s'" % subset)
        if shuffle:
            random_state = check_random_state(random_state)
            indices = np.arange(data.target.shape[0])
            random_state.shuffle(indices)
            data.filenames = data.filenames[indices]
            data.target = data.target[indices]
            # Use an object array to shuffle: avoids memory copy
            data_lst = np.array(data.data, dtype=object)
            data_lst = data_lst[indices]
            data.data = data_lst.tolist()
        return data

    """
    Custom tokenizer: removes stop words from the text
    @text <type 'str'> text which needs to get tokenized
    @return <type 'str'> tokens
    """
    def token_ques(self, text):
        things_to_replace = ['?']
        things_to_replace += stopwords.words('english')
        # wh_word = None
        for tok in text.split('\n'):
            original_query = tok
            # 1. Stemming
            # 2. POS consideration verb, adjectives
            query_pos_tags = nltk.pos_tag(word_tokenize(tok))
            for word in things_to_replace:
                tok = tok.lower()
                tok = tok.strip(" ")
            for word in word_tokenize(tok):
                yield word.lower()

    """
    Train classifier
    """
    def trainClassifier(self):
        try:
            t1 = time()
            start_time = time()
            self.hasher = FeatureHasher(input_type='string')
            self.clf = SVC(probability=True, C=5., gamma=0.001)
            data_folder = self.root_dir + "/training_data_issue"
            train_dataset = load_files(data_folder)
            cache = dict(train=train_dataset)
            self.data_train = self.fetch_data(cache, subset='train')
            try:
                self.clf = pickle.load(open("model_issue.pickle", "rb"))
            except:
                import traceback
                print traceback.format_exc()
                print "Generating pickles"
                training_data = []
                for text in self.data_train.data:
                    text = text.decode('utf-8', 'ignore')
                    training_data.append(text)
                # Type of raw_X: <type 'generator'>
                raw_X = (self.token_ques(text) for text in training_data)
                X_train = self.hasher.fit_transform(raw_X)
                y_train = self.data_train.target
                self.clf.fit(X_train, y_train)
                readselfclf = open('model_issue.pickle', 'wb')
                pickle.dump(self.clf, readselfclf)
                readselfclf.close()
                print "Training ended"
            print("Classifier trained ...")
            print("time taken=>", time() - t1)
        except Exception:
            import traceback
            print traceback.format_exc()

    """
    Function to test classifier
    """
    def testClassifier(self, record):
        try:
            query = json.loads(record)
            # return json.dumps(lookup_result)
            query = query['complain']
            result = {}
            test_data = [query]
            raw_X = (self.token_ques(text) for text in test_data)
            X_test = self.hasher.fit_transform(raw_X)
            pred = self.clf.predict(X_test)
            # print("pred=>", pred)
            self.categories = self.data_train.target_names
            index = 1
            predict_prob = self.clf.predict_proba(X_test)
            for doc, category_list in zip(test_data, predict_prob):
                category_list = sorted(enumerate(category_list),
                                       key=lambda x: x[1], reverse=True)
                i = 0
                for val in category_list:
                    result[self.categories[val[0]]] = val[1] * 100
        except Exception:
            import traceback
            print traceback.format_exc()
        return result

    def testSingleRecord(self):
        while True:
            result = {}
            print "\n Enter description"
            description = raw_input()
            result['complain'] = description
            rec_result = self.testClassifier(json.dumps(result))
import json
import os
import pickle
from time import time

import nltk
import numpy as np
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.datasets import load_files
from sklearn.feature_extraction import FeatureHasher
from sklearn.svm import SVC
from sklearn.utils import check_random_state


class ArticleClassification(object):
    """
    Init for complain classification
    """

    def __init__(self):
        # self.gm_worker = gearman.GearmanWorker(['localhost:4730'])
        # self.gm_worker.register_task('test_svm_rumour_classifier', self.testClassifier)
        self.root_dir = os.getcwd()
        self.trainClassifier()

    """
    Function to fetch the data from cache
    @cache <dict> consisting of training data
    """
    def fetch_data(self, cache, data_home=None, subset='train', categories=None,
                   shuffle=True, random_state=42):
        if subset in ('train', 'test'):
            data = cache[subset]
        else:
            raise ValueError(
                "subset can only be 'train', 'test' or 'all', got '%s'" % subset)
        if shuffle:
            random_state = check_random_state(random_state)
            indices = np.arange(data.target.shape[0])
            random_state.shuffle(indices)
            data.filenames = data.filenames[indices]
            data.target = data.target[indices]
            # Use an object array to shuffle: avoids memory copy
            data_lst = np.array(data.data, dtype=object)
            data_lst = data_lst[indices]
            data.data = data_lst.tolist()
        return data

    """
    Custom tokenizer: removes stop words from the text
    @text <type 'str'> text which needs to get tokenized
    @return <type 'str'> tokens
    """
    def token_ques(self, text):
        things_to_replace = ['?']
        things_to_replace += stopwords.words('english')
        # wh_word = None
        for tok in text.split('\n'):
            original_query = tok
            # 1. Stemming
            # 2. POS consideration verb, adjectives
            query_pos_tags = nltk.pos_tag(word_tokenize(tok))
            for word in things_to_replace:
                tok = tok.lower()
                tok = tok.strip(" ")
            for word in word_tokenize(tok):
                yield word.lower()

    """
    Train classifier
    """
    def trainClassifier(self):
        try:
            t1 = time()
            start_time = time()
            # note: non_negative was removed in newer scikit-learn releases
            self.hasher = FeatureHasher(input_type='string', non_negative=True)
            self.clf = SVC(probability=True, C=5., gamma=0.001)
            data_folder = self.root_dir + "/training_data"
            train_dataset = load_files(data_folder)
            print("Time taken to load the data=>", time() - start_time)
            cache = dict(train=train_dataset)
            self.data_train = self.fetch_data(cache, subset='train')
            try:
                # print data_train.target_names
                print "Loading pickle"
                self.clf = pickle.load(open("model.pickle", "rb"))
                print "trained and ready ..."
            except:
                import traceback
                print traceback.format_exc()
                print "Generating pickles"
                training_data = []
                for text in self.data_train.data:
                    text = text.decode('utf-8', 'ignore')
                    training_data.append(text)
                # Type of raw_X: <type 'generator'>
                raw_X = (self.token_ques(text) for text in training_data)
                X_train = self.hasher.fit_transform(raw_X)
                y_train = self.data_train.target
                self.clf.fit(X_train, y_train)
                readselfclf = open('model.pickle', 'wb')
                pickle.dump(self.clf, readselfclf)
                readselfclf.close()
                print "Training ended"
            print("Classifier trained ...")
            print("time taken=>", time() - t1)
        except Exception:
            import traceback
            print traceback.format_exc()

    """
    Function to test classifier
    """
    def testClassifier(self, record):
        try:
            query = json.loads(record)
            # return json.dumps(lookup_result)
            query = query['complain']
            result = {}
            test_data = [query]
            raw_X = (self.token_ques(text) for text in test_data)
            X_test = self.hasher.fit_transform(raw_X)
            pred = self.clf.predict(X_test)
            # print("pred=>", pred)
            self.categories = self.data_train.target_names
            index = 1
            predict_prob = self.clf.predict_proba(X_test)
            for doc, category_list in zip(test_data, predict_prob):
                category_list = sorted(enumerate(category_list),
                                       key=lambda x: x[1], reverse=True)
                i = 0
                for val in category_list:
                    result[self.categories[val[0]]] = val[1] * 100
        except Exception:
            import traceback
            print traceback.format_exc()
        print result
        print "process ends here"

    def readAndCall(self):
        import pdb
        # pdb.set_trace()
        t1 = time()
        start_time = time()
        data_file = open('June30_data.txt')
        input_data = data_file.read().split("\n")
        total = 0
        sus_count = 0
        sus_count_clone = 0
        sus_count_dist = 0
        sus_count_dup = 0
        unsus_count = 0
        processed_title = []
        print "Started reading mongo data"
        for i in range(0, (len(input_data) - 1)):
            print i
            result = {}
            mydata = input_data[i].split("\t(")
            result['title'] = mydata[0]
            result['description'] = mydata[1]
            # completed_job_request = self.gm_client.submit_job('test_svm_rumour_classifier', json.dumps(result), wait_until_complete=True)
            rec_result = self.testClassifier(json.dumps(result))
            # print rec_result
            rec_result = json.loads(rec_result)
            total += 1
            if rec_result['result']['suspected'] > rec_result['result']['unsuspected']:
                print "suspected :", sus_count
                sus_count += 1
                dup_flag = 0
                for title in processed_title:
                    if fuzz.ratio(rec_result['title'], title) > 80:
                        sus_count_dup += 1
                        dup_flag = 1
                        print "duplicate : ", rec_result['title']
                        break
                # if dup_flag == 0:
                #     sus_count_dist += 1
                processed_title.append(rec_result['title'])
            else:
                unsus_count += 1
        print "Processed title : ", processed_title
        print "Result for June 30"
        print "Total :", total
        print "Suspected : ", sus_count
        print "duplicates : ", sus_count_dup
        print "Unsuspected : ", unsus_count

    def testSingleRecord(self):
        while True:
            result = {}
            print "\n Enter description"
            description = raw_input()
            result['complain'] = description
            rec_result = self.testClassifier(json.dumps(result))
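# --- Hedged usage sketch (not part of the original source) ---
# The train/predict flow used by trainClassifier/testClassifier above, with
# stock components and toy data: hash token streams into a fixed-width space
# (FeatureHasher is stateless, so transforming at train and test time is
# consistent), then fit a probabilistic SVC.
from sklearn.feature_extraction import FeatureHasher
from sklearn.svm import SVC

def simple_tokens(text):
    # stand-in for token_ques(): lowercased whitespace tokens
    for word in text.split():
        yield word.lower()

train_docs = ["refund not received", "refund is delayed", "payment refund pending",
              "delivery was late", "courier delayed delivery", "parcel never delivered"]
y_train = [0, 0, 0, 1, 1, 1]

hasher = FeatureHasher(n_features=2 ** 12, input_type="string")
X_train = hasher.transform(simple_tokens(t) for t in train_docs)

clf = SVC(probability=True, C=5.0, gamma=0.001)
clf.fit(X_train, y_train)

X_test = hasher.transform(simple_tokens(t) for t in ["where is my refund"])
print(clf.predict(X_test))
print(clf.predict_proba(X_test))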
# Tail of a feature-collection function; its def line, imports and the
# crfutils/feature_extractor/fields/separator helpers are not shown in this snippet.
    # X = crfutils.get_features(feature_extractor, fields=fields, sep=separator)
    # doc = []
    Y = set()
    for x in X:
        for entry in x:
            Y.add(entry['y'])
            # doc.append(entry['F'])
    # return X, hf.transform(doc)
    return X, list(Y)


if __name__ == '__main__':
    # crfutils.main(feature_extractor, fields=fields, sep=separator)
    X = crfutils.get_features(feature_extractor, fields=fields, sep=separator)
    # Apply the hashing trick
    # (note: non_negative was removed in newer scikit-learn releases)
    hf = FeatureHasher(input_type='string', non_negative=True)
    # Collect every token's feature strings into one flat list.
    doc = []
    for x in X:
        for entry in x:
            doc += entry['F']
    # NOTE: with input_type='string', FeatureHasher expects one iterable of
    # strings per sample, so a flat list like doc hashes each string's characters.
    print hf.transform(doc)