def second_training_process():
    """Train and persist the second-level classifiers, one set per parent.

    For every name in ``second_level_cat_names`` this function:

    1. Queries ``product_table`` for the HQ products (``vendor_id == "HQ"``)
       of each child category listed under that parent in the tree returned
       by ``get_categories()``. The sample is the product name encoded to
       ASCII (non-ASCII characters dropped) and lower-cased; the label is the
       child category id.
    2. Builds a vocabulary from the tokens yielded by ``ngrams(text, 1, 3)``,
       leaving out tokens that consist of digits only.
    3. Fits a ``CountVectorizer`` + ``MultinomialNB`` model and dumps both
       under ``PARENT_DIR + "/Models/SubModels/"``.
    4. Additionally fits and dumps a ``SelectFpr`` + naive Bayes pipeline if
       the parent is in ``second_level_cat_names_nb``, otherwise an
       L1 ``LinearSVC`` + random forest pipeline if it is in
       ``second_level_cat_names_rf``.

    Data loading is best effort: a failing query or a malformed product is
    reported on stdout and skipped instead of aborting the run. A parent
    category that ends up without any vocabulary is reported and skipped.

    Returns:
        None. The results are the model files written with ``joblib.dump``.
    """
    # NOTE: every print below takes a single, pre-formatted argument so the
    # output is the same as with the original Python 2 print statements.
    category_tree = json.loads(get_categories())
    digits_only = re.compile('^[0-9]+$')  # hoisted out of the token loop
    sub_model_dir = PARENT_DIR + "/Models/SubModels/"

    # NOTE: an older vendor-based variant that sampled products through
    # get_products() had been disabled; only HQ products are used.
    for parent_category in second_level_cat_names:
        train_x = []
        train_y = []
        for category_id in category_tree[parent_category]:
            try:
                hq = product_table.find({
                    "vendor_id": "HQ",
                    "vendor_category_id": category_id,
                })
                print("---------------------")
                print("%s %s" % (category_id, hq.count()))
                print("---------------------")
                for product in hq:
                    try:
                        name = product['product_name'].encode(
                            'ascii', 'ignore').lower()
                    except (KeyError, AttributeError, UnicodeError) as exc:
                        # One bad document must not discard the remaining
                        # products of this category.
                        print("Skipping product in %r: %r"
                              % (category_id, exc))
                        continue
                    # Sample and label are appended together so the two
                    # lists always stay aligned.
                    train_x.append(name)
                    train_y.append(category_id)
            except Exception as exc:
                # Still best effort, but no longer silent, and no longer
                # swallowing KeyboardInterrupt / SystemExit.
                print("Failed to load products for %r: %r"
                      % (category_id, exc))

        print("Training Set Constructed for %s " % (parent_category))
        print("Training Set Stats")
        print("%d %d" % (len(train_x), len(train_y)))

        vocabulary = set()
        print("Constructing Vocab")
        for i, text in enumerate(train_x):
            print(i)
            try:
                for word in ngrams(text.lower(), 1, 3):
                    if not digits_only.match(word):
                        vocabulary.add(word.lower())
            except Exception:
                # Show the offending record and carry on with the next one.
                print(text)
                continue
        print("Vocab Done")

        if not vocabulary:
            # CountVectorizer rejects an empty vocabulary; skip this parent
            # so the remaining parents are still trained.
            print("No vocabulary for %r, skipping" % (parent_category,))
            continue

        vectorizer = feature_extraction.text.CountVectorizer(
            vocabulary=vocabulary, ngram_range=(1, 3), stop_words='english')
        train_x_vectorized = vectorizer.transform(train_x)

        clf_bayes = naive_bayes.MultinomialNB(fit_prior=False)
        clf_bayes.fit(train_x_vectorized, train_y)
        joblib.dump(vectorizer, sub_model_dir + "Vectorizer_" + parent_category)
        joblib.dump(clf_bayes, sub_model_dir + "clf_bayes_" + parent_category)

        if parent_category in second_level_cat_names_nb:
            clf_fpr = Pipeline([
                ('feature_selection', SelectFpr(f_classif, 0.05)),
                ('classification', naive_bayes.MultinomialNB(fit_prior=False)),
            ])
            clf_fpr.fit(train_x_vectorized, train_y)
            joblib.dump(clf_fpr, sub_model_dir + "clf_fpr_" + parent_category)
        elif parent_category in second_level_cat_names_rf:
            clf_rf = Pipeline([
                ('feature_selection',
                 LinearSVC(C=2, penalty="l1", dual=False)),
                ('classification',
                 RandomForestClassifier(n_estimators=100, max_depth=5000)),
            ])
            clf_rf.fit(train_x_vectorized, train_y)
            joblib.dump(clf_rf, sub_model_dir + "clf_rf_" + parent_category)
def second_training_process():
    """Train and persist the second-level classifiers, one set per parent.

    For every name in ``second_level_cat_names`` this function:

    1. Queries ``product_table`` for the HQ products (``vendor_id == "HQ"``)
       of each child category listed under that parent in the tree returned
       by ``get_categories()``. The sample is the product name encoded to
       ASCII (non-ASCII characters dropped) and lower-cased; the label is the
       child category id.
    2. Builds a vocabulary from the tokens yielded by ``ngrams(text, 1, 3)``,
       leaving out tokens that consist of digits only.
    3. Fits a ``CountVectorizer`` + ``MultinomialNB`` model and dumps both
       under ``PARENT_DIR + "/Models/SubModels/"``.
    4. Additionally fits and dumps a ``SelectFpr`` + naive Bayes pipeline if
       the parent is in ``second_level_cat_names_nb``, otherwise an
       L1 ``LinearSVC`` + random forest pipeline if it is in
       ``second_level_cat_names_rf``.

    Data loading is best effort: a failing query or a malformed product is
    reported on stdout and skipped instead of aborting the run. A parent
    category that ends up without any vocabulary is reported and skipped.

    Returns:
        None. The results are the model files written with ``joblib.dump``.
    """
    # NOTE: every print below takes a single, pre-formatted argument so the
    # output is the same as with the original Python 2 print statements.
    category_tree = json.loads(get_categories())
    digits_only = re.compile('^[0-9]+$')  # hoisted out of the token loop
    sub_model_dir = PARENT_DIR + "/Models/SubModels/"

    # NOTE: an older vendor-based variant that sampled products through
    # get_products() had been disabled; only HQ products are used.
    for parent_category in second_level_cat_names:
        train_x = []
        train_y = []
        for category_id in category_tree[parent_category]:
            try:
                hq = product_table.find({
                    "vendor_id": "HQ",
                    "vendor_category_id": category_id,
                })
                print("---------------------")
                print("%s %s" % (category_id, hq.count()))
                print("---------------------")
                for product in hq:
                    try:
                        name = product['product_name'].encode(
                            'ascii', 'ignore').lower()
                    except (KeyError, AttributeError, UnicodeError) as exc:
                        # One bad document must not discard the remaining
                        # products of this category.
                        print("Skipping product in %r: %r"
                              % (category_id, exc))
                        continue
                    # Sample and label are appended together so the two
                    # lists always stay aligned.
                    train_x.append(name)
                    train_y.append(category_id)
            except Exception as exc:
                # Still best effort, but no longer silent, and no longer
                # swallowing KeyboardInterrupt / SystemExit.
                print("Failed to load products for %r: %r"
                      % (category_id, exc))

        print("Training Set Constructed for %s " % (parent_category))
        print("Training Set Stats")
        print("%d %d" % (len(train_x), len(train_y)))

        vocabulary = set()
        print("Constructing Vocab")
        for i, text in enumerate(train_x):
            print(i)
            try:
                for word in ngrams(text.lower(), 1, 3):
                    if not digits_only.match(word):
                        vocabulary.add(word.lower())
            except Exception:
                # Show the offending record and carry on with the next one.
                print(text)
                continue
        print("Vocab Done")

        if not vocabulary:
            # CountVectorizer rejects an empty vocabulary; skip this parent
            # so the remaining parents are still trained.
            print("No vocabulary for %r, skipping" % (parent_category,))
            continue

        vectorizer = feature_extraction.text.CountVectorizer(
            vocabulary=vocabulary, ngram_range=(1, 3), stop_words='english')
        train_x_vectorized = vectorizer.transform(train_x)

        clf_bayes = naive_bayes.MultinomialNB(fit_prior=False)
        clf_bayes.fit(train_x_vectorized, train_y)
        joblib.dump(vectorizer, sub_model_dir + "Vectorizer_" + parent_category)
        joblib.dump(clf_bayes, sub_model_dir + "clf_bayes_" + parent_category)

        if parent_category in second_level_cat_names_nb:
            clf_fpr = Pipeline([
                ('feature_selection', SelectFpr(f_classif, 0.05)),
                ('classification', naive_bayes.MultinomialNB(fit_prior=False)),
            ])
            clf_fpr.fit(train_x_vectorized, train_y)
            joblib.dump(clf_fpr, sub_model_dir + "clf_fpr_" + parent_category)
        elif parent_category in second_level_cat_names_rf:
            clf_rf = Pipeline([
                ('feature_selection',
                 LinearSVC(C=2, penalty="l1", dual=False)),
                ('classification',
                 RandomForestClassifier(n_estimators=100, max_depth=5000)),
            ])
            clf_rf.fit(train_x_vectorized, train_y)
            joblib.dump(clf_rf, sub_model_dir + "clf_rf_" + parent_category)
def root_training_prcoess():
    """Train and persist the root-level (top category) classifiers.

    Every HQ product (``vendor_id == "HQ"``) in ``product_table`` becomes one
    training sample: the product name encoded to ASCII (non-ASCII characters
    dropped) and lower-cased. Its label is ``'Delhivery_Others'`` when
    ``vendor_category_id`` equals ``'NA'``, otherwise the part of
    ``vendor_category_id`` before the first ``'->'``.

    A vocabulary is built from the tokens yielded by ``ngrams(text, 1, 3)``
    (digit-only tokens are left out) and three models are fitted on the
    resulting count vectors:

    * ``MultinomialNB``                             -> ``clf_bayes.pkl``
    * ``SelectPercentile(chi2, 20)`` + naive Bayes  -> ``clf_chi.pkl``
    * L1 ``LinearSVC`` + ``RandomForestClassifier`` -> ``clf_l1_rf.pkl``

    The vectorizer is stored as ``vectorizer.pkl`` next to them.

    Malformed products are reported on stdout and skipped. If no vocabulary
    can be built the function reports that and returns without writing
    any model.

    Returns:
        None. The results are the model files written with ``joblib.dump``.
    """
    # NOTE: every print below takes a single, pre-formatted argument so the
    # output is the same as with the original Python 2 print statements.

    # NOTE(review): the parsed category tree is not read here any more (only
    # a disabled vendor-based variant used it). The call is kept because
    # get_categories() is defined elsewhere and may have side effects --
    # confirm and drop.
    json.loads(get_categories())

    category_count_dict = {}
    train_x = []
    train_y = []

    hq = product_table.find({"vendor_id": "HQ"})
    print("----------------")
    print("root training")
    print(hq.count())
    for product in hq:
        try:
            name = product['product_name'].encode('ascii', 'ignore').lower()
            vendor_category = product['vendor_category_id']
            if vendor_category == 'NA':
                label = 'Delhivery_Others'
            else:
                label = vendor_category.split('->')[0]
        except (KeyError, AttributeError, UnicodeError) as exc:
            # One malformed document should not abort the whole training run.
            print("Skipping malformed HQ product: %r" % (exc,))
            continue
        # Sample and label are computed first and appended together so the
        # two lists always stay aligned.
        train_x.append(name)
        train_y.append(label)
        # Per-label sample count; this dict used to be printed while empty.
        category_count_dict[label] = category_count_dict.get(label, 0) + 1

    print("Training Set Constructed")
    print("Training Set Stats")
    print(category_count_dict)
    print("%d %d" % (len(train_x), len(train_y)))

    digits_only = re.compile('^[0-9]+$')  # hoisted out of the token loop
    vocabulary = set()
    print("Constructing Vocab")
    for i, text in enumerate(train_x):
        print(i)
        try:
            for word in ngrams(text.lower(), 1, 3):
                if not digits_only.match(word):
                    vocabulary.add(word.lower())
        except Exception:
            # Show the offending record and carry on with the next one.
            print(text)
            continue
    print("Vocab Done")

    if not vocabulary:
        # CountVectorizer rejects an empty vocabulary; stop with a clear
        # message instead of a traceback from inside scikit-learn.
        print("No vocabulary could be built, root models not trained")
        return

    vectorizer = feature_extraction.text.CountVectorizer(
        vocabulary=vocabulary, ngram_range=(1, 3), stop_words='english')
    train_x_vectorized = vectorizer.transform(train_x)

    clf_bayes = naive_bayes.MultinomialNB(fit_prior=False)
    clf_bayes.fit(train_x_vectorized, train_y)
    print("model 1 done")

    clf_chi = Pipeline([
        ('feature_selection', SelectPercentile(chi2, 20)),
        ('classification', naive_bayes.MultinomialNB(fit_prior=False)),
    ])
    clf_chi.fit(train_x_vectorized, train_y)
    print("model 2 done")

    clf_rf = Pipeline([
        ('feature_selection', LinearSVC(C=2, penalty="l1", dual=False)),
        ('classification',
         RandomForestClassifier(n_estimators=100, max_depth=1000)),
    ])
    clf_rf.fit(train_x_vectorized, train_y)
    print("model 3 done")

    # NOTE(review): '__file__' is a quoted literal, so realpath() resolves it
    # against the current working directory, not this module's location.
    # Kept as is because changing it would move the model files -- confirm
    # which one is intended.
    models_dir = os.path.dirname(os.path.realpath('__file__')) + '/Models/'
    # Print the path that is really written (the old message showed a
    # '/../Models/' path that did not match the dump target).
    print(models_dir + 'clf_bayes.pkl')
    joblib.dump(clf_bayes, models_dir + 'clf_bayes.pkl')
    joblib.dump(clf_chi, models_dir + 'clf_chi.pkl')
    joblib.dump(clf_rf, models_dir + 'clf_l1_rf.pkl')
    joblib.dump(vectorizer, models_dir + 'vectorizer.pkl')
def root_training_prcoess():
    """Train and persist the root-level (top category) classifiers.

    Every HQ product (``vendor_id == "HQ"``) in ``product_table`` becomes one
    training sample: the product name encoded to ASCII (non-ASCII characters
    dropped) and lower-cased. Its label is ``'Delhivery_Others'`` when
    ``vendor_category_id`` equals ``'NA'``, otherwise the part of
    ``vendor_category_id`` before the first ``'->'``.

    A vocabulary is built from the tokens yielded by ``ngrams(text, 1, 3)``
    (digit-only tokens are left out) and three models are fitted on the
    resulting count vectors:

    * ``MultinomialNB``                             -> ``clf_bayes.pkl``
    * ``SelectPercentile(chi2, 20)`` + naive Bayes  -> ``clf_chi.pkl``
    * L1 ``LinearSVC`` + ``RandomForestClassifier`` -> ``clf_l1_rf.pkl``

    The vectorizer is stored as ``vectorizer.pkl`` next to them.

    Malformed products are reported on stdout and skipped. If no vocabulary
    can be built the function reports that and returns without writing
    any model.

    Returns:
        None. The results are the model files written with ``joblib.dump``.
    """
    # NOTE: every print below takes a single, pre-formatted argument so the
    # output is the same as with the original Python 2 print statements.

    # NOTE(review): the parsed category tree is not read here any more (only
    # a disabled vendor-based variant used it). The call is kept because
    # get_categories() is defined elsewhere and may have side effects --
    # confirm and drop.
    json.loads(get_categories())

    category_count_dict = {}
    train_x = []
    train_y = []

    hq = product_table.find({"vendor_id": "HQ"})
    print("----------------")
    print("root training")
    print(hq.count())
    for product in hq:
        try:
            name = product['product_name'].encode('ascii', 'ignore').lower()
            vendor_category = product['vendor_category_id']
            if vendor_category == 'NA':
                label = 'Delhivery_Others'
            else:
                label = vendor_category.split('->')[0]
        except (KeyError, AttributeError, UnicodeError) as exc:
            # One malformed document should not abort the whole training run.
            print("Skipping malformed HQ product: %r" % (exc,))
            continue
        # Sample and label are computed first and appended together so the
        # two lists always stay aligned.
        train_x.append(name)
        train_y.append(label)
        # Per-label sample count; this dict used to be printed while empty.
        category_count_dict[label] = category_count_dict.get(label, 0) + 1

    print("Training Set Constructed")
    print("Training Set Stats")
    print(category_count_dict)
    print("%d %d" % (len(train_x), len(train_y)))

    digits_only = re.compile('^[0-9]+$')  # hoisted out of the token loop
    vocabulary = set()
    print("Constructing Vocab")
    for i, text in enumerate(train_x):
        print(i)
        try:
            for word in ngrams(text.lower(), 1, 3):
                if not digits_only.match(word):
                    vocabulary.add(word.lower())
        except Exception:
            # Show the offending record and carry on with the next one.
            print(text)
            continue
    print("Vocab Done")

    if not vocabulary:
        # CountVectorizer rejects an empty vocabulary; stop with a clear
        # message instead of a traceback from inside scikit-learn.
        print("No vocabulary could be built, root models not trained")
        return

    vectorizer = feature_extraction.text.CountVectorizer(
        vocabulary=vocabulary, ngram_range=(1, 3), stop_words='english')
    train_x_vectorized = vectorizer.transform(train_x)

    clf_bayes = naive_bayes.MultinomialNB(fit_prior=False)
    clf_bayes.fit(train_x_vectorized, train_y)
    print("model 1 done")

    clf_chi = Pipeline([
        ('feature_selection', SelectPercentile(chi2, 20)),
        ('classification', naive_bayes.MultinomialNB(fit_prior=False)),
    ])
    clf_chi.fit(train_x_vectorized, train_y)
    print("model 2 done")

    clf_rf = Pipeline([
        ('feature_selection', LinearSVC(C=2, penalty="l1", dual=False)),
        ('classification',
         RandomForestClassifier(n_estimators=100, max_depth=1000)),
    ])
    clf_rf.fit(train_x_vectorized, train_y)
    print("model 3 done")

    # NOTE(review): '__file__' is a quoted literal, so realpath() resolves it
    # against the current working directory, not this module's location.
    # Kept as is because changing it would move the model files -- confirm
    # which one is intended.
    models_dir = os.path.dirname(os.path.realpath('__file__')) + '/Models/'
    # Print the path that is really written (the old message showed a
    # '/../Models/' path that did not match the dump target).
    print(models_dir + 'clf_bayes.pkl')
    joblib.dump(clf_bayes, models_dir + 'clf_bayes.pkl')
    joblib.dump(clf_chi, models_dir + 'clf_chi.pkl')
    joblib.dump(clf_rf, models_dir + 'clf_l1_rf.pkl')
    joblib.dump(vectorizer, models_dir + 'vectorizer.pkl')