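# Assumed imports for this module (a sketch; the file header is not shown in
# this excerpt). product_table (a pymongo collection), PARENT_DIR, the
# second_level_cat_names / second_level_cat_names_nb / second_level_cat_names_rf
# lists, get_categories(), and the ngrams() helper are defined elsewhere in
# the project and are not reconstructed here.
import json
import os
import re

from sklearn import feature_extraction, naive_bayes
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib  # plain `import joblib` on newer stacks
from sklearn.feature_selection import SelectFpr, SelectPercentile, chi2, f_classif
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
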
def second_training_process():
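    """Train one classifier set per second-level parent category.

    For each parent category, HQ-labeled product names for its child
    categories are vectorized into 1- to 3-gram counts and used to fit a
    MultinomialNB baseline; selected categories additionally get an
    FPR-feature-selected NB or an L1-SVC/random-forest pipeline. All
    fitted objects are persisted with joblib under Models/SubModels/.
    """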
    count = 10000  # only used by the commented-out vendor-based path below
    category_tree = json.loads(get_categories())
    for parent_category in second_level_cat_names:
        train_x = []
        train_y = []
        total = 0
        category_count_dict = {}
        for category_id in category_tree[parent_category].keys():
            ## For vendor based model
            """
            print get_products(category_id,count)
            current_prod_list=json.loads(get_products(category_id,count=count))
            print category_id
            total=total+ len(current_prod_list)
            print total
            # print current_prod_list
            current_prod_count=len(current_prod_list)
            category_count_dict[category_id]=current_prod_count
            # print category_count_dict
            current_category_name="Delhivery_Others"
            if current_prod_count>=500:
                current_category_name=category_id
            for products in current_prod_list:
                # print products
                train_x.append(products.get('product_name',"").encode('ascii','ignore').lower())
                train_y.append(current_category_name)
            """
            try:
                # Active path: pull products HQ has already labeled with
                # this child category and use them as training rows.
                hq = product_table.find({
                    "vendor_id": "HQ",
                    "vendor_category_id": category_id
                })
                print "---------------------"
                print category_id, hq.count()
                print "---------------------"
                for products in hq:
                    train_x.append(products['product_name'].encode(
                        'ascii', 'ignore').lower())
                    train_y.append(category_id)
            except Exception:
                # Skip categories whose documents lack 'product_name' or
                # whose query fails; they contribute no training rows.
                pass
        print "Training Set Constructed for %s " % (parent_category)
        print "Training Set Stats"
        print len(train_x), len(train_y)
        vocabulary = set()
        print "Constructing Vocab"
        for i, records in enumerate(train_x):
            print i
            try:
                # ngrams(text, 1, 3) is assumed to yield every 1- to 3-gram;
                # purely numeric tokens are kept out of the vocabulary.
                for word in ngrams(records.lower(), 1, 3):
                    if not re.match('^[0-9]+$', word):
                        vocabulary.add(word.lower())
            except Exception:
                print records
                continue
        print "Vocab Done"

        # The explicit vocabulary fixes the feature space, so the vectorizer
        # needs no fit() call before transform().
        vectorizer = feature_extraction.text.CountVectorizer(
            vocabulary=vocabulary, ngram_range=(1, 3), stop_words='english')
        train_x_vectorized = vectorizer.transform(train_x)

        clf_bayes = naive_bayes.MultinomialNB(fit_prior=False)
        clf_bayes.fit(train_x_vectorized, train_y)

        joblib.dump(
            vectorizer,
            PARENT_DIR + "/Models/SubModels/Vectorizer_" + parent_category)
        joblib.dump(
            clf_bayes,
            PARENT_DIR + "/Models/SubModels/clf_bayes_" + parent_category)

        if parent_category in second_level_cat_names_nb:
            # Keep only features whose ANOVA F-test p-value clears the 0.05
            # false-positive-rate threshold before the NB stage.
            clf_fpr = Pipeline([
                ('feature_selection', SelectFpr(f_classif, 0.05)),
                ('classification', naive_bayes.MultinomialNB(fit_prior=False))
            ])
            clf_fpr.fit(train_x_vectorized, train_y)

            joblib.dump(
                clf_fpr,
                PARENT_DIR + "/Models/SubModels/clf_fpr_" + parent_category)

        elif parent_category in second_level_cat_names_rf:
            # The L1-penalized LinearSVC doubles as a feature selector here;
            # sklearn >= 0.19 would require wrapping it in SelectFromModel.
            clf_rf = Pipeline([('feature_selection',
                                LinearSVC(C=2, penalty="l1", dual=False)),
                               ('classification',
                                RandomForestClassifier(n_estimators=100,
                                                       max_depth=5000))])
            clf_rf.fit(train_x_vectorized, train_y)

            joblib.dump(
                clf_rf,
                PARENT_DIR + "/Models/SubModels/clf_rf_" + parent_category)
def root_training_process():
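    """Train the root-level category model over all HQ-labeled products.

    Each product name is labeled with the first segment of its
    '->'-delimited vendor_category_id path ('NA' falls back to
    'Delhivery_Others'); then a MultinomialNB baseline, a chi2-selected NB,
    and an L1-SVC/random-forest pipeline are fitted and persisted with
    joblib.
    """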
    count = 1000  # only used by the commented-out vendor-based path below
    category_tree = json.loads(get_categories())
    category_list = category_tree.keys()
    product_list = []
    category_count_dict = {}
    second_level_categories = set()
    train_x = []
    train_y = []
    ## For vendor based model
    """
    total=0
    for category_id in category_list:
        current_prod_list=json.loads(get_products(category_id,count=count))
        print category_id
        total=total+ len(current_prod_list)
        print total
        # print current_prod_list
        current_prod_count=len(current_prod_list)
        category_count_dict[category_id]=current_prod_count
        # print category_count_dict
        current_category_name="Delhivery_Others"
        if current_prod_count>=800:
            current_category_name=category_id
            second_level_categories.add(category_id)
        for products in current_prod_list:
            # print products
            train_x.append(products.get('product_name',"").encode('ascii','ignore').lower())
            train_y.append(current_category_name)
            product_list.append((products,current_category_name))
    """
    hq = product_table.find({"vendor_id": "HQ"})
    print "----------------"
    print "root training"
    print hq.count()
    for products in hq:
        train_x.append(products['product_name'].encode('ascii',
                                                       'ignore').lower())
        # The root label is the first segment of the '->'-delimited
        # category path; unlabeled products fall into a catch-all class.
        if products['vendor_category_id'] == 'NA':
            current_category_name = 'Delhivery_Others'
        else:
            current_category_name = products['vendor_category_id'].split(
                '->')[0]
        train_y.append(current_category_name)
        product_list.append((products, current_category_name))

    print "Training Set Constructed"
    print "Training Set Stats"
    print category_count_dict  # empty unless the vendor-based path is enabled
    print len(train_x), len(train_y)

    vocabulary = set()
    print "Constructing Vocab"
    for i, records in enumerate(train_x):
        print i
        try:
            # As above: 1- to 3-grams from the assumed ngrams() helper,
            # with purely numeric tokens excluded.
            for word in ngrams(records.lower(), 1, 3):
                if not re.match('^[0-9]+$', word):
                    vocabulary.add(word.lower())
        except Exception:
            print records
            continue
    print "Vocab Done"

    # As above, the explicit vocabulary means transform() can be used
    # without fitting the vectorizer.
    vectorizer = feature_extraction.text.CountVectorizer(
        vocabulary=vocabulary, ngram_range=(1, 3), stop_words='english')
    train_x_vectorized = vectorizer.transform(train_x)

    clf_bayes = naive_bayes.MultinomialNB(fit_prior=False)
    clf_bayes.fit(train_x_vectorized, train_y)

    print "model 1 done"

    # Keep the top 20% of features by chi-squared score before the NB stage.
    clf_chi = Pipeline([('feature_selection', SelectPercentile(chi2, 20)),
                        ('classification',
                         naive_bayes.MultinomialNB(fit_prior=False))])
    clf_chi.fit(train_x_vectorized, train_y)

    print "model 2 done"

    # As in the sub-models, the L1-penalized LinearSVC acts as the feature
    # selector; sklearn >= 0.19 would require SelectFromModel around it.
    clf_rf = Pipeline([('feature_selection',
                        LinearSVC(C=2, penalty="l1", dual=False)),
                       ('classification',
                        RandomForestClassifier(n_estimators=100,
                                               max_depth=1000))])
    clf_rf.fit(train_x_vectorized, train_y)

    print "model 3 done"

    # os.path.realpath('__file__') resolves the literal string '__file__'
    # against the current working directory; the module attribute __file__
    # is what actually locates this script.
    model_dir = os.path.dirname(os.path.realpath(__file__))
    print model_dir + '/Models/clf_bayes.pkl'
    joblib.dump(clf_bayes, model_dir + '/Models/clf_bayes.pkl')
    joblib.dump(clf_chi, model_dir + '/Models/clf_chi.pkl')
    joblib.dump(clf_rf, model_dir + '/Models/clf_l1_rf.pkl')
    joblib.dump(vectorizer, model_dir + '/Models/vectorizer.pkl')
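

# Hypothetical entry point (a sketch; the invocation order is an assumption,
# not shown in this excerpt): train the root model first, then the
# per-parent-category sub-models.
if __name__ == '__main__':
    root_training_process()
    second_training_process()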