Example 1
def getFeatureRMSEAgainstBaseline(cols=['color_exist']):
    # Copy the list first: the default argument object is shared across calls,
    # and the columns added below would otherwise accumulate between runs.
    cols = list(cols)
    utility = Utility()
    utility.startTimeTrack()
    # This part skips feature training and simply uses the saved feature set.
    print("len(cols):", len(cols), cols)
    print("Reading feature set")
    all_df = pd.read_csv('../data/features_doc2vec_sense2vec_pmi_20170418.csv')
    feature_train_df = all_df[:74067].copy()  # copy so the inplace drop below is safe
    # Must drop this column for OrdinalRegression
    feature_train_df.drop('wm_product_brand', axis=1, inplace=True)

    cols.extend(['relevance_int', 'id', 'search_term', 'product_uid',
                 'relevance', 'product_idx', 'Word2VecQueryExpansion'])

    print(cols)
    feature_train_df = feature_train_df.filter(items=cols, axis=1)

    feature_test_df = all_df[74067:].copy()  # copy so the inplace drop below is safe
    feature_test_df.drop('relevance', axis=1, inplace=True)
    utility.checkpointTimeTrack()

    print("####  Running: OrdinalRegression ordridge training ####")
    # dp=DataPreprocessing()
    print("feature_train_df:", list(feature_train_df))
    # trainDF,validateDF=dp.generateValidationSet(train_df)
    orModel = OrdinalRegressionRanker('ordridge')
    orModel.train(feature_train_df, None)
    # orModel.gridSearch(feature_train_df, None)
    print("####  Completed: OrdinalRegression ordridge training ####")
    utility.checkpointTimeTrack()
Example 2
# cols=['color_exist','len_product_description']
# getFeatureRMSEAgainstBaseline(cols)
# cols=['color_exist','len_brand']
# getFeatureRMSEAgainstBaseline(cols)
# cols=['color_exist','len_search_term']
# getFeatureRMSEAgainstBaseline(cols)
# cols=['color_exist','sense2vec_all_simscore','sense2vec_keeptag_simscore','sense2vec_uidfact_all_simscore','sense2vec_uidfact_keeptag_simscore','sense2vec_all_attr_simscore','sense2vec_keeptag_attr_simscore','sense2vec_uidfact_all_attr_simscore','sense2vec_uidfact_keeptag_attr_simscore']
# getFeatureRMSEAgainstBaseline(cols)
# cols=['color_exist','product_uid_threshold']
# getFeatureRMSEAgainstBaseline(cols)
# cols=['color_exist','noun_overlap_counts','noun_uniq_overlap_counts','noun_overlap_ratio']
# getFeatureRMSEAgainstBaseline(cols)
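# A compact driver for the ablation runs above (a sketch; the feature lists are
# a subset of the commented calls, and getFeatureRMSEAgainstBaseline is the
# function from Example 1). Call it manually when needed:
def runFeatureAblations():
    ablation_sets = [
        ['color_exist', 'len_product_description'],
        ['color_exist', 'len_brand'],
        ['color_exist', 'len_search_term'],
        ['color_exist', 'product_uid_threshold'],
        ['color_exist', 'noun_overlap_counts', 'noun_uniq_overlap_counts', 'noun_overlap_ratio'],
    ]
    for cols in ablation_sets:
        getFeatureRMSEAgainstBaseline(cols)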

if __name__ == "__main__":
    # print("Should not print")
    utility = Utility()
    utility.startTimeTrack()
    # This part skips feature training and simply uses the saved feature set.

    # print("Reading features_full_plusnouns set")
    # all_df=pd.read_csv('../data/features_full_plusnouns_pluspuidthresh.csv')

    myFeatureSetFileReference = '../data/features_doc2vec_sense2vec_pmi_20170418.csv'
    print("Reading features_doc2vec_sense2vec_pmi_20170418 set")
    all_df = pd.read_csv(myFeatureSetFileReference, low_memory=True)
    print("Completed: Reading features_doc2vec_sense2vec_pmi_20170418 set")
    feature_train_df = all_df[:74067]

    # feature_train_df.drop('doc2vec_search_term_vector', axis=1, inplace=True)
    # feature_train_df.drop('doc2vec_product_title_vector', axis=1, inplace=True)
    # feature_train_df.drop('doc2vec_product_brand_vector', axis=1, inplace=True)
Example 3
    def getFeature(self, train_query_df, product_df, attribute_df, test_query_df,
                   features="brand,attribute,spelling,nonascii,stopwords,colorExist,color_onehot,brandExist,wmdistance,stemming,word2vec,Word2VecQueryExpansion,tfidf,tfidf_expandedquery,doc2vec,doc2vec_expandedquery,bm25,bm25expandedquery,doclength"):
        ## Please feel free to add features to this method.
        ## For testing, you may want to comment out some feature generation
        ## steps to save time, as some take a long time to run.
        ## Note: features are matched by substring (str.find), so enabling
        ## e.g. "bm25expandedquery" also enables the plain "bm25" block.

        timetracker=Utility()
        if features.find("brand") != -1:
            # Create Brand Column
            product_df = self.__createBrandColumn(product_df, attribute_df)

        if features.find("attribute") != -1:
            # Create Attribute column as a JSON string
            # Column name is attr_json
            product_df = self.__createAttributeColumn(product_df, attribute_df)

        if features.find("spelling") != -1:
            # Perform spell correction on search_term
            print("Performing spell correction")
            spell_dict = Feature_Spelling.getSpellingCorrectionDict()
            # print(self.__spell_correction('lifeswivel', spell_dict))
            train_query_df['search_term'] = train_query_df['search_term'].map(
                lambda x: self.__spell_correction(x, spell_dict))
            product_df['product_description'] = product_df['product_description'].map(
                lambda x: self.__spell_correction(x, spell_dict))
            product_df['product_title'] = product_df['product_title'].map(
                lambda x: self.__spell_correction(x, spell_dict))
            product_df['attr_json'] = product_df['attr_json'].map(
                lambda x: self.__spell_correction(str(x), spell_dict))
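            # Assumption: __spell_correction most likely looks the text up in
            # the correction dictionary and falls back to the input; roughly:
            def _spell_correction_sketch(text, spell_dict):
                return spell_dict.get(text, text)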

        if features.find("nonascii") != -1:
            # Remove non-ascii characters
            print("Performing non-ascii removal")
            start_time = time.time()
            train_query_df['search_term'] = train_query_df['search_term'].map(lambda x: self.__nonascii_clean(x))
            print("Non-ascii clean on search_term took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            start_time = time.time()  # reset so the next timing is per-step, not cumulative
            product_df['product_title'] = product_df['product_title'].map(lambda x: self.__nonascii_clean(str(x)))
            print("Non-ascii clean on product_title took: %s minutes" % round(((time.time() - start_time) / 60), 2))

        # Run this to download the stopword list if you hit an error
        # nltk.download()

        if features.find("stopwords") != -1:
            # Stopwords removal
            print("Performing stopwords removal")
            start_time = time.time()
            train_query_df['search_term'] = train_query_df['search_term'].map(lambda x: self.__stopword_removal(x))
            print("stopwords removal on search_term took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            start_time = time.time()  # reset between steps so each timing is per-step
            product_df['product_title'] = product_df['product_title'].map(lambda x: self.__stopword_removal(str(x)))
            print("stopwords removal on product_title took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            start_time = time.time()
            product_df['product_description'] = product_df['product_description'].map(lambda x: self.__stopword_removal(str(x)))
            print("stopwords removal on product_description took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            start_time = time.time()
            product_df['attr_json'] = product_df['attr_json'].map(lambda x: self.__stopword_removal(str(x)))
            print("stopwords removal on attr_json took: %s minutes" % round(((time.time() - start_time) / 60), 2))

        if features.find("colorExist") != -1:
            # Check whether a color in search_term exists in the product_description column
            print("Performing color and material check")
            start_time = time.time()
            color = Feature_ColorMaterial()
            train_query_df['color'] = color.checkColorMaterialExists(train_query_df, product_df)
            train_query_df['color_exist'] = train_query_df['color'].map(lambda x: 1 if len(x)>0 else 0)
            # Save some memory. Change it to uint8
            train_query_df.color_exist = train_query_df.color_exist.astype(np.uint8)

            if features.find("color_onehot") != -1:
                train_query_df = self.__onehot_color(train_query_df)

            # Clean up unused column
            train_query_df.pop('color')
            print("Color and material check took: %s minutes" % round(((time.time() - start_time) / 60), 2))

        if features.find("brandExist") != -1:
            # Check whether the brand in search_term exists in the product_brand column
            print("Performing brand check")
            start_time = time.time()

            train_query_df['brand_exist'] = self.__brandExist(train_query_df, product_df)
            # train_query_df['brand_exist'] = train_query_df['search_term'].map(lambda x: 1 if len(x)>0 else 0)
            print("Brand check took: %s minutes" % round(((time.time() - start_time) / 60), 2))

        if features.find('wmdistance') != -1:
            print("Performing Word Mover Distance")
            start_time = time.time()

            wm = Feature_WordMoverDistance()
            train_query_df['wm_product_description'] = wm.getDistance(train_query_df, 'search_term',
                                                                      product_df, 'product_description')
            print("WMDistance for product_description took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            start_time = time.time()  # reset so each timing is per-step
            train_query_df['wm_product_title'] = wm.getDistance(train_query_df, 'search_term',
                                                                product_df, 'product_title')
            print("WMDistance for product_title took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            start_time = time.time()
            train_query_df['wm_product_brand'] = wm.getDistance(train_query_df, 'search_term',
                                                                product_df, 'product_brand')
            print("WMDistance for product_brand took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            start_time = time.time()
            train_query_df['wm_attr_json'] = wm.getDistance(train_query_df, 'search_term',
                                                            product_df, 'attr_json')
            print("WMDistance for attr_json took: %s minutes" % round(((time.time() - start_time) / 60), 2))

        if features.find("stemming") != -1:
            # Stemming
            print("Performing Stemming")
            start_time = time.time()
            train_query_df['search_term'] = train_query_df['search_term'].map(lambda x: self.__stemming(x))
            print("Stemming search_term took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            start_time = time.time()  # reset so each timing is per-step
            product_df['product_title'] = product_df['product_title'].map(lambda x: self.__stemming(str(x)))
            print("Stemming product_title took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            start_time = time.time()
            product_df['product_brand'] = product_df['product_brand'].map(lambda x: self.__stemming(str(x)))
            print("Stemming product_brand took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            start_time = time.time()
            product_df['product_description'] = product_df['product_description'].map(lambda x: self.__stemming(str(x)))
            print("Stemming product_description took: %s minutes" % round(((time.time() - start_time) / 60), 2))
            start_time = time.time()
            product_df['attr_json'] = product_df['attr_json'].map(lambda x: self.__stemming(str(x)))
            print("Stemming attr_json took: %s minutes" % round(((time.time() - start_time) / 60), 2))

        if features.find("word2vec") != -1:
            # Word2Vec
            print("===========Performing word2vec computation....this may take a while")
            timetracker.startTimeTrack()
            print("Merging product_title and description")
            print(list(product_df))
            product_df['content'] = product_df['product_title'].map(str) + " " + \
                                    product_df['product_description'].map(str) + " " + \
                                    product_df['product_brand'].map(str)
            timetracker.checkpointTimeTrack()
            print("Adding training query for that product id into the content")
            product_df = product_df.reset_index(drop=True)
            counter = 0
            for index, product in product_df.iterrows():
                # print("product:", product)
                productId = product['product_uid']
                # print("productId:",productId)
                df = train_query_df[train_query_df.product_uid == productId]
                # print("df:",df)
                searchterms = ""
                for index, row in df.iterrows():
                    searchterm = row['search_term']
                    searchterms = searchterms + " " + searchterm

                newString = product_df.iloc[counter]['content'] + " " + searchterms
                # DataFrame.set_value() was removed in pandas 1.0; use .at instead
                product_df.at[counter, 'content'] = newString

                counter = counter + 1

            timetracker.checkpointTimeTrack()
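            # The row-by-row loop above is O(products x queries). An equivalent
            # vectorized aggregation (a sketch, same column names) would also
            # serve the identical loops in the bm25* and pmi blocks below:
            def _append_queries_sketch(product_df, train_query_df):
                terms = (train_query_df.groupby('product_uid')['search_term']
                         .apply(' '.join))
                product_df = product_df.reset_index(drop=True)
                product_df['content'] = (product_df['content'].astype(str) + ' ' +
                                         product_df['product_uid'].map(terms).fillna(''))
                return product_df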

            w2v = Feature_Word2Vec.Feature_Word2Vec()
            print("Convert DF into sentences for word2vec processing")
            sentences = w2v.convertDFIntoSentences(product_df, 'content')
            timetracker.checkpointTimeTrack()
            print("Training word2vec")
            w2v.trainModel(sentences)
            timetracker.checkpointTimeTrack()
            print("Validating...this should give some results like sofa")
            print(w2v.getVectorFromWord('stool'))
            print(w2v.getSimilarWordVectors('stool', 5))
            print("===========Completed word2vec computation")

        ## WARNING: This block must run before the bm25expandedquery block.
        if features.find("Word2VecQueryExpansion") != -1:
            # Word2VecQueryExpansion
            print("===========Performing Word2VecQueryExpansion computation....this may take a super long time")
            timetracker.startTimeTrack()
            # print("Merging product_title and description")
            # print(list(product_df))
            # product_df['content']=product_df['product_title'].map(str) +" "+ \
            #                       product_df['product_description'].map(str) + " " + \
            #                       product_df['product_brand'].map(str)
            # product_df.head(1)
            print("Compute Word2VecQueryExpansion")
            w2cExpand = Word2VecQueryExpansion()
            timetracker.checkpointTimeTrack()
            # print("Remove merged column")
            # product_df=product_df.drop('content', axis=1)
            # For every training query, generate the expanded query
            print("Generate Word2VecQueryExpansion column")
            train_query_df = w2cExpand.computeExpandedQueryColumn(trainset=train_query_df,
                                                                  colName='Word2VecQueryExpansion')
            timetracker.checkpointTimeTrack()
            print("train_query_df:", list(train_query_df))
            print("train_query_df head:", train_query_df.head(1))
            print("Saving to csv")
            train_query_df.to_csv('../data.prune/train_query_with_Word2VecQueryExpansion.csv')
            timetracker.checkpointTimeTrack()
            print("===========Completed Word2VecQueryExpansion computation")

        if features.find("tfidf") != -1:
            # TF-IDF
            print("Performing TF-IDF")
            tfidf = Feature_TFIDF()
            train_query_df['tfidf_product_title'] = tfidf.getCosineSimilarity(train_query_df, 'search_term', product_df,
                                                                              'product_title')
            train_query_df['tfidf_product_brand'] = tfidf.getCosineSimilarity(train_query_df, 'search_term', product_df,
                                                                              'product_brand')
            train_query_df['tfidf_product_description'] = tfidf.getCosineSimilarity(train_query_df, 'search_term', product_df,
                                                                              'product_description')
            train_query_df['tfidf_attr_json'] = tfidf.getCosineSimilarity(train_query_df, 'search_term',
                                                                                    product_df,
                                                                                    'attr_json')
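            # Assumption: getCosineSimilarity presumably TF-IDF-encodes both
            # texts and takes the pairwise cosine; a scikit-learn sketch
            # (expects equal-length, row-aligned lists):
            def _tfidf_cosine_sketch(queries, docs):
                from sklearn.feature_extraction.text import TfidfVectorizer
                from sklearn.metrics.pairwise import cosine_similarity
                m = TfidfVectorizer().fit_transform(list(queries) + list(docs))
                q, d = m[:len(queries)], m[len(queries):]
                return cosine_similarity(q, d).diagonal()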
        if features.find("tfidf_expandedquery") != -1:
            # TF-IDF on expanded query
            print("Performing TF-IDF with expanded query")
            tfidf = Feature_TFIDF()
            train_query_df['tfidf_expanded_product_title'] = tfidf.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion', product_df,
                                                                              'product_title')
            train_query_df['tfidf_expanded_product_brand'] = tfidf.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion', product_df,
                                                                              'product_brand')
            train_query_df['tfidf_expanded_product_description'] = tfidf.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion', product_df,
                                                                              'product_description')
            train_query_df['tfidf_expanded_attr_json'] = tfidf.getCosineSimilarity(train_query_df, 'Word2VecQueryExpansion',
                                                                                    product_df,
                                                                                    'attr_json')

        if features.find("doc2vec") != -1:
            # Doc2Vec
            print("Performing Doc2Vec")
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_product_title'] = doc2vec.getCosineSimilarity(train_query_df, 'search_term', product_df,
                                                                              'product_title')
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_product_brand'] = doc2vec.getCosineSimilarity(train_query_df, 'search_term', product_df,
                                                                              'product_brand')
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_product_description'] = doc2vec.getCosineSimilarity(train_query_df, 'search_term', product_df,
                                                                              'product_description')
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_attr_json'] = doc2vec.getCosineSimilarity(train_query_df, 'search_term',
                                                                                        product_df,
                                                                                        'attr_json')
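            # Assumption: Feature_Doc2Vec presumably compares gensim Doc2Vec
            # inferred vectors; a per-pair sketch given a trained model:
            def _doc2vec_cosine_sketch(model, query, doc):
                import numpy as np
                qv = model.infer_vector(query.split())
                dv = model.infer_vector(doc.split())
                return float(np.dot(qv, dv) / (np.linalg.norm(qv) * np.linalg.norm(dv)))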

        if features.find("doc2vec_expandedquery") != -1:
            # Doc2Vec
            print("Performing Doc2Vec with expanded query")
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_expanded_product_title'] = doc2vec.getCosineSimilarity(train_query_df,
                                                                                  'Word2VecQueryExpansion',
                                                                                  product_df,
                                                                                  'product_title')
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_expanded_product_brand'] = doc2vec.getCosineSimilarity(train_query_df,
                                                                                  'Word2VecQueryExpansion',
                                                                                  product_df,
                                                                                  'product_brand')
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_expanded_product_description'] = doc2vec.getCosineSimilarity(train_query_df,
                                                                                        'Word2VecQueryExpansion',
                                                                                        product_df,
                                                                                        'product_description')
            doc2vec = Feature_Doc2Vec()
            train_query_df['doc2vec_expanded_attr_json'] = doc2vec.getCosineSimilarity(train_query_df,
                                                                              'Word2VecQueryExpansion',
                                                                              product_df,
                                                                              'attr_json')

        if features.find("bm25") != -1:
            # BM25
            print("===========Performing BM25 computation....this may take a while")
            timetracker.startTimeTrack()
            print("Merging product_title and description")
            print(list(product_df))
            product_df['content']=product_df['product_title'].map(str) +" "+ \
                                  product_df['product_description'].map(str) + " " + \
                                  product_df['product_brand'].map(str)
            timetracker.checkpointTimeTrack()

            print("Adding training query for that product id into the content")
            product_df=product_df.reset_index(drop=True)
            counter=0
            for index,product in product_df.iterrows():
                # print("product:", product)
                productId=product['product_uid']
                # print("productId:",productId)
                df=train_query_df[train_query_df.product_uid==productId]
                # print("df:",df)
                searchterms=""
                for index,row in df.iterrows():
                    searchterm=row['search_term']
                    searchterms=searchterms+" "+searchterm

                newString=product_df.iloc[counter]['content']+" "+searchterms
                # .set_value() was removed in pandas 1.0; use .at instead
                product_df.at[counter, 'content'] = newString

                counter=counter+1

            timetracker.checkpointTimeTrack()

            print("Compute BM25")
            bm25 = Feature_BM25(product_df)
            timetracker.checkpointTimeTrack()
            print("Remove merged column")
            product_df=product_df.drop('content', axis=1)
            #For every training query-document pair, generate bm25
            print("Generate bm25 column")
            train_query_df=bm25.computeBM25Column(trainset=train_query_df,destColName='bm25', searchTermColname='search_term')
            timetracker.checkpointTimeTrack()
            print("train_query_df:",list(train_query_df))
            print("train_query_df head:",train_query_df.head(1))
            print("Saving to csv")
            train_query_df.to_csv('../data.prune/train_query_with_bm25_search_term.csv')
            timetracker.checkpointTimeTrack()
            print("===========Completed BM25 computation")

        if features.find("bm25expandedquery") != -1:
            if features.find("Word2VecQueryExpansion") != -1:
                # bm25expandedquery
                print("===========Performing BM25expanded computation....this may take a while")
                timetracker.startTimeTrack()
                print("Merging product_title and description")
                print(list(product_df))
                product_df['content']=product_df['product_title'].map(str) +" "+ \
                                      product_df['product_description'].map(str) + " " + \
                                      product_df['product_brand'].map(str)
                product_df.head(1)
                timetracker.checkpointTimeTrack()

                print("Adding training query for that product id into the content")
                product_df = product_df.reset_index(drop=True)
                counter = 0
                for index, product in product_df.iterrows():
                    # print("product:", product)
                    productId = product['product_uid']
                    # print("productId:",productId)
                    df = train_query_df[train_query_df.product_uid == productId]
                    # print("df:",df)
                    searchterms = ""
                    for index, row in df.iterrows():
                        searchterm = row['search_term']
                        searchterms = searchterms + " " + searchterm

                    newString = product_df.iloc[counter]['content'] + " " + searchterms
                    # .set_value() was removed in pandas 1.0; use .at instead
                    product_df.at[counter, 'content'] = newString

                    counter = counter + 1

                timetracker.checkpointTimeTrack()


                print("Compute BM25")
                bm25 = Feature_BM25(product_df)
                timetracker.checkpointTimeTrack()
                print("Remove merged column")
                product_df=product_df.drop('content', axis=1)
                #For every training query-document pair, generate bm25
                print("Generate bm25 column")
                train_query_df=bm25.computeBM25Column(trainset=train_query_df,destColName='bm25expandedquery', searchTermColname='Word2VecQueryExpansion')
                timetracker.checkpointTimeTrack()
                print("train_query_df:",list(train_query_df))
                print("train_query_df head:",train_query_df.head(1))
                print("Saving to csv")
                train_query_df.to_csv('../data.prune/train_query_with_bm25_Word2VecQueryExpansion.csv')
                timetracker.checkpointTimeTrack()
                print("===========Completed BM25expanded computation")
            else:
                print("ERROR: Cannot proceed with bm25expandedquery. Word2VecQueryExpansion is not enabled. It is a prerequisite of bm25expandedquery.")


        if features.find("bm25description") != -1:
            if features.find("Word2VecQueryExpansion") != -1:
                # bm25description
                print("===========Performing bm25description computation....this may take a while")
                timetracker.startTimeTrack()
                print(list(product_df))
                # product_df['content']=product_df['product_title'].map(str) +" "+ \
                #                       product_df['product_description'].map(str) + " " + \
                #                       product_df['product_brand'].map(str)
                product_df['content']=product_df['product_description'].map(str)

                product_df.head(1)
                timetracker.checkpointTimeTrack()

                print("Adding training query for that product id into the content")
                product_df = product_df.reset_index(drop=True)
                counter = 0
                for index, product in product_df.iterrows():
                    # print("product:", product)
                    productId = product['product_uid']
                    # print("productId:",productId)
                    df = train_query_df[train_query_df.product_uid == productId]
                    # print("df:",df)
                    searchterms = ""
                    for index, row in df.iterrows():
                        searchterm = row['search_term']
                        searchterms = searchterms + " " + searchterm

                    newString = product_df.iloc[counter]['content'] + " " + searchterms
                    # .set_value() was removed in pandas 1.0; use .at instead
                    product_df.at[counter, 'content'] = newString

                    counter = counter + 1

                timetracker.checkpointTimeTrack()


                print("Compute BM25")
                bm25 = Feature_BM25(product_df)
                timetracker.checkpointTimeTrack()
                print("Remove merged column")
                product_df=product_df.drop('content', axis=1)
                #For every training query-document pair, generate bm25
                print("Generate bm25 column")
                train_query_df=bm25.computeBM25Column(trainset=train_query_df,destColName='bm25description', searchTermColname='Word2VecQueryExpansion')
                timetracker.checkpointTimeTrack()
                print("train_query_df:",list(train_query_df))
                print("train_query_df head:",train_query_df.head(1))
                print("Saving to csv")
                # Feature-specific filename so the bm25expandedquery output is not overwritten
                train_query_df.to_csv('../data.prune/train_query_with_bm25description.csv')
                timetracker.checkpointTimeTrack()
                print("===========Completed bm25description computation")
            else:
                print("ERROR: Cannot proceed with bm25description. Word2VecQueryExpansion is not enabled. It is a prerequisite of bm25expandedquery.")


        if features.find("bm25title") != -1:
            if features.find("Word2VecQueryExpansion") != -1:
                # bm25title
                print("===========Performing bm25title computation....this may take a while")
                timetracker.startTimeTrack()
                print(list(product_df))
                # product_df['content']=product_df['product_title'].map(str) +" "+ \
                #                       product_df['product_description'].map(str) + " " + \
                #                       product_df['product_brand'].map(str)
                product_df['content']=product_df['product_title'].map(str)

                product_df.head(1)
                timetracker.checkpointTimeTrack()

                print("Adding training query for that product id into the content")
                product_df = product_df.reset_index(drop=True)
                counter = 0
                for index, product in product_df.iterrows():
                    # print("product:", product)
                    productId = product['product_uid']
                    # print("productId:",productId)
                    df = train_query_df[train_query_df.product_uid == productId]
                    # print("df:",df)
                    searchterms = ""
                    for index, row in df.iterrows():
                        searchterm = row['search_term']
                        searchterms = searchterms + " " + searchterm

                    newString = product_df.iloc[counter]['content'] + " " + searchterms
                    # .set_value() was removed in pandas 1.0; use .at instead
                    product_df.at[counter, 'content'] = newString

                    counter = counter + 1

                timetracker.checkpointTimeTrack()


                print("Compute BM25")
                bm25 = Feature_BM25(product_df)
                timetracker.checkpointTimeTrack()
                print("Remove merged column")
                product_df=product_df.drop('content', axis=1)
                #For every training query-document pair, generate bm25
                print("Generate bm25 column")
                train_query_df=bm25.computeBM25Column(trainset=train_query_df,destColName='bm25title', searchTermColname='Word2VecQueryExpansion')
                timetracker.checkpointTimeTrack()
                print("train_query_df:",list(train_query_df))
                print("train_query_df head:",train_query_df.head(1))
                print("Saving to csv")
                # Feature-specific filename so earlier bm25 outputs are not overwritten
                train_query_df.to_csv('../data.prune/train_query_with_bm25title.csv')
                timetracker.checkpointTimeTrack()
                print("===========Completed bm25title computation")
            else:
                print("ERROR: Cannot proceed with bm25title. Word2VecQueryExpansion is not enabled. It is a prerequisite of bm25expandedquery.")


        if features.find("bm25brand") != -1:
            if features.find("Word2VecQueryExpansion") != -1:
                # bm25brand
                print("===========Performing bm25brand computation....this may take a while")
                timetracker.startTimeTrack()
                print(list(product_df))
                # product_df['content']=product_df['product_title'].map(str) +" "+ \
                #                       product_df['product_description'].map(str) + " " + \
                #                       product_df['product_brand'].map(str)
                product_df['content']=product_df['product_brand'].map(str)

                product_df.head(1)
                timetracker.checkpointTimeTrack()

                print("Adding training query for that product id into the content")
                product_df = product_df.reset_index(drop=True)
                counter = 0
                for index, product in product_df.iterrows():
                    # print("product:", product)
                    productId = product['product_uid']
                    # print("productId:",productId)
                    df = train_query_df[train_query_df.product_uid == productId]
                    # print("df:",df)
                    searchterms = ""
                    for index, row in df.iterrows():
                        searchterm = row['search_term']
                        searchterms = searchterms + " " + searchterm

                    newString = product_df.iloc[counter]['content'] + " " + searchterms
                    # .set_value() was removed in pandas 1.0; use .at instead
                    product_df.at[counter, 'content'] = newString

                    counter = counter + 1

                timetracker.checkpointTimeTrack()


                print("Compute BM25")
                bm25 = Feature_BM25(product_df)
                timetracker.checkpointTimeTrack()
                print("Remove merged column")
                product_df=product_df.drop('content', axis=1)
                #For every training query-document pair, generate bm25
                print("Generate bm25 column")
                train_query_df=bm25.computeBM25Column(trainset=train_query_df,destColName='bm25brand', searchTermColname='Word2VecQueryExpansion')
                timetracker.checkpointTimeTrack()
                print("train_query_df:",list(train_query_df))
                print("train_query_df head:",train_query_df.head(1))
                print("Saving to csv")
                # Feature-specific filename so earlier bm25 outputs are not overwritten
                train_query_df.to_csv('../data.prune/train_query_with_bm25brand.csv')
                timetracker.checkpointTimeTrack()
                print("===========Completed bm25brand computation")
            else:
                print("ERROR: Cannot proceed with bm25brand. Word2VecQueryExpansion is not enabled. It is a prerequisite of bm25expandedquery.")



        if features.find("doclength") != -1:
            # Document Length
            print("Performing Document Length")
            product_df['len_product_title'] = product_df['product_title'].map(lambda x: len(homedepotTokeniser(x)))
            train_query_df = pd.merge(train_query_df, product_df[['product_uid', 'len_product_title']], how='left',
                                      on='product_uid')
            product_df['len_product_description'] = product_df['product_description'].map(lambda x: len(homedepotTokeniser(x)))
            train_query_df = pd.merge(train_query_df, product_df[['product_uid', 'len_product_description']], how='left',
                                      on='product_uid')
            product_df['len_brand'] = product_df['product_brand'].map(lambda x: len(homedepotTokeniser(x)))
            train_query_df = pd.merge(train_query_df, product_df[['product_uid', 'len_brand']], how='left',
                                      on='product_uid')
            train_query_df['len_search_term'] = train_query_df['search_term'].map(lambda x: len(homedepotTokeniser(x)))

        if features.find("pmi") != -1:
            print("===========Performing pmi computation....this may take a while")
            timetracker.startTimeTrack()
            print(list(product_df))
            product_df['content'] = product_df['product_title'].map(str) + " " + \
                                    product_df['product_description'].map(str)

            timetracker.checkpointTimeTrack()

            print("Adding training query for that product id into the content")

            product_df = product_df.reset_index(drop=True)
            counter = 0
            for index, product in product_df.iterrows():
                # print("product:", product)
                productId = product['product_uid']
                # print("productId:",productId)
                df = train_query_df[train_query_df.product_uid == productId]
                # print("df:",df)
                searchterms = ""
                for index, row in df.iterrows():
                    searchterm = row['search_term']
                    searchterms = searchterms + " " + searchterm

                newString = product_df.iloc[counter]['content'] + " " + searchterms
                # .set_value() was removed in pandas 1.0; use .at instead
                product_df.at[counter, 'content'] = newString

                counter = counter + 1
            timetracker.checkpointTimeTrack()

            # Concatenate all product content into a single corpus string
            text = product_df['content'].str.cat(sep=' ')
            pmiFeature = Feature_PMI.Feature_PMI(text)
            # print("PMI 'kitchen','cabinet': ", pmiFeature.computePMI('kitchen', 'cabinet'))
            train_query_df = pmiFeature.computePMIColumn(trainset=train_query_df)
            # print(list(train_query_df), "\n", train_query_df['pmi'])
            # train_query_df.filter(items=['id', 'pmi']).to_csv('pmi_features.csv')
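            # For reference, PMI(x, y) = log( p(x, y) / (p(x) * p(y)) ).
            # A hypothetical count-based estimate over the corpus tokens
            # (Feature_PMI's actual estimator may differ):
            def _pmi_sketch(tokens, x, y, window=5):
                import math
                from collections import Counter
                uni = Counter(tokens)
                pairs = Counter()
                for i, tok in enumerate(tokens):
                    for other in tokens[i + 1:i + window]:
                        pairs[(min(tok, other), max(tok, other))] += 1
                p_xy = pairs[(min(x, y), max(x, y))] / max(sum(pairs.values()), 1)
                if p_xy == 0:
                    return float('-inf')
                n = len(tokens)
                return math.log(p_xy / ((uni[x] / n) * (uni[y] / n)))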

        print("train_query_df final column:\n", train_query_df.info())

        return train_query_df
Example 4
def exeFMBidModel(testDF=None, validateDF=None, trainDF=None, trainReader=None, validationReader=None, testReader=None, writeResult2CSV=False):
    print("============ Factorisation Machine bid model....setting up")

    timer = Utility()
    timer.startTimeTrack()

    print("Getting encoded datasets")
    trainOneHotData, trainY = trainReader.getOneHotData()
    # Index.get_values() was removed in pandas 1.0; .tolist() works directly on columns
    validationOneHotData, valY = validationReader.getOneHotData(train_cols=trainOneHotData.columns.tolist())
    testOneHotData, testY = testReader.getOneHotData(train_cols=trainOneHotData.columns.tolist())
    timer.checkpointTimeTrack()
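    # Assumption: getOneHotData(train_cols=...) presumably one-hot encodes and
    # then aligns validation/test to the training schema; a pandas sketch:
    def _one_hot_aligned_sketch(df, cat_cols, train_cols=None):
        import pandas as pd
        encoded = pd.get_dummies(df, columns=cat_cols)
        if train_cols is not None:
            # Columns unseen in training drop; missing ones become all-zero
            encoded = encoded.reindex(columns=train_cols, fill_value=0)
        return encoded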

    print("trainOneHotData:",trainOneHotData.shape,list(trainOneHotData))
    print("trainY:", trainY.shape, list(trainY))
    print("validationOneHotData:",validationOneHotData.shape,list(validationOneHotData))
    print("valY:", valY.shape, list(valY))

    fmBidModel = FMBidModel.FMBidModel(cBudget=6250 * 1000, modelType='fmclassificationsgd')
    print("==========Training starts")
    # fmBidModel.gridSearchandCrossValidateFastSGD(trainOneHotData, trainY)
    # timer.checkpointTimeTrack()

    fmBidModel.trainModel(trainOneHotData,trainY, retrain=True, modelFile="data.pruned/fmclassificationsgd.pkl")
    timer.checkpointTimeTrack()

    print("==========Validation starts")
    predictedProb=fmBidModel.validateModel(validationOneHotData, valY)
    timer.checkpointTimeTrack()

    # print("==========Bid optimisation starts")
    # fmBidModel.optimiseBid(validationOneHotData,valY)
    # timer.checkpointTimeTrack()

    # best score      0.3683528286042599
    # noBidThreshold  2.833333e-01
    # minBid          2.000000e+02
    # bidRange        9.000000e+01
    # sigmoidDegree  -1.000000e+01
    # won             3.432900e+04
    # click           1.380000e+02
    # spend           2.729869e+06
    # trimmed_bids    0.000000e+00
    # CTR             4.019925e-03
    # CPM             7.952078e+04
    # CPC             1.978166e+04
    # blended_score   3.683528e-01

    # best score      0.3681133881545131
    # noBidThreshold  2.833333e-01
    # minBid          2.000000e+02
    # bidRange        1.000000e+02
    # sigmoidDegree  -1.000000e+01
    # won             3.449900e+04
    # click           1.380000e+02
    # spend           2.758561e+06
    # trimmed_bids    0.000000e+00
    # CTR             4.000116e-03
    # CPM             7.996061e+04
    # CPC             1.998957e+04
    # blended_score   3.681134e-01


    # New budget      6250000
    # FM
    # best score      0.32755084132163526
    # noBidThreshold  8.666667e-01
    # minBid          2.000000e+02
    # bidRange        2.500000e+02
    # sigmoidDegree  -1.000000e+01
    # won             1.461000e+04
    # click           1.170000e+02
    # spend           1.124960e+06
    # trimmed_bids    0.000000e+00
    # CTR             8.008214e-03
    # CPM             7.699932e+04
    # CPC             9.615043e+03
    # blended_score   3.275508e-01

    # print("==========Getting  bids")
    ## 25000 budget
    # bidIdPriceDF=fmBidModel.getBidPrice(validationOneHotData,valY,noBidThreshold=0.2833333,minBid=200,bidRange=100,sigmoidDegree=-10)
    ## 6250 budget
    # bidIdPriceDF=fmBidModel.getBidPrice(validationOneHotData,valY,noBidThreshold=0.8666667,minBid=200,bidRange=250,sigmoidDegree=-10)
    # print("bidIdPriceDF:",bidIdPriceDF.shape, list(bidIdPriceDF))
    # bidIdPriceDF.to_csv("mybids.csv")
    # timer.checkpointTimeTrack()

    return predictedProb
Example 5
                    data=predicted)

        else:
            print("Error: No model was trained in this instance....")

        return predictedProb[:, 1]  # probability of the positive (click) class


if __name__ == "__main__":

    trainset = "data.final/train1_cleaned_prune.csv"
    validationset = "data.final/validation_cleaned.csv"
    testset = "data.final/test.csv"

    print("Reading dataset...")
    timer = Utility()
    timer.startTimeTrack()

    trainReader = ipinyouReader.ipinyouReader(trainset)
    validationReader = ipinyouReader.ipinyouReader(validationset)
    testReader = ipinyouReader.ipinyouReader(testset)
    timer.checkpointTimeTrack()
    print("Getting encoded datasets")
    trainOneHotData, trainY = trainReader.getOneHotData()
    # Index.get_values() was removed in pandas 1.0; .tolist() works directly on columns
    validationOneHotData, valY = validationReader.getOneHotData(
        train_cols=trainOneHotData.columns.tolist())
    testOneHotData, testY = testReader.getOneHotData(
        train_cols=trainOneHotData.columns.tolist())
    timer.checkpointTimeTrack()

    print("trainOneHotData:", trainOneHotData.shape, list(trainOneHotData))