Code Example #1
def state_null(col, state):
    """Fill a missing state code by inspecting the port-name string in `col`.

    Placeholder codes are returned for missing, collapsed, or unknown ports;
    a state abbreviation is returned when the port name embeds one; otherwise
    the existing state value is kept.
    """
    if state is None:
        if col.startswith("No PORT"):
            return "NPRT"
        elif col.startswith("Collapsed"):
            return "CPRT"
        # matches the literal (misspelled) token "UNKOWN" as written
        elif "UNKOWN" in col or "UNIDENTIFIED" in col:
            return "UNKWN"
        elif "WASHINGTON DC" in col:
            return "DC"
        elif "MARIPOSA AZ" in col:
            return "AZ"
        # no match: leave the state missing
        return None
    else:
        return state
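The helper above can be applied row by row; a minimal PySpark usage sketch (hedged: the session setup, the sample rows, and the column names PORT_NAME / PORT_STATE are assumptions, not taken from the original) wraps it in a UDF to fill missing state codes from the port-name string:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

spark = SparkSession.builder.appName("state_null_demo").getOrCreate()

# Hypothetical sample data; the real column names may differ.
ports = spark.createDataFrame(
    [("No PORT CODE", None), ("WASHINGTON DC AREA", None), ("SEATTLE WA", "WA")],
    ["PORT_NAME", "PORT_STATE"])

fill_state = udf(state_null, StringType())
ports = ports.withColumn("PORT_STATE",
                         fill_state(col("PORT_NAME"), col("PORT_STATE")))
ports.show()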
Code Example #2
    def create_tag_frequencies(self, dataframe):
        """Produces a PySpark dataframe containing a column representing the total frequency of the tags by record.

        The frequency of tags is determined by their proportion of the total number of tags in the dataframe.

        :param dataframe: the PySpark dataframe
        :returns: the PySpark dataframe containing the tag frequency field and all fields in the supplied dataframe
        """
        df_tags = dataframe.selectExpr("tag1 AS tag").union(dataframe.selectExpr("tag2 AS tag")).union(dataframe.selectExpr("tag3 AS tag")) \
                           .union(dataframe.selectExpr("tag4 AS tag")).union(dataframe.selectExpr("tag5 AS tag"))
        df_tags = df_tags.na.drop(subset=["tag"])
        tags_total_count = df_tags.count()
        print("Total number of tags used, including duplicates:",tags_total_count)
        df_tag_freq = df_tags.groupBy("tag").count().orderBy(desc("count"))
        df_tag_freq = df_tag_freq.withColumn("frequency", col("count")/tags_total_count)
        df_tag_freq.orderBy(desc("frequency")).show(10)

        def one_hot_encode_top_n_tags(dataframe,n):
            """Produces a PySpark dataframe containing columns indicating whether each of the top n tags are present.

            :param dataframe: the PySpark dataframe 
            :param n: the number of the top ranked tags to return as tag fields
            :returns: the PySpark dataframe containing the top n tag fields and all fields in the supplied dataframe
            """
            top_n = [t.tag for t in df_tag_freq.orderBy(desc("frequency")).select("tag").limit(n).collect()]
            for tag in top_n:
                # replace tag name ".net" with "dotnet", for example, to avoid problems with periods in tag names
                tag_column_name = ("tag_"+tag).replace(".","dot")
                dataframe = dataframe.withColumn(tag_column_name, array_contains(dataframe.tags_split, tag).cast("int"))
            return dataframe

        dataframe = one_hot_encode_top_n_tags(dataframe,20)
        tag_columns = [col for col in dataframe.columns if col.startswith('tag')]

        print("Tag-related columns")
        dataframe.select(tag_columns).show(10,False)

        dataframe.createOrReplaceTempView('df')
        df_tag_freq.createOrReplaceTempView('df_tag_freq')

        for n in range(1,6):
            dataframe = self.sqlContext.sql("SELECT df.*, df_tag_freq.frequency AS frequency_tag{} FROM df LEFT JOIN df_tag_freq ON df.tag{} = df_tag_freq.tag".format(n,n))
            dataframe = dataframe.na.fill({"frequency_tag{}".format(n): 0})
            dataframe.createOrReplaceTempView('df')

        dataframe = dataframe.withColumn("frequency_sum", col("frequency_tag1")+col("frequency_tag2")+col("frequency_tag3")+col("frequency_tag4")+col("frequency_tag5"))

        # Remove temporary columns
        dataframe = dataframe.select([c for c in dataframe.columns
                                      if c not in {"tags_split", "tag1", "tag2", "tag3", "tag4", "tag5",
                                                   "frequency_tag1", "frequency_tag2", "frequency_tag3",
                                                   "frequency_tag4", "frequency_tag5"}])
        return dataframe
Code Example #3
    def create_tag_frequencies(self, dataframe):
        """Produces a PySpark dataframe containing a column representing the total frequency of the tags by record.

        The frequency of tags is determined by their proportion of the total number of tags in the dataframe.

        :param dataframe: the PySpark dataframe
        :returns: the PySpark dataframe containing the tag frequency field and all fields in the supplied dataframe
        """
        df_tags = dataframe.selectExpr("tag1 AS tag").union(dataframe.selectExpr("tag2 AS tag")).union(dataframe.selectExpr("tag3 AS tag")) \
                           .union(dataframe.selectExpr("tag4 AS tag")).union(dataframe.selectExpr("tag5 AS tag"))
        df_tags = df_tags.na.drop(subset=["tag"])
        tags_total_count = df_tags.count()
        print("Total number of tags used, including duplicates:",
              tags_total_count)
        df_tag_freq = df_tags.groupBy("tag").count().orderBy(desc("count"))
        df_tag_freq = df_tag_freq.withColumn("frequency",
                                             col("count") / tags_total_count)
        df_tag_freq.orderBy(desc("frequency")).show(10)

        def one_hot_encode_top_n_tags(dataframe, n):
            """Produces a PySpark dataframe containing columns indicating whether each of the top n tags are present.

            :param dataframe: the PySpark dataframe 
            :param n: the number of the top ranked tags to return as tag fields
            :returns: the PySpark dataframe containing the top n tag fields and all fields in the supplied dataframe
            """
            top_n = [
                t.tag for t in df_tag_freq.orderBy(desc("frequency")).select(
                    "tag").limit(n).collect()
            ]
            for tag in top_n:
                # replace tag name ".net" with "dotnet", for example, to avoid problems with periods in tag names
                tag_column_name = ("tag_" + tag).replace(".", "dot")
                dataframe = dataframe.withColumn(
                    tag_column_name,
                    array_contains(dataframe.tags_split, tag).cast("int"))
            return dataframe

        dataframe = one_hot_encode_top_n_tags(dataframe, 20)
        tag_columns = [
            col for col in dataframe.columns if col.startswith('tag')
        ]

        print("Tag-related columns")
        dataframe.select(tag_columns).show(10, False)

        dataframe.createOrReplaceTempView('df')
        df_tag_freq.createOrReplaceTempView('df_tag_freq')

        for n in range(1, 6):
            dataframe = self.sqlContext.sql(
                "SELECT df.*, df_tag_freq.frequency AS frequency_tag{} FROM df LEFT JOIN df_tag_freq ON df.tag{} = df_tag_freq.tag"
                .format(n, n))
            dataframe = dataframe.na.fill({"frequency_tag{}".format(n): 0})
            dataframe.createOrReplaceTempView('df')

        dataframe = dataframe.withColumn(
            "frequency_sum",
            col("frequency_tag1") + col("frequency_tag2") +
            col("frequency_tag3") + col("frequency_tag4") +
            col("frequency_tag5"))

        # Remove temporary columns
        dataframe = dataframe.select([c for c in dataframe.columns
                                      if c not in {"tags_split", "tag1", "tag2", "tag3", "tag4", "tag5",
                                                   "frequency_tag1", "frequency_tag2", "frequency_tag3",
                                                   "frequency_tag4", "frequency_tag5"}])
        return dataframe
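A hedged input sketch for create_tag_frequencies (the session, the sample rows, and deriving tags_split from a pipe-delimited string are assumptions): the method expects a tags_split array column alongside tag1 through tag5, and the enclosing object must expose self.sqlContext for the temporary-view joins.

from pyspark.sql import SparkSession
from pyspark.sql.functions import split

spark = SparkSession.builder.appName("tag_frequency_demo").getOrCreate()

# Hypothetical raw rows: one pipe-delimited tag string per record.
raw = spark.createDataFrame(
    [(1, "python|pandas|spark"), (2, "python|sql|etl|airflow|docker")],
    ["id", "tags_raw"])

# tags_split as an array column; tag1..tag5 as its first five entries (null when absent).
df = raw.withColumn("tags_split", split("tags_raw", r"\|"))
for i in range(1, 6):
    df = df.withColumn("tag{}".format(i), df.tags_split.getItem(i - 1))
df.show(truncate=False)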
Code Example #4
from pyspark.sql import DataFrame
from pyspark.sql.functions import struct


def group_df_into_struct(df: DataFrame, colfamily: str, key: str) -> DataFrame:
    """Group columns of a df into a struct column

    *Note*
    Currently, the dataframe is transformed by splitting it into
    two dataframes, reshaping one of them and then using a join.
    This might consume more resources than necessary and should be
    optimized in the future if required.

    If we have a df with the following schema:
    root
     |-- objectId: string (nullable = true)
     |-- candidate_ra: double (nullable = true)
     |-- candidate_dec: double (nullable = true)

    and we want to group all `candidate_*` into a struct like:
    root
     |-- objectId: string (nullable = true)
     |-- candidate: struct (nullable = false)
     |    |-- ra: double (nullable = true)
     |    |-- dec: double (nullable = true)

    Parameters
    ----------
    df: Spark DataFrame
        a Spark dataframe with flat columns
    colfamily: str
        prefix of columns to be grouped into a struct
    key: str
        a column with unique values (used for join)

    Returns
    ----------
    df: Spark DataFrame
        a Spark dataframe with columns grouped into struct

    Examples
    ----------
    >>> df = spark.sparkContext.parallelize(zip(
    ...     ["ZTF18aceatkx", "ZTF18acsbjvw"],
    ...     [697251923115015002, 697251921215010004],
    ...     [20.393772, 20.4233877],
    ...     [-25.4669463, -27.0588511],
    ...     ["Star", "Unknown"])).toDF([
    ...       "objectId", "candid", "candidate_ra",
    ...       "candidate_dec", "cross_match_alerts_per_batch"])
    >>> df.printSchema()
    root
     |-- objectId: string (nullable = true)
     |-- candid: long (nullable = true)
     |-- candidate_ra: double (nullable = true)
     |-- candidate_dec: double (nullable = true)
     |-- cross_match_alerts_per_batch: string (nullable = true)
    <BLANKLINE>

    >>> df = group_df_into_struct(df, 'candidate', 'objectId')
    >>> df.printSchema()
    root
     |-- objectId: string (nullable = true)
     |-- candid: long (nullable = true)
     |-- cross_match_alerts_per_batch: string (nullable = true)
     |-- candidate: struct (nullable = false)
     |    |-- ra: double (nullable = true)
     |    |-- dec: double (nullable = true)
    <BLANKLINE>

    """
    struct_cols = []
    flat_cols = []

    pos = len(colfamily) + 1

    for col in df.columns:
        if col.startswith(colfamily + "_"):
            struct_cols.append(col)
        else:
            flat_cols.append(col)

    # dataframe with columns other than 'columnFamily_*'
    df1 = df.select(flat_cols)

    new_col_names = [key]

    # dataframe with key + 'columnFamily_*'
    df2 = df.select(new_col_names + struct_cols)

    struct_cols = [x[pos:] for x in struct_cols]

    new_col_names.extend(struct_cols)
    df2_renamed = df2.toDF(*new_col_names)

    # Group 'columnFamily_*' into a struct
    df2_struct = df2_renamed.select(key, struct(*struct_cols).alias(colfamily))

    # join the two dataframes based on 'key'
    df_new = df1.join(df2_struct, key)

    return df_new
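The *Note* in the docstring flags the split/reshape/join approach as potentially costly. A hedged, join-free sketch of the same grouping (an alternative written here, not the project's implementation; struct nullability may differ slightly from the join version) needs only struct and drop:

from pyspark.sql import DataFrame
from pyspark.sql.functions import struct

def group_df_into_struct_nojoin(df: DataFrame, colfamily: str) -> DataFrame:
    """Nest all `colfamily_*` columns into one struct column without a join."""
    prefix = colfamily + "_"
    nested = [c for c in df.columns if c.startswith(prefix)]
    # Alias each field with the prefix stripped, then drop the flat originals.
    fields = [df[c].alias(c[len(prefix):]) for c in nested]
    return df.withColumn(colfamily, struct(*fields)).drop(*nested)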
Code Example #5
from pyspark.ml.feature import (CountVectorizer, StandardScaler, StringIndexer,
                                VectorAssembler)
from pyspark.sql.functions import collect_list


def preprocess_files(spark):
    lyrics_file = 'hdfs:/user/yh2857/lyrics_processed.parquet'
    features_file = 'hdfs:/user/yh2857/features.parquet'
    metadata_file = 'hdfs:/user/yh2857/metadata.parquet'
    tags_file = 'hdfs:/user/yh2857/tags.parquet'

    tags = spark.read.parquet(tags_file)
    tags_agg = tags.groupby('item_index').agg(
        collect_list('tag').alias('tags'))
    cv = CountVectorizer(inputCol="tags",
                         outputCol="keywords",
                         vocabSize=5000,
                         minDF=2.0)
    model = cv.fit(tags_agg)
    tags_agg = model.transform(tags_agg).withColumnRenamed(
        'item_index', 'tags_item_index')
    tags_agg.limit(5).show()
    print("tags feature info ", tags_agg.count(), tags_agg)
    # tmp = tags.groupBy("tag").count().sort(desc("count"))
    # feature_words = tmp.where(col('count') >= 100).select('tag')
    # huge_df = tags.join(feature_words, feature_words.tag == tags.tag, "left_outer")

    metadata = spark.read.parquet(metadata_file)
    indexer = StringIndexer(inputCol="artist_id", outputCol="artist_idx")
    metadata = indexer.fit(metadata).transform(metadata).withColumnRenamed(
        'item_index', 'meta_item_index')
    metadata.limit(5).show()
    print("metadata features info ", metadata.count(), metadata)

    features = spark.read.parquet(features_file)
    features.limit(5).show()
    print("features data info ", features.count(), features)

    lyrics = spark.read.parquet(lyrics_file).withColumnRenamed(
        'item_index', 'lyrics_item_index')
    lyrics.limit(5).show()
    print("lyrics data info ", lyrics.count(), lyrics)

    df = tags_agg.join(metadata,
                       tags_agg.tags_item_index == metadata.meta_item_index,
                       'inner')
    df = df.join(features, df.tags_item_index == features.item_index, 'inner')
    df = df.join(lyrics, df.tags_item_index == lyrics.lyrics_item_index,
                 'inner')
    df = df.select("item_index", 'keywords', 'artist_idx', 'year',
                   'artist_hotttnesss', 'artist_familiarity', 'duration',
                   'countvector', 'loudness_mean', 'loudness_std', 'timbre_00',
                   'timbre_01', 'timbre_02', 'timbre_03', 'timbre_04',
                   'timbre_05', 'timbre_06', 'timbre_07', 'timbre_08',
                   'timbre_09', 'timbre_10', 'timbre_11')

    filter_col = [col for col in features.columns if col.startswith('timbre')]
    filter_col += ([
        'loudness_std', 'loudness_mean', 'duration', 'artist_hotttnesss',
        'artist_familiarity', 'year'
    ])

    df_assembler = VectorAssembler(
        inputCols=filter_col,
        outputCol="features_tmp").setHandleInvalid('skip')
    scaler = StandardScaler(inputCol="features_tmp",
                            outputCol="features",
                            withStd=True,
                            withMean=False)
    tmp = df_assembler.transform(df)
    scalerModel = scaler.fit(tmp)
    fitted = scalerModel.transform(tmp)
    fitted = fitted.select("item_index", "features")
    fitted.limit(5).show()
    print(fitted.count(), fitted)

    filter_col += (['countvector', 'keywords'])
    df_assembler = VectorAssembler(
        inputCols=filter_col,
        outputCol="features_tmp").setHandleInvalid('skip')
    scaler = StandardScaler(inputCol="features_tmp",
                            outputCol="features",
                            withStd=True,
                            withMean=False)
    tmp = df_assembler.transform(df)
    scalerModel = scaler.fit(tmp)
    fitted2 = scalerModel.transform(tmp)
    fitted2 = fitted2.select("item_index", "features")
    fitted2.limit(5).show()
    print(fitted2.count(), fitted2)

    return fitted, fitted2
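A hedged calling sketch for preprocess_files (the app name, the variable names, and the printSchema calls are assumptions; the HDFS paths are hard-coded inside the function):

from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("msd_preprocessing").getOrCreate()
    # First return value: scaled numeric features only; second also includes
    # the countvector and keywords columns.
    numeric_features, combined_features = preprocess_files(spark)
    numeric_features.printSchema()
    combined_features.printSchema()
    spark.stop()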
Code Example #6
# Imports inferred from usage in this view; `PL` is presumed to be an alias for
# pyspark.ml.Pipeline, and sklearn.feature_extraction.stop_words is the
# pre-0.22 scikit-learn stop-word module.
import re

import pandas as pd
from flask import render_template, request
from nltk.stem import WordNetLemmatizer
from pyspark.ml import Pipeline as PL
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


def getvalues_and_recommend():
    userid = 2552
    shop1 = request.form['shop1']
    rate1 = float(request.form['rate1'])
    shop2 = request.form['shop2']
    rate2 = float(request.form['rate2'])
    shop3 = request.form['shop3']
    rate3 = float(request.form['rate3'])
    shop4 = request.form['shop4']
    rate4 = float(request.form['rate4'])
    shop5 = request.form['shop5']
    rate5 = float(request.form['rate5'])
    shop6 = request.form['shop6']
    rate6 = float(request.form['rate6'])
    shop7 = request.form['shop7']
    rate7 = float(request.form['rate7'])
    shop8 = request.form['shop8']
    rate8 = float(request.form['rate8'])
    shop9 = request.form['shop9']
    rate9 = float(request.form['rate9'])
    shop10 = request.form['shop10']
    rate10 = float(request.form['rate10'])

    #creating a new spark session
    newspark = SparkSession.builder.appName('hybrid_rec').getOrCreate()
    #reading in prepped dataset for model-based collaborative filtering recommendation
    mbcf = newspark.read.csv('mbcf.csv', header=True, inferSchema=True)
    #making a copy for each new user input
    mbcf_try = mbcf
    vals = [(shop1, rate1, userid), (shop2, rate2, userid), (shop3, rate3, userid),
            (shop4, rate4, userid), (shop5, rate5, userid), (shop6, rate6, userid),
            (shop7, rate7, userid), (shop8, rate8, userid), (shop9, rate9, userid),
            (shop10, rate10, userid)]
    #pyspark's convention for appending new rows to the end of an existing spark dataframe, step 1
    newRows = newspark.createDataFrame(vals, mbcf_try.columns)
    #pyspark's convention for appending new rows to the end of an existing spark dataframe, step 2
    mbcf_try = mbcf_try.union(newRows)
    #converting df to pandas df for easier manipulation later on...
    mbcf_try_pd = mbcf_try.toPandas()
    #getting a look again at the outlets and ratings provided by userid2552 so we know which outlets to exclude in recommending outlets to userid2552 later on...
    user_item_2552 = mbcf_try_pd[mbcf_try_pd['userids']==2552]
    #as part of ALS requirements for the feature columns to be in numerical format, am converting both shops and userids to the double precision format just in case (even though userids is already in a float format)
    indexer_try = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in list(set(mbcf_try.columns)-set(['ratings']))]
    pipeline_try = PL(stages=indexer_try)
    transformed_try = pipeline_try.fit(mbcf_try).transform(mbcf_try)
    #rank=300 and regParam=0.1 was a pair of tuned best params while retuning als with train test split stratified for userids...
    als = ALS(rank=300, regParam=0.1, maxIter=20, seed=42, userCol='userids_index',itemCol='shops_index', ratingCol='ratings',coldStartStrategy='drop')
    #training the dataset containing the new user's ratings...
    als_model_rec = als.fit(transformed_try)
    #making recommendations for model-based collaborative filtering alone first, passing in all 981 outlets so as to ensure as much overlap between collaborative filtering and content-based filtering in the outlets that they generate rating predictions for
    recs=als_model_rec.recommendForAllUsers(981).toPandas()
    nrecs=recs.recommendations.apply(pd.Series) \
                .merge(recs, right_index = True, left_index = True) \
                .drop(["recommendations"], axis = 1) \
                .melt(id_vars = ['userids_index'], value_name = "recommendation") \
                .drop("variable", axis = 1) \
                .dropna()
    nrecs=nrecs.sort_values('userids_index')
    nrecs=pd.concat([nrecs['recommendation'].apply(pd.Series), nrecs['userids_index']], axis = 1)
    nrecs.columns = ['Shop_index', 'Rating', 'UserID_index']
    md=transformed_try.select(transformed_try['userids'],transformed_try['userids_index'],transformed_try['shops'],transformed_try['shops_index'])
    md=md.toPandas()
    dict1=dict(zip(md['userids_index'],md['userids']))
    dict2=dict(zip(md['shops_index'],md['shops']))
    nrecs['UserID']=nrecs['UserID_index'].map(dict1)
    nrecs['shops']=nrecs['Shop_index'].map(dict2)
    nrecs=nrecs.sort_values('UserID')
    nrecs.reset_index(drop=True, inplace=True)
    #copy to avoid pandas' SettingWithCopyWarning when adding the recommendations column below
    new=nrecs[['UserID','shops','Rating']].copy()
    new['recommendations'] = list(zip(new.shops, new.Rating))
    res=new[['UserID','recommendations']]
    res_new=res['recommendations'].groupby([res.UserID]).apply(list).reset_index()

    #creating a new df for userid2552's collaborative filtering-derived recommendations
    collab_rec_2552 = pd.DataFrame(dict(res_new[res_new["UserID"]==2552]['recommendations'].tolist()[0]),index=[0]).T.sort_values(0,ascending=False)

    #creating a list of outlets userid2552 has rated earlier on
    rated_2552 = mbcf_try_pd[mbcf_try_pd['userids']==2552]['shops'].tolist()

    #filtering out those 10 outlets userid2552 has rated initially from the collaborative filtering recommendation list...
    collab_rankedrecs_2552 = collab_rec_2552.loc[[shop for shop in collab_rec_2552.index if shop not in rated_2552],0]

    #organizing the above series column into a df of recommendations and collaborative filtering rating predictions
    collab_2552_df = pd.DataFrame({'recommendations':collab_rankedrecs_2552.index,'collab_filter_predicted_ratings':collab_rankedrecs_2552})

    #reading in the previously prepped df meant for content-based filtering here for content-based filtering recommendations..
    content_f = pd.read_csv('content_based_df_nouser.csv')

    #merging userid2552's info with the df meant for content-based filtering so that content-based filtering can make recommendations via rating predictions for userid 2552 later on...
    content_2552 = pd.merge(content_f,user_item_2552,how='left',on='shops')

    #getting dummies for categorical columns...
    content_2552_wdummies = pd.get_dummies(content_2552, columns=['shops','category_alias'], drop_first=False)

    #setting feature and target
    X = content_2552_wdummies.drop(['ratings'], axis=1)
    y = content_2552_wdummies['ratings']

    #collating dummified columns
    shops_cats_list = [col for col in content_2552_wdummies.columns if (col.startswith('shops')) or (col.startswith('category'))]

    #extending with review_count and rating
    shops_cats_list.extend(['review_count','rating','userids'])

    #as tfidf can only work on one column of texts at a time, am separating features as below...
    X1 = X['reviews']
    X2 = X[shops_cats_list]

    #Assigning a new variable name to X1 for processing.
    rev = X1

    #creating customized stop words' list
    cust_stop_words = [word for word in stop_words.ENGLISH_STOP_WORDS]

    #adding on to the above list based on preliminary word cloud EDA
    cust_stop_words.extend(["wa","ha","just","ve","did","got","quite"])

    #preprocessing text in reviews by defining a function to do so
    lemm = WordNetLemmatizer()

    def text_processer(raw_text):
        # Function to convert a raw string of text to a string of words
        # The input is a single string (a raw unprocessed text), and
        # the output is a single string (a preprocessed text)

        # 1. Remove http urls.
        review_text = re.sub(r"\(http.+\)", " ", raw_text)

        # 2. Remove non-letters.
        letters_only = re.sub("[^a-zA-Z]", " ", review_text)

        # 3. Convert to lower case, split into individual words.
        words = letters_only.lower().split()

        # 4. Lemmatize words.
        lemmed_words = [lemm.lemmatize(i) for i in words]

        # 5. Remove stop words.

        meaningful_words = [w for w in lemmed_words if w not in cust_stop_words]

        # 6. Join the words back into one string separated by space,
        # and return the result.
        return " ".join(meaningful_words)

    #showing how the processed reviews look like
    rev_processed = pd.Series([text_processer(text) for text in rev])

    #using tfidf vectorizer to convert the reviews into term frequency columns...
    tvec_naive = TfidfVectorizer(stop_words = cust_stop_words)  #instantiating TfidfVectorizer with customized stop words

    X1_tvec_naive = tvec_naive.fit_transform(rev_processed).todense()   #fitting tvec and transforming the processed reviews
    X1_tvec_naive_df = pd.DataFrame(X1_tvec_naive, columns = tvec_naive.get_feature_names())  #converting it into a dataframe for easy lookup.

    #combining tvec-df with the rest of the features for rating prediction for userid 2552 later on...
    X_legit = pd.concat([X1_tvec_naive_df,X2], axis=1)

    #adding back the ratings column so that the per-user train and test sets below can select it and then drop it
    X_legit['ratings'] = y

    #creating X_train manually for userid 2552
    X_train_2552 = X_legit[X_legit['userids']==2552].drop(['ratings','userids'],axis=1)

    #creating y_train manually for userid 2552
    y_train_2552 = X_legit[X_legit['userids']==2552]['ratings']

    #creating X_test manually for userid 2552 which contains all outlets that have not been rated by userid 2552
    X_test_2552 = X_legit[X_legit['userids']!=2552].drop(['ratings','userids'],axis=1)

    #instantiate scaler since not all of the features are of the same scale, eg. review_count and rating
    ss= StandardScaler()

    #fitting the train and transforming both the train and test sets
    X_train_2552_sc = ss.fit_transform(X_train_2552)
    X_test_2552_sc = ss.transform(X_test_2552)

    #learning rate, max depth, and n_estimators were retrieved from a tuned xgb model (notebook on future plan for xgb) saved in the folder but in order to use random_state which was not used during tuning, I am just instantiating a new xgb instance with the 3 tuned hyperparams set accordingly...
    xgb = XGBClassifier(learning_rate=0.5, max_depth=9, n_estimators=200, random_state=42)

    #training the loaded model on the dataset containing the new user, userid 2552's ratings.
    xgb.fit(X_train_2552_sc, y_train_2552)

    #stacking X_test_2552 as first step in regenerating the shops column for predictions
    trial = X_test_2552.stack()

    #creating loop to re-generate original X_test_2552 order of shops
    index_lst = []
    outlets_lst = []
    for n in range(len(trial.index)):
        #positional access (iloc) since the stacked series carries a MultiIndex
        if trial.index[n][1].startswith('shops_') and trial.iloc[n]!=0:
            index_lst.append(str(trial.index[n][0]))
            outlets_lst.append(trial.index[n][1])
    index_lst = [int(x) for x in index_lst]
    reconstructed_X_test_2552 = pd.DataFrame({'shops':outlets_lst}, index=index_lst)

    #generating content-based filtering rating predictions for userid 2552
    rating_predictions = xgb.predict(X_test_2552_sc)

    #adding new column of rating predictions into the reconstructed X_test_2552
    reconstructed_X_test_2552['predicted_ratings']=rating_predictions

    #giving the reconstructed df a more easily understood name for distinction from the collaborative filtering df dealt with above
    content_2552_df = reconstructed_X_test_2552

    #trimming off the shops' prefixes so that they can eventually be merged with the collaborative filtering df
    content_2552_df['shops'] = content_2552_df['shops'].apply(lambda x: x[6:])

    #renaming the column of rating predictions to distinguish from collaborative filtering's prediction column later on when both dfs are merged.
    content_2552_df.rename(columns={'predicted_ratings':'content_filter_predicted_ratings'},inplace=True)

    #renaming collaborative filtering df's recommendations' column so that it can be merged with the content-based filtering df.
    collab_2552_df.rename(columns={'recommendations':'shops'},inplace=True)

    #reseting the index in the collaborative filtering df so that the index is numerical again
    collab_2552_df.reset_index(drop=True,inplace=True)

    #merging both content-based filtering and collaborating filtering df to prepare to make hybrid recommendations for userid 2552
    content_collab_2552_df = pd.merge(content_2552_df,collab_2552_df,how='inner',on='shops')

    #as mentioned in the previous sub-notebook on this hybrid recommender's evaluation, the following are the content-based and collaborative filtering's ratings' weights
    con_wt = 0.97 / (0.97 + 1.0)
    collab_wt = 1.0 / (0.97 + 1.0)

    #feature engineering to add hybrid recommender's rating predictions into the combined df by multiplying the respective rating predictions by weights based on both models' f1 scores derived from prior evaluation and summing them up to yield hybrid predictions
    content_collab_2552_df['final_weighted_rating_predictions'] = (content_collab_2552_df['content_filter_predicted_ratings']*con_wt) + (content_collab_2552_df['collab_filter_predicted_ratings']*collab_wt)

    #top 5 coffee-drinking outlet recommendations for userid 2552 (me!) based on my ratings given rather randomly to 10 of the outlets earlier on...
    #recommendations_top_5 = content_collab_2552_df.sort_values('final_weighted_rating_predictions',ascending=False).head()
    top_5_recs = content_collab_2552_df[['shops','final_weighted_rating_predictions']].sort_values('final_weighted_rating_predictions',ascending=False).head()
    top_5_recs.reset_index(drop=True,inplace=True)
    first = top_5_recs.loc[0,'shops']
    second = top_5_recs.loc[1,'shops']
    third = top_5_recs.loc[2,'shops']
    fourth = top_5_recs.loc[3,'shops']
    fifth = top_5_recs.loc[4,'shops']

    return render_template('outcome.html', first=first, second=second, third=third,
                           fourth=fourth, fifth=fifth, shop1=shop1, rate1=rate1,
                           shop2=shop2, rate2=rate2, shop3=shop3, rate3=rate3,
                           shop4=shop4, rate4=rate4, shop5=shop5, rate5=rate5,
                           shop6=shop6, rate6=rate6, shop7=shop7, rate7=rate7,
                           shop8=shop8, rate8=rate8, shop9=shop9, rate9=rate9,
                           shop10=shop10, rate10=rate10, url_alias=url_alias)
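A hedged wiring sketch for the view above (the route path, app name, and run settings are assumptions; the original @app.route decorator is not shown in the snippet):

from flask import Flask

app = Flask(__name__)

# Hypothetical route registration for the view function defined above.
app.add_url_rule('/recommend', view_func=getvalues_and_recommend, methods=['POST'])

if __name__ == '__main__':
    app.run(debug=True)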