def func_pairs(features, group, num, rand_num, do_random, i_classes,
               model_num):
    # apply clean_df function to features
    m1 = group.copy()
    m1 = ci.clean_df(m1, features, primary, replace_space)

    if use_model:  # how to make this work for second round?
        cosine_sim = ms.construct_similarity(m1, model_num, combine)
    else:
        # BEGINNING ------------------------------------------------------------
        m1 = m1.assign(score=[''] * len(m1))
        for feature in features:
            if feature in weights:
                for i in range(weights[feature]):
                    m1['score'] = m1['score'] + " " + m1[feature]
            else:
                m1['score'] = m1['score'] + " " + m1[feature]

        # Construct the required TF-IDF matrix by fitting and transforming the data
        tfidf_matrix = tfidf.fit_transform(m1['score'])

        # Compute the cosine similarity matrix
        cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
        # END -----------------------------------------------------------------

    # Construct a reverse map of indices and employee names
    indices = pd.Series(group.index, index=group['index']).drop_duplicates()

    return get_pairs(group['index'].sample(frac=1), indices, cosine_sim, group,
                     num, rand_num, do_random)
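
# A minimal sketch of the module-level setup the snippet above assumes.
# Hedged: names like `tfidf`, `weights`, and `primary` exist in the original
# project, but the concrete values here are illustrative guesses; only the
# pandas/sklearn imports are certain from the calls above. The `ci`, `ms`,
# and `get_pairs` helpers are project-internal and not sketched here.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

tfidf = TfidfVectorizer(stop_words='english')  # shared vectorizer used by func_pairs
weights = {'Interests': 2}  # feature -> repeat count in the "score" soup (assumed)
primary = 'Name'            # primary-key column handed to ci.clean_df (assumed)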

# Example #2
def func_pairs(features, group, num, rand_num, do_random):
    # apply clean_df function to features
    m1 = group.copy()
    m1 = ci.clean_df(m1, features, replace_space, replace_key)
    if m1.empty:
        return []
    # BEGINNING ------------------------------------------------------------
    m1 = m1.assign(score=[''] * len(m1))
    for feature in features:
        if feature in weights:
            for i in range(weights[feature]):
                m1['score'] = m1['score'] + " " + m1[feature]
        else:
            m1['score'] = m1['score'] + " " + m1[feature]
        #to_add = m1[[feature]*weights[feature]].apply(lambda x: ' '.join(x), axis=1)
        #m1['score'] = m1['score'].str.cat(to_add, sep=" ", na_rep = "")
        
    print(m1)  # debug: inspect the assembled "score" soup before vectorizing
    # Construct the required TF-IDF matrix by fitting and transforming the data
    tfidf_matrix = tfidf.fit_transform(m1['score'])

    # Compute the cosine similarity matrix
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    # END -----------------------------------------------------------------
    
    # Construct a reverse map of indices and employee names
    indices = pd.Series(group.index, index=group['index']).drop_duplicates()

    return get_pairs(group['index'].sample(frac=1), indices, cosine_sim, group,
                     num, rand_num, do_random)
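
# Self-contained demo of the weighting trick used above: repeating a feature's
# text `weights[feature]` times boosts its TF-IDF contribution to the cosine
# similarity. All data and the weight value below are illustrative.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

demo = pd.DataFrame({'Team': ['data', 'data', 'sales'],
                     'Interests': ['chess hiking', 'chess', 'hiking']})
demo_weights = {'Interests': 2}  # Interests counts double (assumed weighting)
soup = pd.Series([''] * len(demo))
for feature in demo.columns:
    for _ in range(demo_weights.get(feature, 1)):
        soup = soup + " " + demo[feature]
matrix = TfidfVectorizer().fit_transform(soup)
# rows 0 and 1 score highest (~0.75): shared team plus double-weighted chess
print(linear_kernel(matrix, matrix))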

# Example #3
def create_model(model_num):
    metadata = pd.read_csv(csv)
    m0 = metadata[features].reset_index()
    m0 = ci.clean_df(m0, features, primary, classes)

    '''
    # Load training data
    training = pd.read_csv(training_csv)
    metadata = training[['group', 'target']]
    '''
    # will later change to data in training_csv
    if model_num == 1:
        data = [["Maya; Maia", 2], ["Maya; Stanley", 3], ["Evan; Jen", 5],["Jordyn; Tom", 7]]
        allfeatures = training_features
    elif model_num == 2:
        data = [["Maya, Maia; Evan, Jen", 2],["Jordyn, Tom; Rebecca, Frank", 7]]
        allfeatures = second_features
    metadata = pd.DataFrame(data, columns=['group','target'])

    # modify df to get X
    df = metadata.drop(output, axis=1) # get just features
    y = metadata[output] # get target values

    # df is features to be predicted
    # m0 is information about people
    df_soup = load_prediction(df, m0, model_num, combine)

    # don't need feature scaling, all similarity numbers are between 0 and 1
    X = df_soup[[x for x in allfeatures if x != primary]]

    # later on don't split the dataset, just use X and y for fit
    # 50% of the data goes into the test set, 50% into the training set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    # use regression instead of classifier?
    # change metrics to increase accuracy, need more data for supervised learning
    cl = RandomForestClassifier(n_estimators=900, n_jobs=-1) # select model to create
    cl.fit(X_train, y_train)
    # rfaccur = cl.score(X_test, y_test)
    # print(rfaccur)

    '''
    # add when have real data to create model from
    feature_imp = pd.Series(cl.feature_importances_, index=X.columns).sort_values(ascending=False)
    print(feature_imp) # feature importance
    '''
    filename = model_choice[model_num]
    # save the model (cl) to disk
    with open(filename, 'wb') as f:
        pickle.dump(cl, f)
    return None
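
# Hedged usage sketch: reloading the pickled model for inference. Only the
# pickle usage is certain; `model_choice` maps model_num to a filename as in
# create_model above, and `X_new` (hypothetical) must carry the same
# similarity columns that X had during fit.
import pickle

def load_model_sketch(model_num):
    with open(model_choice[model_num], 'rb') as f:
        return pickle.load(f)

# cl = load_model_sketch(1)
# scores = cl.predict(X_new)  # predicted targets; consumed sorted best-first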

# Example #4
def speed_up_pairings(features, m0, num, rand_num, do_random, i_classes,
                      model_num, combine):

    m1 = m0.copy()
    m1 = ci.clean_df(m1, features, primary, i_classes)

    if use_index:
        names = list(m1['index'])
    else:
        names = list(m1['Name'])

    # placeholder separator row, used only by the commented-out DataFrame output below
    data = [['-'] * (len(features) + 1)]
    extra = pd.DataFrame(data, columns=features + ['index'])

    # pairs = pd.DataFrame(columns=features + ['index'])
    pairs = []
    already_paired = []
    not_paired = list(names)  # copy: the loop below iterates names while removing from not_paired
    for n in names:
        if n not in already_paired:
            pair = []
            not_paired.remove(n)
            already_paired.append(n)

            if use_index:
                # pairs = pairs.append(m0[m0['index'] == n].iloc[0])
                pair.append(n)
            else:
                n_index = m1['index'][m1['Name'] == n].iloc[0]
                # pairs = pairs.append(m0[m0['index'] == n_index].iloc[0])
                pair.append(n_index)

            data = [[str(n) + "; " + str(x)] if use_index else [n + "; " + x]
                    for x in not_paired]
            df = pd.DataFrame(data, columns=['group'])

            df_soup = ms.load_prediction(df, m1, model_num, combine)
            if model_num == 1:
                t_features = training_features
            elif model_num == 2:
                t_features = second_features
            X_test = df_soup[[x for x in t_features if x != primary]]
            predictions = ms.make_prediction(df_soup, X_test,
                                             model_num)  # already sorted

            group_sims = []
            if do_random:
                group_num = rand_num
            else:
                group_num = num + 1
            for p in predictions.iterrows():
                if len(group_sims) == group_num:
                    break
                _, x = (p[1]['Name']).split("; ")  # second half is the candidate partner
                group_sims.append(x)

            if len(group_sims) > num:
                if do_random:
                    result = random.sample(group_sims, num - 1)
                else:
                    result = group_sims[:num - 1]
            else:
                result = group_sims

            if use_index:
                matches = [int(x) for x in result]
                not_paired = [x for x in not_paired if x not in matches]
                already_paired += matches
            else:
                already_paired += result
                not_paired = [x for x in not_paired if x not in result]
                matches = [
                    m1['index'][m1['Name'] == m].iloc[0] for m in result
                ]

            for m in matches:
                # pairs = pairs.append(m0[m0['index'] == m].iloc[0])
                pair.append(m)

            # pairs = pd.concat([pairs, extra], sort=False)
            pairs.append(pair)

    return pairs
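
# Hypothetical minimal sketch of the greedy strategy implemented above (and,
# plausibly, of the undocumented get_pairs helper used by func_pairs): walk
# names in random order and take the `num - 1` most similar still-unpaired
# partners from a precomputed cosine-similarity matrix. Purely illustrative;
# the real helpers live elsewhere in the project.
import numpy as np

def greedy_groups_sketch(cosine_sim, names, num):
    unpaired = set(range(len(names)))
    groups = []
    for i in np.random.permutation(len(names)):
        if i not in unpaired:
            continue  # already claimed as someone's partner
        unpaired.discard(i)
        # rank the remaining candidates by similarity to person i, best first
        ranked = sorted(unpaired, key=lambda j: cosine_sim[i][j], reverse=True)
        partners = ranked[:num - 1]
        unpaired.difference_update(partners)
        groups.append([names[i]] + [names[j] for j in partners])
    return groups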