# Imports assumed by this section; `ci` and `ms` are project modules
# (cleaning and model scoring). Module-level names used below (tfidf,
# weights, primary, use_model, use_index, combine, csv, output,
# training_features, second_features, model_choice, replace_space,
# replace_key, classes) are defined elsewhere in the file.
import pickle
import random

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split


def func_pairs(features, group, num, rand_num, do_random, i_classes, model_num):
    # apply clean_df function to features
    m1 = group.copy()
    m1 = ci.clean_df(m1, features, primary, replace_space)
    if use_model:
        # how to make this work for second round?
        cosine_sim = ms.construct_similarity(m1, model_num, combine)
    else:
        # BEGINNING ------------------------------------------------------------
        # build a text "soup" per row, repeating weighted features
        m1 = m1.assign(score=[''] * len(m1))
        for feature in features:
            if feature in weights:
                for i in range(weights[feature]):
                    m1['score'] = m1['score'] + " " + m1[feature]
            else:
                m1['score'] = m1['score'] + " " + m1[feature]
        # Construct the required TF-IDF matrix by fitting and transforming the data
        tfidf_matrix = tfidf.fit_transform(m1['score'])
        # Compute the cosine similarity matrix
        cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
        # END -----------------------------------------------------------------
    # Construct a reverse map of indices and employee names
    indices = pd.Series(group.index, index=group['index']).drop_duplicates()
    return get_pairs(group['index'].sample(frac=1), indices, cosine_sim, group,
                     num, rand_num, do_random)
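
# The sketch below is a hypothetical, self-contained demo (not part of the
# pipeline) of the weighted-soup technique func_pairs uses: the frame, the
# feature names, and demo_weights are invented for illustration, while
# TfidfVectorizer and linear_kernel are the real sklearn APIs.
def _demo_weighted_soup():
    from sklearn.feature_extraction.text import TfidfVectorizer
    demo = pd.DataFrame({'team': ['eng', 'eng', 'sales'],
                         'office': ['nyc', 'sf', 'nyc']})
    demo_weights = {'team': 2}  # 'team' tokens count double in the soup
    demo['score'] = ''
    for feature in ['team', 'office']:
        for _ in range(demo_weights.get(feature, 1)):
            demo['score'] = demo['score'] + ' ' + demo[feature]
    matrix = TfidfVectorizer().fit_transform(demo['score'])
    sim = linear_kernel(matrix, matrix)
    # sim[0][1] > sim[0][2]: rows 0 and 1 share the double-weighted 'team'
    # token, while rows 0 and 2 share only 'office'
    return sim
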
def func_pairs(features, group, num, rand_num, do_random):
    # apply clean_df function to features
    m1 = group.copy()
    m1 = ci.clean_df(m1, features, replace_space, replace_key)
    if m1.empty:
        return []
    # BEGINNING ------------------------------------------------------------
    m1 = m1.assign(score=[''] * len(m1))
    for feature in features:
        if feature in weights:
            for i in range(weights[feature]):
                m1['score'] = m1['score'] + " " + m1[feature]
            # vectorized alternative to the inner loop:
            # to_add = m1[[feature] * weights[feature]].apply(lambda x: ' '.join(x), axis=1)
            # m1['score'] = m1['score'].str.cat(to_add, sep=" ", na_rep="")
        else:
            m1['score'] = m1['score'] + " " + m1[feature]
    print(m1)  # debug: inspect the assembled soup
    # Construct the required TF-IDF matrix by fitting and transforming the data
    tfidf_matrix = tfidf.fit_transform(m1['score'])
    # Compute the cosine similarity matrix
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    # END -----------------------------------------------------------------
    # Construct a reverse map of indices and employee names
    indices = pd.Series(group.index, index=group['index']).drop_duplicates()
    return get_pairs(group['index'].sample(frac=1), indices, cosine_sim, group,
                     num, rand_num, do_random)
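
# Hypothetical sketch of the vectorized alternative hinted at by the
# commented-out str.cat lines above: repeat a weighted column via a
# list-multiplied selection instead of a nested Python loop. It assumes the
# same module-level `weights` dict and should match the loop version up to
# leading whitespace.
def _soup_vectorized(m1, features):
    score = pd.Series([''] * len(m1), index=m1.index)
    for feature in features:
        repeats = weights.get(feature, 1)
        to_add = m1[[feature] * repeats].apply(' '.join, axis=1)
        score = score.str.cat(to_add, sep=' ', na_rep='')
    return score
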
def create_model(model_num):
    metadata = pd.read_csv(csv)
    m0 = metadata[features].reset_index()
    m0 = ci.clean_df(m0, features, primary, classes)
    '''
    # Load training data
    training = pd.read_csv(training_csv)
    metadata = training[['group', 'target']]
    '''
    # placeholder data; will later change to data in training_csv
    if model_num == 1:
        data = [["Maya; Maia", 2], ["Maya; Stanley", 3],
                ["Evan; Jen", 5], ["Jordyn; Tom", 7]]
        allfeatures = training_features
    elif model_num == 2:
        data = [["Maya, Maia; Evan, Jen", 2], ["Jordyn, Tom; Rebecca, Frank", 7]]
        allfeatures = second_features
    metadata = pd.DataFrame(data, columns=['group', 'target'])
    # modify df to get X
    df = metadata.drop(output, axis=1)  # get just features
    y = metadata[output]  # get target values
    # df is features to be predicted
    # m0 is information about people
    df_soup = load_prediction(df, m0, model_num, combine)
    # don't need feature scaling, all similarity numbers are between 0 and 1
    X = df_soup[[x for x in allfeatures if x != primary]]
    # later on don't split the dataset, just use X and y for fit
    # half of the data goes into the test set, half into the training set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    # use regression instead of classifier?
    # change metrics to increase accuracy, need more data for supervised learning
    cl = RandomForestClassifier(n_estimators=900, n_jobs=-1)  # select model to create
    cl.fit(X_train, y_train)
    # rfaccur = cl.score(X_test, y_test)
    # print(rfaccur)
    '''
    # add when have real data to create model from
    feature_imp = pd.Series(cl.feature_importances_, index=X.columns).sort_values(ascending=False)
    print(feature_imp)  # feature importance
    '''
    filename = model_choice[model_num]
    # save the model (cl) to disk
    with open(filename, 'wb') as f:
        pickle.dump(cl, f)
    return None
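
# Hypothetical counterpart to create_model, showing how the pickled
# classifier would be read back for prediction (presumably what
# ms.make_prediction does internally); X_new stands in for a frame of
# similarity features produced by load_prediction.
def _load_model(model_num):
    with open(model_choice[model_num], 'rb') as f:
        cl = pickle.load(f)
    return cl  # cl.predict(X_new) then scores candidate groupings
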
def speed_up_pairings(features, m0, num, rand_num, do_random, i_classes,
                      model_num, combine):
    m1 = m0.copy()
    m1 = ci.clean_df(m1, features, primary, i_classes)
    if use_index:
        names = list(m1['index'])
    else:
        names = list(m1['Name'])
    # separator row for the commented-out DataFrame version below
    data = [['-'] * (len(features) + 1)]
    extra = pd.DataFrame(data, columns=features + ['index'])
    # pairs = pd.DataFrame(columns=features + ['index'])
    pairs = []
    already_paired = []
    not_paired = list(names)  # copy, so removals don't mutate the list we iterate
    for n in names:
        if n not in already_paired:
            pair = []
            not_paired.remove(n)
            already_paired.append(n)
            if use_index:
                # pairs = pairs.append(m0[m0['index'] == n].iloc[0])
                pair.append(n)
            else:
                n_index = m1['index'][m1['Name'] == n].iloc[0]
                # pairs = pairs.append(m0[m0['index'] == n_index].iloc[0])
                pair.append(n_index)
            # score every possible pairing of n with the remaining people
            data = [[str(n) + "; " + str(x)] if use_index else [n + "; " + x]
                    for x in not_paired]
            df = pd.DataFrame(data, columns=['group'])
            df_soup = ms.load_prediction(df, m1, model_num, combine)
            if model_num == 1:
                t_features = training_features
            elif model_num == 2:
                t_features = second_features
            X_test = df_soup[[x for x in t_features if x != primary]]
            predictions = ms.make_prediction(df_soup, X_test, model_num)  # already sorted
            group_sims = []
            if do_random:
                group_num = rand_num
            else:
                group_num = num + 1
            for _, row in predictions.iterrows():
                if len(group_sims) == group_num:
                    break
                _, x = row['Name'].split("; ")  # drop n, keep the candidate
                group_sims.append(x)
            # the group includes n itself, so take num - 1 others
            if len(group_sims) > num:
                if do_random:
                    result = random.sample(group_sims, num - 1)
                else:
                    result = group_sims[:num - 1]
            else:
                result = group_sims
            if use_index:
                matches = [int(x) for x in result]
                not_paired = [x for x in not_paired if x not in matches]
                already_paired += matches
            else:
                already_paired += result
                not_paired = [x for x in not_paired if x not in result]
                matches = [m1['index'][m1['Name'] == m].iloc[0] for m in result]
            for m in matches:
                # pairs = pairs.append(m0[m0['index'] == m].iloc[0])
                pair.append(m)
            # pairs = pd.concat([pairs, extra], sort=False)
            pairs.append(pair)
    return pairs
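
# Self-contained sketch of the greedy loop speed_up_pairings implements:
# walk the (shuffled) names and, for each still-unpaired person, take the
# num - 1 most similar remaining people. The similarity(a, b) callable is an
# assumption standing in for the model-backed scores from ms.make_prediction;
# it is not project API.
def _greedy_grouping(names, similarity, num):
    groups, remaining = [], list(names)
    while remaining:
        person = remaining.pop(0)
        ranked = sorted(remaining, key=lambda x: similarity(person, x),
                        reverse=True)
        matches = ranked[:num - 1]
        remaining = [x for x in remaining if x not in matches]
        groups.append([person] + matches)
    return groups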