def get_model_probabilities_for_input_texts(text_array):
    """
    Score a batch of questions with the module-level model.

    The texts are vectorized with VECTORIZER, the columns listed in
    FEATURE_ARR are produced by add_v1_features, and both feature sets are
    joined column-wise before being handed to MODEL.predict_proba.

    The result represents the likelihood of each question receiving a high
    score, in the format:
    [ [prob_low_score1, prob_high_score_1], ... ]

    :param text_array: array of questions to be scored
    :return: array of predicted probabilities
    """
    # FEATURE_ARR, VECTORIZER and MODEL are module-level names that are only
    # read here.
    vectorized_texts = VECTORIZER.transform(text_array)

    # add_v1_features works on a frame holding the raw text in "full_text".
    question_frame = add_v1_features(
        pd.DataFrame(text_array, columns=["full_text"])
    )

    text_block = vstack(vectorized_texts)
    numeric_block = question_frame[FEATURE_ARR].astype(float)

    # Text vectors first, numeric features after.
    combined = hstack([text_block, numeric_block])
    return MODEL.predict_proba(combined)
def get_model_probabilities_for_input_texts(text_array):
    """
    Score a batch of questions with the module-level model.

    The texts are vectorized with VECTORIZER, the columns listed in
    FEATURE_ARR are produced by add_v1_features, and both feature sets are
    joined column-wise before being handed to MODEL.predict_proba.

    The result represents the likelihood of each question receiving a high
    score, in the format:
    [ [prob_low_score1, prob_high_score_1], ... ]

    :param text_array: array of questions to be scored
    :return: array of predicted probabilities
    """
    # FEATURE_ARR, VECTORIZER and MODEL are module-level names that are only
    # read here.
    vectorized_texts = VECTORIZER.transform(text_array)

    # add_v1_features works on a frame holding the raw text in "full_text".
    question_frame = add_v1_features(
        pd.DataFrame(text_array, columns=["full_text"])
    )

    text_block = vstack(vectorized_texts)
    numeric_block = question_frame[FEATURE_ARR].astype(float)

    # Text vectors first, numeric features after.
    combined = hstack([text_block, numeric_block])
    return MODEL.predict_proba(combined)
def generate_model_text_features(raw_df_path, save_path=None):
    """
    Build the feature DataFrame for model 2 and optionally save it to disk.

    These features take multiple minutes to compute.

    :param raw_df_path: path to raw DataFrame (generated from parse_xml_to_csv)
    :param save_path: path to save processed DataFrame to; nothing is written
        when this is None or otherwise falsy
    :return: processed DataFrame
    """
    formatted = format_raw_df(pd.read_csv(raw_df_path).copy())

    # Keep only the rows flagged as questions.
    questions = formatted.loc[formatted["is_question"]].copy()

    # Title and body joined by a single space; missing values become "".
    title_and_body = questions["Title"].str.cat(
        questions["body_text"], sep=" ", na_rep=""
    )
    questions["full_text"] = title_and_body

    with_v1 = add_v1_features(questions.copy())
    featured = add_v2_text_features(with_v1.copy())

    if not save_path:
        return featured

    featured.to_csv(save_path)
    return featured
def generate_model_text_features(raw_df_path, save_path=None):
    """
    Build the feature DataFrame for model 2 and optionally save it to disk.

    These features take multiple minutes to compute.

    :param raw_df_path: path to raw DataFrame (generated from parse_xml_to_csv)
    :param save_path: path to save processed DataFrame to; nothing is written
        when this is None or otherwise falsy
    :return: processed DataFrame
    """
    formatted = format_raw_df(pd.read_csv(raw_df_path).copy())

    # Keep only the rows flagged as questions.
    questions = formatted.loc[formatted["is_question"]].copy()

    # Title and body joined by a single space; missing values become "".
    title_and_body = questions["Title"].str.cat(
        questions["body_text"], sep=" ", na_rep=""
    )
    questions["full_text"] = title_and_body

    with_v1 = add_v1_features(questions.copy())
    featured = add_v2_text_features(with_v1.copy())

    if not save_path:
        return featured

    featured.to_csv(save_path)
    return featured