Example #1
0
def get_model_probabilities_for_input_texts(text_array):
    """
    Score a batch of question texts with the module-level MODEL.

    The texts are vectorized with VECTORIZER, the columns listed in
    FEATURE_ARR are taken from the output of add_v1_features, and both
    feature sets are stacked side by side before calling predict_proba.
    Expected output format: [ [prob_low_score1, prob_high_score_1], ... ]
    :param text_array: array of questions to be scored
    :return: array of predicted probabilities, as returned by MODEL.predict_proba
    """
    global FEATURE_ARR, VECTORIZER, MODEL
    # Vectorize first, then build the DataFrame, mirroring the original call order.
    text_vectors = VECTORIZER.transform(text_array)
    feature_df = add_v1_features(pd.DataFrame(text_array, columns=["full_text"]))
    stacked_vectors = vstack(text_vectors)
    numeric_columns = feature_df[FEATURE_ARR].astype(float)
    # Vectorized text features go on the left, hand-built numeric features on the right.
    return MODEL.predict_proba(hstack([stacked_vectors, numeric_columns]))
Example #2
0
def get_model_probabilities_for_input_texts(text_array):
    """
    Predict, for each input question, the probabilities of a low and a high score.

    Combines the VECTORIZER output for the texts with the FEATURE_ARR
    columns produced by add_v1_features, then asks MODEL for class
    probabilities.
    Expected output format: [ [prob_low_score1, prob_high_score_1], ... ]
    :param text_array: array of questions to be scored
    :return: array of predicted probabilities, as returned by MODEL.predict_proba
    """
    global FEATURE_ARR, VECTORIZER, MODEL
    vectorized = VECTORIZER.transform(text_array)
    questions_df = pd.DataFrame(text_array, columns=["full_text"])
    questions_df = add_v1_features(questions_df)
    text_block = vstack(vectorized)
    # Cast the selected feature columns to float before stacking them with the text vectors.
    numeric_block = questions_df[FEATURE_ARR].astype(float)
    combined = hstack([text_block, numeric_block])
    probabilities = MODEL.predict_proba(combined)
    return probabilities
Example #3
0
def generate_model_text_features(raw_df_path, save_path=None):
    """
    Build the feature DataFrame for model 2 and optionally save it to disk.

    Reads the raw CSV, formats it with format_raw_df, keeps only rows where
    "is_question" is truthy, joins "Title" and "body_text" into "full_text",
    then applies add_v1_features and add_v2_text_features.
    Per the original notes, these features take multiple minutes to compute.
    :param raw_df_path: path to raw DataFrame (generated from parse_xml_to_csv)
    :param save_path: path to save processed DataFrame to; nothing is
        written when this is falsy
    :return: processed DataFrame
    """
    frame = format_raw_df(pd.read_csv(raw_df_path).copy())

    # Keep only the question rows; copy so the column assignment below
    # works on an independent frame rather than a slice.
    frame = frame.loc[frame["is_question"]].copy()
    titles, bodies = frame["Title"], frame["body_text"]
    # Missing titles/bodies are treated as empty strings when concatenating.
    frame["full_text"] = titles.str.cat(bodies, sep=" ", na_rep="")

    frame = add_v1_features(frame.copy())
    frame = add_v2_text_features(frame.copy())

    if not save_path:
        return frame
    frame.to_csv(save_path)
    return frame
def generate_model_text_features(raw_df_path, save_path=None):
    """
    Generate the features for model 2 and, if requested, save them to disk.

    Per the original notes, computing these features takes a few minutes.
    :param raw_df_path: path to the raw DataFrame (generated from parse_xml_to_csv)
    :param save_path: path to save the processed DataFrame to; skipped
        when falsy
    :return: processed DataFrame
    """
    posts = pd.read_csv(raw_df_path)
    posts = format_raw_df(posts.copy())

    # Restrict to rows flagged as questions, working on an independent copy.
    question_mask = posts["is_question"]
    posts = posts.loc[question_mask].copy()

    # "full_text" is the title and body joined by a space; NaNs become "".
    posts["full_text"] = posts["Title"].str.cat(
        posts["body_text"], sep=" ", na_rep=""
    )

    posts = add_v2_text_features(add_v1_features(posts.copy()).copy())

    if save_path:
        posts.to_csv(save_path)
    return posts