def create_dataset_2_sample_size(input_path, output_path):
    essays = Essays(input_path)  
    
    LABEL = essays.apply_cell_function("label",identity)
    READ_1_SCORE = essays.apply_cell_function("read_1_score",identity)
    READ_2_SCORE = essays.apply_cell_function("read_2_score",identity)
    FINAL_SCORE = essays.apply_cell_function("final_score",identity)
    
    # prepares text
    print "Preparing text..."
    RAW_TEXTS = essays.apply_cell_function("data_answer",identity)
    RAW_TEXTS = func_over_dict(RAW_TEXTS, apply_map_func(string.upper), parallel=True)
    TEXTS = essays.apply_cell_function("data_answer",identity)
    
    print "Cleaning text..."    
    TEXTS = clean_text(TEXTS)
    
    print "Simplifying math expressions..."
    math_essays = [key for key in TEXTS.keys() if key.startswith("53299")]
    for key in math_essays:
        TEXTS[key] = TEXTS[key].map(simplify_math)
    
    print "Spellchecking..."
    TEXTS = func_over_dict(TEXTS, apply_map_func(spellcheck), parallel=False)    
    
    print "Reducing vocabulary..."
    TEXTS = func_over_dict(TEXTS, reduce_vocabulary_func, parallel=True)
    
    print "Stemming..."
    TEXTS = func_over_dict(TEXTS, apply_map_func(lambda x: " ".join([stemmer(w) for w in x.split()])), parallel=True)
    
    bow_args = {'min_df':2,'ngram_range':(1,1),'stop_words':'english','tokenizer':lambda x: x.split()}
    BOW_1_GRAM       = func_over_dict(TEXTS, lambda x: bag_of_words(x,**bow_args), parallel=True)

    META_LABEL       = func_over_dict(LABEL, lambda x: pd.DataFrame({'META_LABEL':x}), parallel=True)
    META_SCORE_1     = func_over_dict(READ_1_SCORE, lambda x: pd.DataFrame({'META_SCORE_1':x}), parallel=True)
    META_SCORE_2     = func_over_dict(READ_2_SCORE, lambda x: pd.DataFrame({'META_SCORE_2':x}), parallel=True)
    META_SCORE_FINAL = func_over_dict(FINAL_SCORE, lambda x: pd.DataFrame({'META_SCORE_FINAL':x}), parallel=True)
    TEXT_STATISTICS  = func_over_dict(RAW_TEXTS, text_statistics, parallel=True)
    QUOTATIONS_NUM   = func_over_dict(func_over_dict(RAW_TEXTS, apply_map_func(lambda x: len(re.findall('"(.*?)"',x)))), lambda x: pd.DataFrame({'QUOTATIONS_NUM':x}), parallel=True)
    YES_POSITION     = func_over_dict(func_over_dict(RAW_TEXTS, apply_map_func(lambda x: x.find("YES"))), lambda x: pd.DataFrame({'YES_POSITION':x}), parallel=True)
    NO_POSITION      = func_over_dict(func_over_dict(RAW_TEXTS, apply_map_func(lambda x: x.find("NO"))), lambda x: pd.DataFrame({'NO_POSITION':x}), parallel=True)
    
    dataset = merge_dataframes([
        META_LABEL
       ,META_SCORE_1
       ,META_SCORE_2
       ,META_SCORE_FINAL
       ,BOW_1_GRAM
       ,TEXT_STATISTICS
       ,QUOTATIONS_NUM
       ,YES_POSITION
       ,NO_POSITION
    ])
    
    dataset = [dataset, dataset]

    joblib.dump(dataset,output_path)
def create_dataset_2_SE():
    essays = Essays("data_work/items_data_se/*.csv")   
    
    LABEL = essays.apply_cell_function("label",identity)
    READ_1_SCORE = essays.apply_cell_function("read_1_score",identity)
    READ_2_SCORE = essays.apply_cell_function("read_2_score",identity)
    FINAL_SCORE = essays.apply_cell_function("final_score",identity)
    
    # prepares text
    print "Preparing text..."
    RAW_TEXTS = essays.apply_cell_function("data_answer",identity)
    RAW_TEXTS = func_over_dict(RAW_TEXTS, apply_map_func(string.upper))
    TEXTS = essays.apply_cell_function("data_answer",identity)
    
    print "Cleaning text..."    
    TEXTS = clean_text(TEXTS)
    
    print "Spellchecking..."
    TEXTS = func_over_dict(TEXTS, apply_map_func(lambda x: spellcheck(x,exclude=["EQUATIONINCORRECT","EQUATIONINCORRECT"])))
    #for key in ["5_53299","7_46793","3_51802","7_46597"]:
    #    TEXTS[key] = TEXTS[key].map(simplify_math)
    
    print "Reducing vocabulary..."
    TEXTS = reduce_vocabulary_dict(TEXTS)
    
    print "Stemming..."
    TEXTS = func_over_dict(TEXTS, apply_map_func(lambda x: " ".join([stemmer(w) for w in x.split()])))
    
    bow_args = {'min_df':2,'ngram_range':(1,1),'stop_words':'english','tokenizer':lambda x: x.split()}
    BOW_1_GRAM       = func_over_dict(TEXTS, lambda x: bag_of_words(x,**bow_args))

    #bow_args = {'min_df':5,'ngram_range':(2,2),'stop_words':'english','tokenizer':lambda x: x.split()}
    #BOW_2_GRAM       = func_over_dict(TEXTS, lambda x: bag_of_words(x,**bow_args))

    #bow_args = {'min_df':5,'ngram_range':(3,3),'stop_words':'english','tokenizer':lambda x: x.split()}
    #BOW_3_GRAM       = func_over_dict(TEXTS, lambda x: bag_of_words(x,**bow_args))

    META_LABEL       = func_over_dict(LABEL, lambda x: pd.DataFrame({'META_LABEL':x}))
    META_SCORE_1     = func_over_dict(READ_1_SCORE, lambda x: pd.DataFrame({'META_SCORE_1':x}))
    META_SCORE_2     = func_over_dict(READ_2_SCORE, lambda x: pd.DataFrame({'META_SCORE_2':x}))
    META_SCORE_FINAL = func_over_dict(FINAL_SCORE, lambda x: pd.DataFrame({'META_SCORE_FINAL':x}))
    TEXT_STATISTICS  = func_over_dict(RAW_TEXTS, text_statistics)
    QUOTATIONS_NUM   = func_over_dict(func_over_dict(RAW_TEXTS, apply_map_func(lambda x: len(re.findall('"(.*?)"',x)))), lambda x: pd.DataFrame({'QUOTATIONS_NUM':x}))
    YES_POSITION     = func_over_dict(func_over_dict(RAW_TEXTS, apply_map_func(lambda x: x.find("YES"))), lambda x: pd.DataFrame({'YES_POSITION':x}))
    NO_POSITION      = func_over_dict(func_over_dict(RAW_TEXTS, apply_map_func(lambda x: x.find("NO"))), lambda x: pd.DataFrame({'NO_POSITION':x}))
    
    dataset = merge_dataframes([
        META_LABEL
       ,META_SCORE_1
       ,META_SCORE_2
       ,META_SCORE_FINAL
       ,BOW_1_GRAM
       #,BOW_2_GRAM
       #,BOW_3_GRAM
       ,TEXT_STATISTICS
       ,QUOTATIONS_NUM
       ,YES_POSITION
       ,NO_POSITION
    ])
    
    dataset = [dataset, dataset]

    joblib.dump(dataset,"data_work/datasets/dataset_2_SE")
# Example #3
def create_dataset_2_gaming():
    essays = Essays("data_work/items_data_gaming/*.csv")

    LABEL = essays.apply_cell_function("label", identity)
    READ_1_SCORE = essays.apply_cell_function("read_1_score", identity)
    READ_2_SCORE = essays.apply_cell_function("read_2_score", identity)
    FINAL_SCORE = essays.apply_cell_function("final_score", identity)

    # prepares text
    print "Preparing text..."
    RAW_TEXTS = essays.apply_cell_function("data_answer", identity)
    RAW_TEXTS = func_over_dict(RAW_TEXTS, apply_map_func(string.upper))
    TEXTS = essays.apply_cell_function("data_answer", identity)

    print "Cleaning text..."
    TEXTS = clean_text(TEXTS)

    print "Spellchecking..."
    TEXTS = func_over_dict(
        TEXTS,
        apply_map_func(lambda x: spellcheck(
            x, exclude=["EQUATIONINCORRECT", "EQUATIONINCORRECT"])))
    #for key in ["5_53299","7_46793","3_51802","7_46597"]:
    #    TEXTS[key] = TEXTS[key].map(simplify_math)

    print "Reducing vocabulary..."
    TEXTS = reduce_vocabulary_dict(TEXTS)

    print "Stemming..."
    TEXTS = func_over_dict(
        TEXTS,
        apply_map_func(lambda x: " ".join([stemmer(w) for w in x.split()])))

    bow_args = {
        'min_df': 2,
        'ngram_range': (1, 1),
        'stop_words': 'english',
        'tokenizer': lambda x: x.split()
    }
    BOW_1_GRAM = func_over_dict(TEXTS, lambda x: bag_of_words(x, **bow_args))

    #bow_args = {'min_df':5,'ngram_range':(2,2),'stop_words':'english','tokenizer':lambda x: x.split()}
    #BOW_2_GRAM       = func_over_dict(TEXTS, lambda x: bag_of_words(x,**bow_args))

    #bow_args = {'min_df':5,'ngram_range':(3,3),'stop_words':'english','tokenizer':lambda x: x.split()}
    #BOW_3_GRAM       = func_over_dict(TEXTS, lambda x: bag_of_words(x,**bow_args))

    META_LABEL = func_over_dict(LABEL,
                                lambda x: pd.DataFrame({'META_LABEL': x}))
    META_SCORE_1 = func_over_dict(READ_1_SCORE,
                                  lambda x: pd.DataFrame({'META_SCORE_1': x}))
    META_SCORE_2 = func_over_dict(READ_2_SCORE,
                                  lambda x: pd.DataFrame({'META_SCORE_2': x}))
    META_SCORE_FINAL = func_over_dict(
        FINAL_SCORE, lambda x: pd.DataFrame({'META_SCORE_FINAL': x}))
    TEXT_STATISTICS = func_over_dict(RAW_TEXTS, text_statistics)
    QUOTATIONS_NUM = func_over_dict(
        func_over_dict(
            RAW_TEXTS,
            apply_map_func(lambda x: len(re.findall('"(.*?)"', x)))),
        lambda x: pd.DataFrame({'QUOTATIONS_NUM': x}))
    YES_POSITION = func_over_dict(
        func_over_dict(RAW_TEXTS, apply_map_func(lambda x: x.find("YES"))),
        lambda x: pd.DataFrame({'YES_POSITION': x}))
    NO_POSITION = func_over_dict(
        func_over_dict(RAW_TEXTS, apply_map_func(lambda x: x.find("NO"))),
        lambda x: pd.DataFrame({'NO_POSITION': x}))

    dataset = merge_dataframes([
        META_LABEL,
        META_SCORE_1,
        META_SCORE_2,
        META_SCORE_FINAL,
        BOW_1_GRAM
        #,BOW_2_GRAM
        #,BOW_3_GRAM
        ,
        TEXT_STATISTICS,
        QUOTATIONS_NUM,
        YES_POSITION,
        NO_POSITION
    ])

    dataset = [dataset, dataset]

    joblib.dump(dataset, "data_work/datasets/dataset_2_gaming")
# Example #4
def create_dataset_2_sample_size(input_path, output_path):
    """Build the "sample size" variant of dataset 2 and dump it with joblib.

    NOTE(review): this redefines create_dataset_2_sample_size from earlier
    in this file; being the later definition, it is the one in effect.

    Parameters
    ----------
    input_path : source pattern/path for the essay CSVs, passed to Essays.
    output_path : destination path handed to joblib.dump.
    """
    essays = Essays(input_path)

    # Meta columns copied straight from the source cells.
    LABEL = essays.apply_cell_function("label", identity)
    READ_1_SCORE = essays.apply_cell_function("read_1_score", identity)
    READ_2_SCORE = essays.apply_cell_function("read_2_score", identity)
    FINAL_SCORE = essays.apply_cell_function("final_score", identity)

    # prepares text
    # RAW_TEXTS keeps an upper-cased copy of the answers for the surface
    # features; TEXTS goes through the full cleaning pipeline instead.
    print "Preparing text..."
    RAW_TEXTS = essays.apply_cell_function("data_answer", identity)
    RAW_TEXTS = func_over_dict(RAW_TEXTS,
                               apply_map_func(string.upper),
                               parallel=True)
    TEXTS = essays.apply_cell_function("data_answer", identity)

    print "Cleaning text..."
    TEXTS = clean_text(TEXTS)

    # Only keys prefixed "53299" get the math simplification pass.
    print "Simplifying math expressions..."
    math_essays = [key for key in TEXTS.keys() if key.startswith("53299")]
    for key in math_essays:
        TEXTS[key] = TEXTS[key].map(simplify_math)

    # parallel=False here -- presumably the spellchecker cannot be run in
    # parallel safely; TODO(review) confirm.
    print "Spellchecking..."
    TEXTS = func_over_dict(TEXTS, apply_map_func(spellcheck), parallel=False)

    print "Reducing vocabulary..."
    TEXTS = func_over_dict(TEXTS, reduce_vocabulary_func, parallel=True)

    print "Stemming..."
    TEXTS = func_over_dict(
        TEXTS,
        apply_map_func(lambda x: " ".join([stemmer(w) for w in x.split()])),
        parallel=True)

    # Bag-of-words over the cleaned/stemmed text (unigrams only).
    bow_args = {
        'min_df': 2,
        'ngram_range': (1, 1),
        'stop_words': 'english',
        'tokenizer': lambda x: x.split()
    }
    BOW_1_GRAM = func_over_dict(TEXTS,
                                lambda x: bag_of_words(x, **bow_args),
                                parallel=True)

    # Wrap each meta series into a single-column DataFrame for merging.
    META_LABEL = func_over_dict(LABEL,
                                lambda x: pd.DataFrame({'META_LABEL': x}),
                                parallel=True)
    META_SCORE_1 = func_over_dict(READ_1_SCORE,
                                  lambda x: pd.DataFrame({'META_SCORE_1': x}),
                                  parallel=True)
    META_SCORE_2 = func_over_dict(READ_2_SCORE,
                                  lambda x: pd.DataFrame({'META_SCORE_2': x}),
                                  parallel=True)
    META_SCORE_FINAL = func_over_dict(
        FINAL_SCORE,
        lambda x: pd.DataFrame({'META_SCORE_FINAL': x}),
        parallel=True)
    TEXT_STATISTICS = func_over_dict(RAW_TEXTS, text_statistics, parallel=True)
    # Surface features computed on the raw upper-cased text: number of
    # quoted spans, and position of the literal "YES" / "NO" substrings
    # (str.find returns -1 when absent).
    QUOTATIONS_NUM = func_over_dict(
        func_over_dict(
            RAW_TEXTS,
            apply_map_func(lambda x: len(re.findall('"(.*?)"', x)))),
        lambda x: pd.DataFrame({'QUOTATIONS_NUM': x}),
        parallel=True)
    YES_POSITION = func_over_dict(func_over_dict(
        RAW_TEXTS, apply_map_func(lambda x: x.find("YES"))),
                                  lambda x: pd.DataFrame({'YES_POSITION': x}),
                                  parallel=True)
    NO_POSITION = func_over_dict(func_over_dict(
        RAW_TEXTS, apply_map_func(lambda x: x.find("NO"))),
                                 lambda x: pd.DataFrame({'NO_POSITION': x}),
                                 parallel=True)

    dataset = merge_dataframes([
        META_LABEL, META_SCORE_1, META_SCORE_2, META_SCORE_FINAL, BOW_1_GRAM,
        TEXT_STATISTICS, QUOTATIONS_NUM, YES_POSITION, NO_POSITION
    ])

    # Downstream consumers expect a two-element list (the same frame twice).
    dataset = [dataset, dataset]

    joblib.dump(dataset, output_path)
        ,FunctionalTextEssayFeature(feature_name="unique_words_norm_raw", fun=lambda essay: unique_words_norm(essay.texts["raw"])/100.0)
        ,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_10_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],10)/100.0)
        ,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_18_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],18)/100.0)
        ,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_25_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],25)/100.0)
        
        ,EssaySkipgram(name="LETTER",source="clean",base=lambda text: text, nskip=0, ngram=3)
        ,EssaySkipgram(name="WORD",source="clean",base=lambda text: text.split(), nskip=0, ngram=1)
    ]
}

pipeline_2 = {
    "name":"DATASET_2",
    "steps":[
         EssayTextConversion(source="raw",dest="clean",fun=safe_clean_text)
        ,EssayTextConversion(source="clean",dest="clean",fun=text_to_math)
        ,EssayTextConversion(source="clean",dest="stem",fun=lambda text: " ".join([stemmer(t) for t in text.split()]))
        ,EssayTextConversion(source="clean",dest="pos",fun=lambda text: " ".join([k[1] for k in TextBlob(text).tags]))
        
        ,EssayFeature(fun=lambda essay: get_math_expressions_features(essay.texts["clean"]))
        ,FunctionalTextEssayFeature(feature_name="n_words_raw", fun=lambda essay: n_words(essay.texts["raw"])/1000.0)
        ,FunctionalTextEssayFeature(feature_name="text_length_raw", fun=lambda essay: (text_length(essay.texts["raw"]))/1000.0)
        ,FunctionalTextEssayFeature(feature_name="text_length_2nd_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.50)/1000.0)
        ,FunctionalTextEssayFeature(feature_name="text_length_4th_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.25)/1000.0)
        ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_4_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],4)/100.0)
        ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_6_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],6)/100.0)
        ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_8_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],8)/100.0)
        ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_10_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],10)/100.0)
        ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_12_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],12)/100.0)
        ,FunctionalTextEssayFeature(feature_name="words_length_mean_raw", fun=lambda essay: words_length_mean(essay.texts["raw"])/100.0)
        ,FunctionalTextEssayFeature(feature_name="words_length_variance_raw", fun=lambda essay: words_length_variance(essay.texts["raw"])/100.0)
        ,FunctionalTextEssayFeature(feature_name="unique_words_norm_raw", fun=lambda essay: unique_words_norm(essay.texts["raw"])/100.0)