Example #1
    def generate_data_for_significant_nei_utm_ids():
        # For every UTM cell, write one JSON record containing its hashtag-count
        # vector, the vectors of its valid neighbour cells, and which data-frame
        # column is the prediction variable vs. the predictor variables.
        output_file = GeneralMethods.get_method_id()+'.json'
        so_hashtags, mf_utm_id_to_valid_nei_utm_ids = set(), {}
        for utm_object in \
                FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
            for hashtag, count in utm_object['mf_hashtag_to_count'].iteritems():
                if hashtag!='total_num_of_occurrences': so_hashtags.add(hashtag)
            mf_utm_id_to_valid_nei_utm_ids[utm_object['utm_id']] =\
                                                            utm_object['mf_nei_utm_id_to_common_h_count'].keys()
        hashtags = sorted(list(so_hashtags))
        mf_utm_id_to_vector = {}
        for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
#                print i, utm_object['utm_id']
            utm_id_vector =  map(lambda hashtag: utm_object['mf_hashtag_to_count'].get(hashtag, 0.0),
                                 hashtags)
            mf_utm_id_to_vector[utm_object['utm_id']] = robjects.FloatVector(utm_id_vector)
        for i, (utm_id, vector) in enumerate(mf_utm_id_to_vector.iteritems()):
            print '%s of %s'%(i+1, len(mf_utm_id_to_vector))
            ltuo_utm_id_and_vector = [(utm_id, vector)]
            for valid_nei_utm_id in mf_utm_id_to_valid_nei_utm_ids[utm_id]:
                if valid_nei_utm_id in mf_utm_id_to_vector and valid_nei_utm_id!=utm_id:
                    ltuo_utm_id_and_vector.append((valid_nei_utm_id, mf_utm_id_to_vector[valid_nei_utm_id]))
            od = rlc.OrdDict(sorted(ltuo_utm_id_and_vector, key=itemgetter(0)))
            df_utm_vectors = robjects.DataFrame(od)
            df_utm_vectors_json = R_Helper.get_json_for_data_frame(df_utm_vectors)
            dfm_dict = cjson.decode(df_utm_vectors_json)
            mf_utm_ids_to_utm_colnames = dict(zip(zip(*ltuo_utm_id_and_vector)[0], df_utm_vectors.colnames))
            utm_id_colname = mf_utm_ids_to_utm_colnames[utm_id]
            dfm_dict['prediction_variable'] = utm_id_colname
            dfm_dict['predictor_variables'] = filter(lambda colname: colname!=utm_id_colname,
                                                     df_utm_vectors.colnames)
            dfm_dict['mf_utm_colnames_to_utm_ids'] = dict(zip(df_utm_vectors.colnames, zip(*ltuo_utm_id_and_vector)[0]))
            FileIO.writeToFileAsJson(dfm_dict, output_file)
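Example #1 relies on R_Helper.get_json_for_data_frame to serialize the rpy2 DataFrame so the MapReduce mapper in Example #2 can rebuild it. That helper is not shown on this page; a minimal sketch of what the round trip could look like, assuming the JSON payload is simply a mapping of column name to column values (the real helper may differ), is:

import json
import rpy2.robjects as robjects

def get_json_for_data_frame(df):
    # Hypothetical counterpart of R_Helper.get_json_for_data_frame:
    # serialize an rpy2 DataFrame as {column_name: [values, ...]}.
    return json.dumps(dict((name, list(column)) for name, column in zip(df.colnames, df)))

def get_data_frame_from_json(df_json):
    # Hypothetical counterpart of R_Helper.get_data_frame_from_json:
    # rebuild the DataFrame from the same {column_name: [values, ...]} mapping.
    data = json.loads(df_json)
    return robjects.DataFrame(dict((name, robjects.FloatVector(values))
                                   for name, values in data.items()))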
Example #2
 def mapper(self, key, line):
     # MapReduce map step: rebuild a data-frame record like the ones written in
     # Example #1, run backward elimination, and emit the UTM id together with
     # the neighbour UTM ids whose columns were selected as significant.
     data_for_df = cjson.decode(line)
     prediction_variable = data_for_df['prediction_variable']
     predictor_variables = data_for_df['predictor_variables']
     mf_utm_colnames_to_utm_ids = data_for_df['mf_utm_colnames_to_utm_ids']
     del data_for_df['prediction_variable']
     del data_for_df['predictor_variables']
     del data_for_df['mf_utm_colnames_to_utm_ids']
     df_utm_vectors = R_Helper.get_data_frame_from_json(cjson.encode(data_for_df))
     selected_utm_colnames = R_Helper.variable_selection_using_backward_elimination(
                                                                                    df_utm_vectors,
                                                                                    prediction_variable,
                                                                                    predictor_variables
                                                                                 )
     utm_id = mf_utm_colnames_to_utm_ids[prediction_variable]
     nei_utm_ids = [mf_utm_colnames_to_utm_ids[selected_utm_colname]
                    for selected_utm_colname in selected_utm_colnames]
     yield utm_id, {'utm_id': utm_id, 'nei_utm_ids': nei_utm_ids}
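The mapper delegates the actual feature selection to R_Helper.variable_selection_using_backward_elimination, whose implementation is not reproduced here. A rough sketch of the usual backward-elimination loop such a helper performs, written directly against rpy2 and R's lm (the real helper's signature and internals may differ), is:

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr

stats = importr('stats')
base = importr('base')

def variable_selection_using_backward_elimination(df, prediction_variable, predictor_variables,
                                                  p_value_threshold=0.05):
    # Hypothetical sketch: repeatedly fit an OLS model and drop the least
    # significant predictor until every remaining predictor's p-value is
    # below the threshold.
    predictors = list(predictor_variables)
    while predictors:
        formula = robjects.Formula('%s ~ %s' % (prediction_variable, ' + '.join(predictors)))
        fit = stats.lm(formula, data=df)
        # summary(fit)$coefficients is a matrix whose 4th column holds p-values;
        # row 1 is the intercept, the remaining rows follow the predictor order.
        coefficient_matrix = base.summary(fit).rx2('coefficients')
        p_values = list(coefficient_matrix.rx(True, 4))[1:]
        worst_index, worst_p_value = max(enumerate(p_values), key=lambda pair: pair[1])
        if worst_p_value <= p_value_threshold:
            break
        predictors.pop(worst_index)
    return predictors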
Example #3
 def _get_parameter_names_to_values(train_feature_vectors):
     # Fit a linear regression of value_to_predict on the individual models'
     # scores (LIST_OF_MODELS) over the training feature vectors and return
     # the learnt parameter values.
     mf_column_name_to_column_data = defaultdict(list)
     train_feature_vectors = map(itemgetter('feature_vector'), train_feature_vectors)
     for feature_vector in train_feature_vectors:
         if feature_vector['value_to_predict']:
             mf_column_name_to_column_data['value_to_predict'].append(feature_vector['value_to_predict'])
             for column_name in LIST_OF_MODELS:
                 mf_column_name_to_column_data[column_name].append(feature_vector.get(column_name, 0.0))
     data = {}
     for column_name, column_data in mf_column_name_to_column_data.iteritems():
         data[column_name] = robjects.FloatVector(column_data)
     if data:
         data_frame = robjects.DataFrame(data)
         prediction_variable = 'value_to_predict'
         predictor_variables = LIST_OF_MODELS
         model = R_Helper.linear_regression_model(
                                                  data_frame,
                                                  prediction_variable,
                                                  predictor_variables,
 #                                                 with_variable_selection=True
                                                 )
         return R_Helper.get_parameter_values(model)
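R_Helper.linear_regression_model and R_Helper.get_parameter_values are also external to this page. Assuming the first is a thin wrapper around R's lm and the second just reads the fitted coefficients back out (both are assumptions, not the project's actual code), they could look roughly like:

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr

stats = importr('stats')

def linear_regression_model(data_frame, prediction_variable, predictor_variables):
    # Hypothetical sketch: fit value_to_predict ~ model_1 + model_2 + ... with R's lm.
    formula = robjects.Formula('%s ~ %s' % (prediction_variable, ' + '.join(predictor_variables)))
    return stats.lm(formula, data=data_frame)

def get_parameter_values(model):
    # Hypothetical sketch: return the fitted coefficients as
    # [('(Intercept)', b0), ('model_1', b1), ...]; the get_performance_metrics
    # example below turns these pairs into its mf_parameter_names_to_values dict.
    coefficients = model.rx2('coefficients')
    return list(zip(coefficients.names, list(coefficients)))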
Example #4
    def significant_nei_utm_ids():
        # Single-process variant of the pipeline in Examples #1 and #2: build one
        # data frame with a column per UTM cell, then for each cell run backward
        # elimination over its valid neighbours and write the surviving neighbour ids.
        mf_utm_id_to_valid_nei_utm_ids = {}
        def get_utm_vectors():
            so_hashtags = set()
            for utm_object in \
                    FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
                for hashtag, count in utm_object['mf_hashtag_to_count'].iteritems():
                    if hashtag!='total_num_of_occurrences': so_hashtags.add(hashtag)
                mf_utm_id_to_valid_nei_utm_ids[utm_object['utm_id']] =\
                                                                utm_object['mf_nei_utm_id_to_common_h_count'].keys()
            hashtags, ltuo_utm_id_and_vector = sorted(list(so_hashtags)), []
            for i, utm_object in enumerate(FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True)):
#                print i, utm_object['utm_id']
                utm_id_vector =  map(lambda hashtag: utm_object['mf_hashtag_to_count'].get(hashtag, 0.0),
                                     hashtags)
                ltuo_utm_id_and_vector.append((utm_object['utm_id'], 
                                               robjects.FloatVector(utm_id_vector)))
            od = rlc.OrdDict(sorted(ltuo_utm_id_and_vector, key=itemgetter(0)))
            df_utm_vectors = robjects.DataFrame(od)
            return df_utm_vectors
        output_file = fld_google_drive_data_analysis%GeneralMethods.get_method_id()
        df_utm_vectors = get_utm_vectors()
#        print df_utm_vectors.nrow
#        exit()
        utm_colnames = df_utm_vectors.colnames
        mf_utm_id_to_utm_colnames = dict(zip(sorted(mf_utm_id_to_valid_nei_utm_ids), utm_colnames))
        mf_utm_colnames_to_utm_id = dict(zip(utm_colnames, sorted(mf_utm_id_to_valid_nei_utm_ids)))
        for i, utm_colname in enumerate(utm_colnames):
            utm_id = mf_utm_colnames_to_utm_id[utm_colname]
            prediction_variable = utm_colname
            print i, utm_id
            predictor_variables = [mf_utm_id_to_utm_colnames[valid_nei_utm_ids]
                                    for valid_nei_utm_ids in mf_utm_id_to_valid_nei_utm_ids[utm_id]
                                        if valid_nei_utm_ids in mf_utm_id_to_utm_colnames and
                                           valid_nei_utm_ids != utm_id ]
            selected_utm_colnames =  R_Helper.variable_selection_using_backward_elimination(
                                                                                               df_utm_vectors,
                                                                                               prediction_variable,
                                                                                               predictor_variables,
                                                                                               debug=True
                                                                                            )
            nei_utm_ids = [mf_utm_colnames_to_utm_id[selected_utm_colname]
                                for selected_utm_colname in selected_utm_colnames]
            print 'Writing to: ', output_file
            FileIO.writeToFileAsJson({'utm_id': utm_id, 'nei_utm_ids': nei_utm_ids}, output_file)
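Both Example #1 and Example #4 call FileIO.writeToFileAsJson once per record, and the reader FileIO.iterateJsonFromFile walks the records back one at a time, which suggests a line-delimited JSON layout. Under that assumption (FileIO itself is not shown here), the output written above could be consumed like this:

import json

def iterate_json_from_file(input_file):
    # Hypothetical stand-in for FileIO.iterateJsonFromFile,
    # assuming one JSON object per line.
    with open(input_file) as fp:
        for line in fp:
            line = line.strip()
            if line:
                yield json.loads(line)

# e.g. collect the selected neighbours per UTM cell:
# mf_utm_id_to_nei_utm_ids = dict((record['utm_id'], record['nei_utm_ids'])
#                                 for record in iterate_json_from_file(output_file))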
Example #5
 def get_performance_metrics(feature_vectors, *args, **kwargs):
     # Learn regression parameters on the training split, then, per time unit,
     # rank the test hashtags by predicted score and compute accuracy and impact
     # at each cut-off in NUM_OF_HASHTAGS.
     train_feature_vectors, test_feature_vectors = split_feature_vectors_into_test_and_training(feature_vectors)
     filtered_train_feature_vectors = filter(lambda fv: len(fv['feature_vector'])>1, train_feature_vectors)
     filtered_test_feature_vectors = filter(lambda fv: len(fv['feature_vector'])>1, test_feature_vectors)
     
     if filtered_train_feature_vectors and filtered_test_feature_vectors:
         parameter_names_to_values = LearningToRank._get_parameter_names_to_values(filtered_train_feature_vectors)
         if parameter_names_to_values:
             accuracy_mf_num_of_hashtags_to_metric_values = defaultdict(list)
             impact_mf_num_of_hashtags_to_metric_values = defaultdict(list)
             mf_parameter_names_to_values = dict(parameter_names_to_values)
             test_feature_vectors.sort(key=itemgetter('tu'))
             ltuo_tu_and_ltuo_hashtag_and_actual_score_and_feature_vector =\
                                                 [(tu, map(
                                                           itemgetter('hashtag', 'actual_score', 'feature_vector'),
                                                           it_feature_vectors)
                                                       )
                                                     for tu, it_feature_vectors in 
                                                         groupby(test_feature_vectors, key=itemgetter('tu'))
                                                 ]
                 
             for tu, ltuo_hashtag_and_actual_score_and_feature_vector in \
                     ltuo_tu_and_ltuo_hashtag_and_actual_score_and_feature_vector:
                 for _, __, fv in ltuo_hashtag_and_actual_score_and_feature_vector: del fv['value_to_predict']
                 ltuo_hashtag_and_actual_score_and_predicted_score =\
                                 map(lambda (hashtag, actual_score, feature_vector): 
                                         (
                                          hashtag,
                                          actual_score,
                                          R_Helper.get_predicted_value(mf_parameter_names_to_values, feature_vector)
                                         ),
                                     ltuo_hashtag_and_actual_score_and_feature_vector)
                 ltuo_hashtag_and_actual_score = [ (hashtag, actual_score)
                                                  for hashtag, actual_score, _ in
                                                         ltuo_hashtag_and_actual_score_and_predicted_score 
                                                     if actual_score!=None]
                 ltuo_hashtag_and_predicted_score = map(
                                                     itemgetter(0,2),
                                                     ltuo_hashtag_and_actual_score_and_predicted_score
                                                     )
                 
                 if ltuo_hashtag_and_actual_score and ltuo_hashtag_and_predicted_score:
                     
                     ltuo_hashtag_and_actual_score = sorted(
                                                            ltuo_hashtag_and_actual_score,
                                                            key=itemgetter(1),
                                                            reverse=True
                                                            )
                     ltuo_hashtag_and_predicted_score = sorted(
                                                            ltuo_hashtag_and_predicted_score,
                                                            key=itemgetter(1),
                                                            reverse=True
                                                            )
                     
                     for num_of_hashtags in NUM_OF_HASHTAGS:
                         hashtags_dist = dict(ltuo_hashtag_and_actual_score)
                         actual_ordering_of_hashtags = zip(*ltuo_hashtag_and_actual_score)[0]
                         predicted_ordering_of_hashtags = zip(*ltuo_hashtag_and_predicted_score)[0]
                         
                         accuracy = EvaluationMetric.accuracy(
                                                               actual_ordering_of_hashtags[:num_of_hashtags],
                                                               predicted_ordering_of_hashtags[:num_of_hashtags],
                                                               num_of_hashtags
                                                             )
                         impact = EvaluationMetric.impact(
                                                         actual_ordering_of_hashtags[:num_of_hashtags],
                                                         predicted_ordering_of_hashtags[:num_of_hashtags],
                                                         hashtags_dist
                                                       )
                         accuracy_mf_num_of_hashtags_to_metric_values[num_of_hashtags].append(accuracy)
                         impact_mf_num_of_hashtags_to_metric_values[num_of_hashtags].append(impact)
             return (accuracy_mf_num_of_hashtags_to_metric_values, impact_mf_num_of_hashtags_to_metric_values)
     return {}, {}
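Scoring a test feature vector only needs the learnt coefficients, so R_Helper.get_predicted_value is plausibly an intercept plus a dot product over the named parameters. A sketch under that assumption (the actual helper may differ):

def get_predicted_value(mf_parameter_names_to_values, feature_vector):
    # Hypothetical sketch: evaluate the fitted linear model on one feature vector,
    # i.e. intercept + sum(coefficient * feature value), treating absent features as 0.
    predicted_value = mf_parameter_names_to_values.get('(Intercept)', 0.0)
    for parameter_name, parameter_value in mf_parameter_names_to_values.items():
        if parameter_name != '(Intercept)':
            predicted_value += parameter_value * feature_vector.get(parameter_name, 0.0)
    return predicted_value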