# Example no. 1
# 0
 def load_rpy2_rc(self, algorithms_dirpath=''):
     """Attach an R bridge object to ``self.rc``.

     Instantiates ``rpy2_classifiers.Rpy2Classifier`` (the hooks into R)
     and stores it on this object for later classification calls.

     Parameters
     ----------
     algorithms_dirpath : str
         Directory containing the R algorithm code.  When empty (the
         default), falls back to ``self.pars['algorithms_dirpath']``
         (or '' if that key is absent).
     """
     # Idiomatic truthiness test instead of len() == 0; also tolerates
     # a None argument, which len() would reject with a TypeError.
     if not algorithms_dirpath:
         algorithms_dirpath = self.pars.get('algorithms_dirpath', '')
     # NOTE(review): rpy2_classifiers must already be importable here —
     # other methods in this module import it at call time; confirm.
     self.rc = rpy2_classifiers.Rpy2Classifier(
         algorithms_dirpath=algorithms_dirpath)
    def main(self):
        """Compare features computed with different photometric apertures.

        Loads three ARFF feature files (original, small-aperture, and
        large-aperture versions of the same sources), builds a
        srcid -> row-index lookup for each, then summarizes the
        original-vs-large-aperture feature differences via
        ``self.summarize_feats()``.
        """
        algo_code_dirpath = os.path.abspath(os.environ.get("TCP_DIR") + 'Algorithms')
        sys.path.append(algo_code_dirpath)
        import rpy2_classifiers

        self.rc = rpy2_classifiers.Rpy2Classifier(algorithms_dirpath=algo_code_dirpath)

        def _parse_aper_arff(arff_fpath):
            """Parse one aperture ARFF; return (parsed dict, srcid -> row index)."""
            # Close the file handle explicitly instead of leaking it.
            arff_file = open(arff_fpath)
            try:
                arff_str = arff_file.read()
            finally:
                arff_file.close()
            aper_dict = self.rc.parse_full_arff(arff_str=arff_str,
                                                skip_missingval_lines=False)
            srcid_dict = {}
            for i, srcid in enumerate(aper_dict['srcid_list']):
                srcid_dict[srcid] = i
            return aper_dict, srcid_dict

        orig_aper_dict, orig_srcid_dict = _parse_aper_arff(self.pars['orig_aper_arff'])
        small_aper_dict, small_srcid_dict = _parse_aper_arff(self.pars['small_aper_arff'])
        large_aper_dict, large_srcid_dict = _parse_aper_arff(self.pars['large_aper_arff'])

        # NOTE: to compare against the small aperture instead, pass
        # perturb_dict=small_aper_dict / perturb_srcid_dict=small_srcid_dict.
        self.summarize_feats(orig_dict=orig_aper_dict,
                             orig_srcid_dict=orig_srcid_dict,
                             perturb_dict=large_aper_dict,
                             perturb_srcid_dict=large_srcid_dict)
# Example no. 3
# 0
def example_initial_r_randomforest():
    """ Initial example which trains and classifies a R randomForest classifier
    Using 1 40/60 fold of debosscher data.
    """

    algorithms_dirpath = os.path.abspath(os.environ.get("TCP_DIR") + 'Algorithms/')
    sys.path.append(algorithms_dirpath)
    import rpy2_classifiers
    rc = rpy2_classifiers.Rpy2Classifier(algorithms_dirpath=algorithms_dirpath)

    train_arff_str = open(os.path.expandvars("$HOME/scratch/full_deboss_1542srcs_20110106.arff")).read()
    traindata_dict = rc.parse_full_arff(arff_str=train_arff_str)

    Gen_Fold_Classif = rpy2_classifiers.GenerateFoldedClassifiers()
    all_fold_data = Gen_Fold_Classif.generate_fold_subset_data(full_data_dict=traindata_dict,
                                                               n_folds=10,
                                                               do_stratified=False,
                                                               classify_percent=40.)
    i_fold = 0  # of 10 folds
    fold_data = all_fold_data[i_fold]
    do_ignore_NA_features = False
    classifier_fpath = os.path.expandvars("$HOME/scratch/classifier_RF_0.rdata")# % (i_fold))
    Gen_Fold_Classif.generate_R_randomforest_classifier_rdata(train_data=fold_data['train_data'],
                                                     classifier_fpath=classifier_fpath,
                                                     do_ignore_NA_features=do_ignore_NA_features,
                                                     algorithms_dirpath=algorithms_dirpath)

    r_name='rf_clfr'
    classifier_dict = {'class_name':r_name}
    rc.load_classifier(r_name=r_name,
                   fpath=classifier_fpath)
    classif_results = rc.apply_randomforest(classifier_dict=classifier_dict,
                                    data_dict=fold_data['classif_data'],
                                    do_ignore_NA_features=do_ignore_NA_features)

    print "classif_results['error_rate']=", classif_results['error_rate']
    import pdb; pdb.set_trace()
    print
        pars['arff_fpath'] = '/home/dstarr/Data/starvars/combined_acvs.arff'
        skip_missingval_lines = False  # this is False in lcs_classify # we skip sources which have missing values
        mtry = 5
        ntree = 10  # 100 has little difference from 10

        test_arff_str = open(pars['arff_fpath']).read()

        algo_code_dirpath = os.path.abspath(
            os.environ.get("TCP_DIR") + 'Algorithms')
        sys.path.append(algo_code_dirpath)
        import rpy2_classifiers

        tmp_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

        rc = rpy2_classifiers.Rpy2Classifier(
            algorithms_dirpath=algo_code_dirpath)
        sys.stdout.close()
        sys.stdout = tmp_stdout

        testdata_dict = rc.parse_full_arff(
            arff_str=test_arff_str,
            skip_missingval_lines=skip_missingval_lines)
        remove_sources_with_many_missing_attribs(testdata_dict,
                                                 exclude_feats=['freq_signif'])

        # KLUDGEY conversion of missing-value features None into numpy.nan:
        new_featname_longfeatval_dict = {}
        for feat_name, feat_longlist in testdata_dict[
                'featname_longfeatval_dict'].iteritems():
            new_list = []
            if feat_name in ignore_feats_list:
# Example no. 5
# 0
def main():
    """Fetch a time series from a URL, extract features, and classify it.

    Reads a time-series URL from ``sys.argv[1]``, downloads the
    (t, mag, mag_err) tuples, wraps them in VOSource XML using the
    module-level ``head_str``/``tail_str`` templates, derives an ARFF of
    features, applies a pre-trained R randomForest classifier, and
    returns a ``{class_name: probability}`` dict.

    Returns
    -------
    dict
        ``{class_name: probability}``; empty dict when the URL argument
        is missing, the fetch/parse fails, or no epochs were read.
    """
    if len(sys.argv) < 2:
        return {}
    timeseries_url = sys.argv[1]

    t_list = []
    m_list = []
    merr_list = []
    try:
        f = urllib.urlopen(timeseries_url)
        try:
            ts_str = f.read()
        finally:
            # Close the URL handle even when read() fails.
            f.close()
        # SECURITY: eval() of remote content executes arbitrary Python
        # from the time-series server.  Prefer ast.literal_eval() (or a
        # JSON payload) if the data format allows it.
        ts_list = eval(ts_str)
        for tup in ts_list:
            t_list.append(float(tup[0]))
            m_list.append(float(tup[1]))
            merr_list.append(float(tup[2]))
    except Exception:
        # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
        # still propagate; any fetch/parse failure yields an empty result.
        return {}
    if not t_list:
        return {}

    # Render each epoch as a VOSource <TR> row, then wrap with the
    # module-level XML head/tail templates.
    data_str_list = []
    for i, t in enumerate(t_list):
        data_str = '              <TR row="%d"><TD>%lf</TD><TD>%lf</TD><TD>%lf</TD></TR>' % \
                                  (i, t, m_list[i], merr_list[i])
        data_str_list.append(data_str)
    all_data_str = '\n'.join(data_str_list)
    out_xml = head_str + all_data_str + tail_str

    ### This generates an arff, which contains features:
    test_arff_str = generate_arff_using_raw_xml(out_xml)

    algo_code_dirpath = os.path.abspath(
        os.environ.get("TCP_DIR") + 'Algorithms')
    # Silence noisy R/rpy2 start-up chatter while the classifier loads;
    # restore stdout even if construction raises.
    tmp_stdout = sys.stdout
    sys.stdout = open(os.devnull, 'w')
    try:
        rc = rpy2_classifiers.Rpy2Classifier(algorithms_dirpath=algo_code_dirpath)
    finally:
        sys.stdout.close()
        sys.stdout = tmp_stdout

    classifier_dict = rc.read_randomForest_classifier_into_dict(
        r_name='rf.tr',
        r_classifier_fpath="/home/dstarr/scratch/macc_wrapper_rfclfr.rdat")
    # NOTE: see activelearn_utils.py:L696
    # TODO: will want to ensure that the classifier applies to the same features as in the arff_str

    skip_missingval_lines = False  # we skip sources which have missing values

    testdata_dict = rc.parse_full_arff(
        arff_str=test_arff_str, skip_missingval_lines=skip_missingval_lines)

    # features to ignore in the testing data:
    ignore_feats_list = [
        "color_bv_extinction", "color_diff_bj", "color_diff_hk",
        "color_diff_jh", "color_diff_rj", "color_diff_vj", "ar_is_theta",
        "ar_is_sigma", "delta_phase_2minima", "gskew",
        "lcmodel_median_n_per_day", "lcmodel_neg_n_per_day",
        "lcmodel_pos_area_ratio", "lcmodel_pos_mag_ratio",
        "lcmodel_pos_n_per_day", "lcmodel_pos_n_ratio",
        "phase_dispersion_freq0", "freq1_harmonics_rel_phase_0",
        "freq2_harmonics_rel_phase_0", "freq3_harmonics_rel_phase_0",
        "ratio_PDM_LS_freq0", "n_points"
    ]

    classif_results = rc.apply_randomforest(
        classifier_dict=classifier_dict,
        data_dict=testdata_dict,
        return_prediction_probs=True,
        ignore_feats=ignore_feats_list)

    # classif_results['predictions']['tups'] rows look like
    #   (srcid, rank, prob, class_name), e.g. (236518, 0, 0.363, 'b. Semireg PV')
    final_dict = {}
    for tup in classif_results['predictions']['tups']:
        final_dict[tup[3]] = tup[2]

    return final_dict