    def write_arff_using_Maker(self, master_features, all_class_list,
                               master_classes, master_list,
                               out_arff_fpath='',
                               n_sources_needed_for_class_inclusion=10):
        """ Use an arffify.Maker instance to write a .arff file.
        """
        a = arffify.Maker(search=[],
                          skip_class=False,
                          local_xmls=True,
                          convert_class_abrvs_to_names=False,
                          flag_retrieve_class_abrvs_from_TUTOR=False,
                          dorun=False,
                          add_srcid_to_arff=True)
        a.master_features = master_features
        a.all_class_list = all_class_list
        a.master_classes = master_classes
        a.master_list = master_list
        a.write_arff(outfile=out_arff_fpath,
                     remove_sparse_classes=True,
                     n_sources_needed_for_class_inclusion=n_sources_needed_for_class_inclusion)
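# A minimal sketch of the inputs the wrapper above expects, inferred from the
# feature-collection code in the later examples; the feature name, values, and
# the `pipeline` instance are hypothetical:
master_list = [{'class': 'RR Lyrae', 'features': [('freq1_harmonics_freq_0', 1.234)]}]
all_class_list = ['RR Lyrae']
master_classes = ['RR Lyrae']
master_features = [('freq1_harmonics_freq_0', 1.234)]
# pipeline.write_arff_using_Maker(master_features, all_class_list,
#                                 master_classes, master_list,
#                                 out_arff_fpath='/tmp/out.arff')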
Example #2
    def __init__(self, class_schema_definition_dicts=None, class_abrv_lookup=None,
                 use_weka_jvm=True, training_arff_features_list=None):
        # Use None defaults to avoid the shared-mutable-default-argument pitfall:
        self.class_schema_definition_dicts = class_schema_definition_dicts or {}
        self.training_arff_features_list = training_arff_features_list or []
        self.arffmaker = arffify.Maker(search=[],
                                       skip_class=True, local_xmls=True, dorun=False,
                                       class_abrv_lookup=class_abrv_lookup or {})
        if use_weka_jvm:
            # NOTE: a WekaClassifier() instance is needed for each WEKA
            #       classification task that uses a different .model and/or
            #       training .arff.
            # Initialize a Java virtual machine for WEKA classifications:
            if not jpype.isJVMStarted():
                _jvmArgs = ["-ea"]  # enable assertions (TODO: drop for speed once stable)
                _jvmArgs.append("-Djava.class.path=" + os.environ["CLASSPATH"])
                # 4000m & 5000m work, but 3500m doesn't for some WEKA .models:
                _jvmArgs.append("-Xmx12000m")
                jpype.startJVM(jpype.getDefaultJVMPath(), *_jvmArgs)

            class_schema_name_list = list(self.class_schema_definition_dicts.keys())
            # Exclude these schemas from the WEKA classifier setup:
            for excluded_name in ('mlens3 MicroLens', 'Dovi SN', 'General'):
                if excluded_name in class_schema_name_list:
                    class_schema_name_list.remove(excluded_name)
            self.wc = {}
            for class_schema_name in class_schema_name_list:
                class_schema_dict = self.class_schema_definition_dicts[
                    class_schema_name]
                weka_training_model_fpath = class_schema_dict[
                    'weka_training_model_fpath']
                weka_training_arff_fpath = class_schema_dict[
                    'weka_training_arff_fpath']
                self.wc[class_schema_name] = weka_classifier.WekaClassifier(
                    weka_training_model_fpath, weka_training_arff_fpath)
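# A sketch of the class_schema_definition_dicts structure the constructor
# above consumes, inferred from the keys it accesses; the 'Debosscher' schema
# name and both paths are hypothetical placeholders:
class_schema_definition_dicts = {
    'General': {},  # excluded from the WEKA classifier setup above
    'Debosscher': {
        'weka_training_model_fpath': '/path/to/debosscher.model',
        'weka_training_arff_fpath': '/path/to/debosscher.arff',
    },
}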
Example #3
if __name__ == '__main__':
    ### NOTE: this __main__ section will only be executed when the python script is called like:
    ###   python generate_arff_using_xml.py
    ###         or
    ###   ./generate_arff_using_xml.py

    xml_dirpath = "/home/dstarr/scratch/xml_list"
    out_arff_filepath = "/tmp/gen.arff"  # to be written

    filepaths = glob.glob("%s/*xml" % (xml_dirpath))

    vosource_list = []
    for num, fpath in enumerate(filepaths):
        ### NOTE: 'num' is used as a pseudo source-id for identification in the .arff file.
        vosource_list.append((str(num), fpath))  # NOTE: a tuple of this form is required.

    a = arffify.Maker(search=[],
                      skip_class=False,
                      local_xmls=True,
                      convert_class_abrvs_to_names=False,
                      flag_retrieve_class_abrvs_from_TUTOR=False,
                      dorun=False,
                      add_srcid_to_arff=True)
    # An empty skip list means every source is added to the .arff, regardless
    # of how ambiguous its classification is:
    a.pars['skip_sci_class_list'] = []
    a.populate_features_and_classes_using_local_xmls(
        srcid_xml_tuple_list=vosource_list)
    a.write_arff(outfile=out_arff_filepath,
                 remove_sparse_classes=False)
    # NOTE: passing remove_sparse_classes=True together with, e.g.,
    # n_sources_needed_for_class_inclusion=10 would instead exclude
    # science classes that have too few example sources.
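    # For reference, the file written above is a standard WEKA ARFF document.
    # An illustrative (not actual) sketch of its shape, assuming one numeric
    # feature and the source-id attribute enabled by add_srcid_to_arff=True:
    illustrative_arff_shape = """@RELATION ts
@ATTRIBUTE source_id STRING
@ATTRIBUTE freq1_harmonics_freq_0 NUMERIC
@ATTRIBUTE class {'RR Lyrae','W Ursae Majoris'}
@DATA
0,1.234,'RR Lyrae'
"""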
Example #4
def generate_arff_using_raw_xml(xml_str):
    """ Given a raw VOSource XML string, generate its features and return a
    single-source ARFF string.
    """
    master_list = []
    master_features_dict = {}
    all_class_list = []
    master_classes_dict = {}

    new_srcid = 1
    include_arff_header = True

    ### Generate the features (silencing noisy stdout from the generators):
    tmp_stdout = sys.stdout
    sys.stdout = open(os.devnull, 'w')
    signals_list = []
    gen = generators_importers.from_xml(signals_list)
    gen.generate(xml_handle=xml_str)
    gen.sig.add_features_to_xml_string(signals_list)
    gen.sig.x_sdict['src_id'] = new_srcid
    dbi_src = db_importer.Source(make_dict_if_given_xml=False)
    dbi_src.source_dict_to_xml(gen.sig.x_sdict)
    sys.stdout.close()
    sys.stdout = tmp_stdout

    xml_fpath = dbi_src.xml_string  # NOTE: despite the name, this holds the XML document itself.

    a = arffify.Maker(search=[],
                      skip_class=False,
                      local_xmls=True,
                      convert_class_abrvs_to_names=False,
                      flag_retrieve_class_abrvs_from_TUTOR=False,
                      dorun=False)
    out_dict = a.generate_arff_line_for_vosourcexml(num=new_srcid,
                                                    xml_fpath=xml_fpath)

    master_list.append(out_dict)
    all_class_list.append(out_dict['class'])
    master_classes_dict[out_dict['class']] = 0
    for feat_tup in out_dict['features']:
        master_features_dict[feat_tup] = 0  # ensure the key exists; 0 is filler

    master_features = master_features_dict.keys()
    master_classes = master_classes_dict.keys()
    a = arffify.Maker(search=[],
                      skip_class=True,
                      local_xmls=True,
                      convert_class_abrvs_to_names=False,
                      flag_retrieve_class_abrvs_from_TUTOR=False,
                      dorun=False,
                      add_srcid_to_arff=True)
    a.master_features = master_features
    a.all_class_list = all_class_list
    a.master_classes = master_classes
    a.master_list = master_list

    fp_out = cStringIO.StringIO()
    a.write_arff(outfile=fp_out,
                 remove_sparse_classes=True,
                 n_sources_needed_for_class_inclusion=1,
                 include_header=include_arff_header,
                 use_str_srcid=True)
    arff_str = fp_out.getvalue()
    return arff_str
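# A minimal usage sketch for the function above (Python 2; the XML path is a
# hypothetical placeholder):
if __name__ == '__main__':
    raw_xml = open('/tmp/vosource_example.xml').read()
    print generate_arff_using_raw_xml(raw_xml)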
Example #5
    def generate_arff_using_asasdat(self,
                                    data_fpaths=[],
                                    include_arff_header=False,
                                    arff_output_fp=None):
        """ Given a list of LINEAR data file filepaths, for each source/file:
        - choose the optimal aperture, depending upon median magnitude (ASAS only!)
        - exclude bad/flagged epochs
        - generate features from the timeseries (placed in an intermediate XML-string format)
        - collect the resulting features for all given sources into an ARFF-style
          file, which is later read by ML training/classification code.

        Partially adapted from: TCP/Software/citris33/arff_generation_master_using_generic_ts_data.py:get_dat_arffstrs()
        """
        import tutor_database_project_insert
        adt = tutor_database_project_insert.ASAS_Data_Tools(pars=pars)
        adt.frame_limitmags = self.retrieve_limitmags_from_pkl()

        sys.path.append(
            os.environ.get('TCP_DIR') + '/Software/feature_extract/MLData')
        import arffify

        sys.path.append(os.path.abspath(os.environ.get("TCP_DIR") +
                                        '/Software/feature_extract/Code'))
        import db_importer
        from data_cleaning import sigmaclip_sdict_ts
        sys.path.append(os.path.abspath(os.environ.get("TCP_DIR") +
                                        '/Software/feature_extract'))
        from Code import generators_importers

        master_list = []
        master_features_dict = {}
        all_class_list = []
        master_classes_dict = {}

        for dat_fpath in data_fpaths:
            new_srcid = dat_fpath[dat_fpath.rfind('/') +
                                  1:dat_fpath.rfind('.dat')]
            ts_str = open(dat_fpath).read()
            source_intermed_dict = adt.parse_asas_ts_data_str(ts_str)
            # TODO: an aperture-selection function like this is needed for
            #       LINEAR data; the ASAS version serves as a placeholder so
            #       that mag_data_dict is defined:
            mag_data_dict = adt.filter_best_ts_aperture(source_intermed_dict)

            xml_str = self.form_xml_string(mag_data_dict)

            ### Generate the features:
            signals_list = []
            gen = generators_importers.from_xml(signals_list)
            gen.generate(xml_handle=xml_str)
            gen.sig.add_features_to_xml_string(signals_list)
            gen.sig.x_sdict['src_id'] = new_srcid
            dbi_src = db_importer.Source(make_dict_if_given_xml=False)
            dbi_src.source_dict_to_xml(gen.sig.x_sdict)

            xml_fpath = dbi_src.xml_string

            a = arffify.Maker(search=[],
                              skip_class=False,
                              local_xmls=True,
                              convert_class_abrvs_to_names=False,
                              flag_retrieve_class_abrvs_from_TUTOR=False,
                              dorun=False)
            out_dict = a.generate_arff_line_for_vosourcexml(
                num=new_srcid, xml_fpath=xml_fpath)

            master_list.append(out_dict)
            all_class_list.append(out_dict['class'])
            master_classes_dict[out_dict['class']] = 0
            for feat_tup in out_dict['features']:
                master_features_dict[feat_tup] = 0  # ensure the key exists; 0 is filler

        master_features = master_features_dict.keys()
        master_classes = master_classes_dict.keys()
        a = arffify.Maker(search=[],
                          skip_class=False,
                          local_xmls=True,
                          convert_class_abrvs_to_names=False,
                          flag_retrieve_class_abrvs_from_TUTOR=False,
                          dorun=False,
                          add_srcid_to_arff=True)
        a.master_features = master_features
        a.all_class_list = all_class_list
        a.master_classes = master_classes
        a.master_list = master_list


        a.write_arff(outfile=arff_output_fp,
                     remove_sparse_classes=True,
                     n_sources_needed_for_class_inclusion=1,
                     include_header=include_arff_header,
                     use_str_srcid=True)

class StarVars_LINEAR_Feature_Generation:
    """ Generate features for LINEAR timeseries sources and write them into
    WEKA-ready .arff files.
    """
    def __init__(self, pars=None):
        self.head_str = """<?xml version="1.0"?>
<VOSOURCE version="0.04">
	<COOSYS ID="J2000" equinox="J2000." epoch="J2000." system="eq_FK5"/>
  <history>
    <created datetime="2009-12-02 20:56:18.880560" codebase="db_importer.pyc" codebase_version="9-Aug-2007"/>
  </history>
  <ID>6930531</ID>
  <WhereWhen>
    <Description>Best positional information of the source</Description>
    <Position2D unit="deg">
      <Value2>
        <c1>323.47114731</c1>
        <c2>-0.79916734036</c2>
      </Value2>
      <Error2>
        <c1>0.000277777777778</c1>
        <c2>0.000277777777778</c2>
      </Error2>
    </Position2D>
  </WhereWhen>
  <VOTimeseries version="0.04">
    <TIMESYS>
			<TimeType ucd="frame.time.system?">MJD</TimeType> 
			<TimeZero ucd="frame.time.zero">0.0 </TimeZero>
			<TimeSystem ucd="frame.time.scale">UTC</TimeSystem> 
			<TimeRefPos ucd="pos;frame.time">TOPOCENTER</TimeRefPos>
		</TIMESYS>

    <Resource name="db photometry">
        <TABLE name="v">
          <FIELD name="t" ID="col1" system="TIMESYS" datatype="float" unit="day"/>
          <FIELD name="m" ID="col2" ucd="phot.mag;em.opt.v" datatype="float" unit="mag"/>
          <FIELD name="m_err" ID="col3" ucd="stat.error;phot.mag;em.opt.v" datatype="float" unit="mag"/>
          <DATA>
            <TABLEDATA>
"""

        self.tail_str = """              </TABLEDATA>
            </DATA>
          </TABLE>
        </Resource>
      </VOTimeseries>
</VOSOURCE>"""

        self.pars = pars or {}


    def write_limitmags_into_pkl(self, frame_limitmags):
        """ Write the adt.frame_limitmags dictionary into a gzipped pickle
        file; the data was originally retrieved from MySQL via
        adt.retrieve_fullcat_frame_limitmags().
        """
        import cPickle
        import gzip
        fp = gzip.open(self.pars['limitmags_pkl_gz_fpath'], 'w')
        cPickle.dump(frame_limitmags, fp, 1)  # protocol 1: binary pickle
        fp.close()


    def retrieve_limitmags_from_pkl(self):
        """ Load the adt.frame_limitmags dictionary from its gzipped pickle
        file; the data was originally retrieved from MySQL via
        adt.retrieve_fullcat_frame_limitmags().
        """
        import cPickle
        import gzip
        fp = gzip.open(self.pars['limitmags_pkl_gz_fpath'],'rb')
        frame_limitmags = cPickle.load(fp)
        fp.close()
        return frame_limitmags

    
    def form_xml_string(self, mag_data_dict):
        """ Take timeseries dict data and place it into the VOSource XML format
        which the TCP feature-generation code expects.

        Adapted from: TCP/Software/feature_extract/format_csv_getfeats.py
        """
        
        data_str_list = []

        for i, t in enumerate(mag_data_dict['t']):
            m = mag_data_dict['m'][i]
            m_err = mag_data_dict['merr'][i]
            data_str = '              <TR row="%d"><TD>%lf</TD><TD>%lf</TD><TD>%lf</TD></TR>' % \
                (i, t, m, m_err)
            data_str_list.append(data_str)
            
        all_data_str = '\n'.join(data_str_list)
        out_xml = self.head_str + all_data_str + self.tail_str

        return out_xml


    def generate_arff_using_asasdat(self, xml_data=[], include_arff_header=False, arff_output_fp=None):
        """ Given a list of VOSource XML sources (dict-like objects keyed by
        'ID'), for each source:
        - generate features from the timeseries (already in VOSource XML format)
        - collect the resulting features for all given sources into an
          ARFF-style file, which is later read by ML training/classification code.

        Partially adapted from: TCP/Software/citris33/arff_generation_master_using_generic_ts_data.py:get_dat_arffstrs()
        """
        import tutor_database_project_insert
        adt = tutor_database_project_insert.ASAS_Data_Tools(pars=pars)  # retained from the ASAS variant; unused here


        sys.path.append(os.environ.get('TCP_DIR') + '/Software/feature_extract/MLData')
        import arffify

        sys.path.append(os.path.abspath(os.environ.get("TCP_DIR") +
                                        '/Software/feature_extract/Code'))
        import db_importer
        from data_cleaning import sigmaclip_sdict_ts
        sys.path.append(os.path.abspath(os.environ.get("TCP_DIR") +
                                        '/Software/feature_extract'))
        from Code import generators_importers

        master_list = []
        master_features_dict = {}
        all_class_list = []
        master_classes_dict = {}

        for xml_str in xml_data:
            new_srcid = xml_str['ID']
            # NOTE: unlike the .dat-based variant above, no timeseries parsing
            #       or aperture selection is needed here: the data already
            #       arrives as VOSource XML.
            
            ### Generate the features:
            signals_list = []
            gen = generators_importers.from_xml(signals_list)
            gen.generate(xml_handle=xml_str)
            gen.sig.add_features_to_xml_string(signals_list)                
            gen.sig.x_sdict['src_id'] = new_srcid
            dbi_src = db_importer.Source(make_dict_if_given_xml=False)
            dbi_src.source_dict_to_xml(gen.sig.x_sdict)

            xml_fpath = dbi_src.xml_string

            a = arffify.Maker(search=[], skip_class=False, local_xmls=True,
                              convert_class_abrvs_to_names=False,
                              flag_retrieve_class_abrvs_from_TUTOR=False,
                              dorun=False)
            out_dict = a.generate_arff_line_for_vosourcexml(num=new_srcid,
                                                            xml_fpath=xml_fpath)

            master_list.append(out_dict)
            all_class_list.append(out_dict['class'])
            master_classes_dict[out_dict['class']] = 0
            for feat_tup in out_dict['features']:
                master_features_dict[feat_tup] = 0  # ensure the key exists; 0 is filler


        master_features = master_features_dict.keys()
        master_classes = master_classes_dict.keys()
        a = arffify.Maker(search=[], skip_class=False, local_xmls=True, 
                          convert_class_abrvs_to_names=False,
                          flag_retrieve_class_abrvs_from_TUTOR=False,
                          dorun=False, add_srcid_to_arff=True)
        a.master_features = master_features
        a.all_class_list = all_class_list
        a.master_classes = master_classes
        a.master_list = master_list


        a.write_arff(outfile=arff_output_fp,
                     remove_sparse_classes=True,
                     n_sources_needed_for_class_inclusion=1,
                     include_header=include_arff_header,
                     use_str_srcid=True)

    def spawn_off_arff_line_tasks(self, vosource_xml_dirpath):
        """ Spawn off IPython task clients which take vosource.xml fpaths and
        generate the feature/class structures used to create .arff lines.
        The task results should be 'pulled' and then inserted into a final
        WEKA .arff file.
        """
        ##### For testing:
        skipped_deb_srcids = [
            '12645', '12646', '12649', '12653', '12655', '12656', '12658',
            '12660', '12670', '12675', '12700', '12745', '12766', '12797',
            '12798', '12806', '12841', '12847', '12849', '12850', '12851',
            '12852', '12853', '12854', '12856', '12858', '12861', '12864',
            '12868', '12869', '12870', '12875', '12879', '12882', '12885',
            '12886', '12888', '12890', '12891', '12893', '12895', '12901',
            '12904', '12907', '12909', '12914', '12915', '12921', '12923',
            '12924', '12928', '12930', '12932', '12933', '12934', '12936',
            '12941', '12948', '12950', '12957', '12958', '12960', '12961',
            '12970', '13007', '13024', '13034', '13059', '13076', '13078',
            '13091', '13094', '13119', '13122', '13128', '13156', '13170',
            '13172', '13239', '13242', '13246', '13247', '13261', '13268',
            '13280', '13324', '13333', '13354', '13360', '13362', '13369',
            '13374', '13402', '13418', '13420', '13421', '13423', '13424',
            '13425', '13427', '13429', '13432', '13433', '13439', '13440',
            '13442', '13443', '13444', '13448', '13458', '13462', '13465',
            '13466', '13469', '13471', '13476', '13477', '13478', '13480',
            '13481', '13483', '13484', '13491', '13493', '13495', '13500',
            '13502', '13505', '13511', '13519', '13520', '13521', '13530',
            '13535', '13543', '13544', '13552', '13553', '13560', '13561',
            '13564', '13565', '13571', '13573', '13577', '13580', '13582',
            '13591', '13594', '13596', '13602', '13607', '13608', '13616',
            '13618', '13622', '13623', '13625', '13630', '13632', '13638',
            '13642', '13646', '13647', '13650', '13656', '13657', '13668',
            '13676', '13678', '13680', '13686', '13687', '13689', '13690',
            '13692', '13694', '13695', '13698', '13701', '13703', '13704',
            '13708', '13712', '13716', '13717', '13718', '13719', '13722',
            '13723', '13731', '13733', '13739', '13740', '13743', '13744',
            '13747', '13748', '13750', '13760', '13763', '13774', '13776',
            '13777', '13780', '13782', '13783', '13784', '13786', '13788',
            '13793', '13800', '13804', '13806', '13810', '13814', '13815',
            '13819', '13824', '13826', '13832', '13833', '13838', '13843',
            '13847', '13851', '13854', '13858', '13860', '13869', '13873',
            '13881', '13882', '13885', '13888', '13889', '13890', '13892',
            '13893', '13894', '13896', '13898', '13900', '13906', '13911',
            '13922', '13927', '13928', '13929', '13936', '13938', '13942',
            '13944', '13951', '13955', '13957', '13958', '13959', '13962',
            '13965', '13972', '13974', '13988', '13989', '13996', '13997',
            '13998', '14004', '14006', '14009', '14010', '14017', '14018',
            '14024', '14025', '14028', '14029', '14032', '14035', '14043',
            '14047', '14048', '14051', '14055', '14056', '14065', '14066',
            '14070', '14071', '14072', '14087', '14088', '14089', '14093',
            '14095', '14104', '14108', '14109', '14113', '14117', '14120',
            '14122', '14125', '14129', '14133', '14136', '14137', '14151',
            '14155', '14157', '14163', '14166', '14167', '14168', '14174',
            '14175', '14181', '14182', '14186', '14191', '14194', '14198',
            '14205', '14206', '14216', '14218', '14219', '14225', '14226',
            '14234', '14239', '14243', '14244', '14246', '14247', '14248',
            '14250', '14251', '14255', '14256', '14263', '14269', '14275',
            '14280', '14282'
        ]
        import dotastro_sciclass_tools
        dst = dotastro_sciclass_tools.Dotastro_Sciclass_Tools()
        dst.make_tutor_db_connection()
        #####

        xml_fpath_list = glob.glob(vosource_xml_dirpath + '/*xml')
        # KLUDGE: This can potentially load a lot of xml-strings into memory:
        for xml_fpath in xml_fpath_list:
            fname = xml_fpath[xml_fpath.rfind('/') + 1:xml_fpath.rfind('.')]
            num = fname  # NOTE: the filename stands in for the source-id here.

            ##### Disabled debugging branch (flip `if 0:` to enable):
            if 0:
                import pdb
                pdb.set_trace()
                num_orig_str = str(int(num) - 100000000)
                if num_orig_str in skipped_deb_srcids:
                    #print num_orig_str
                    select_str = "select sources.source_id, sources.project_id, sources.source_name, sources.class_id, sources.pclass_id, project_classes.pclass_name, project_classes.pclass_short_name from Sources join project_classes using (pclass_id) where source_id = %s" % (
                        num_orig_str)
                    dst.cursor.execute(select_str)
                    results = dst.cursor.fetchall()

                    a = arffify.Maker(
                        search=[],
                        skip_class=False,
                        local_xmls=True,
                        convert_class_abrvs_to_names=False,
                        flag_retrieve_class_abrvs_from_TUTOR=False,
                        dorun=False)
                    out_dict = a.generate_arff_line_for_vosourcexml(
                        num=str(num), xml_fpath=xml_fpath)
                    print '!!!', results[0]
                else:
                    try:
                        a = arffify.Maker(
                            search=[],
                            skip_class=False,
                            local_xmls=True,
                            convert_class_abrvs_to_names=False,
                            flag_retrieve_class_abrvs_from_TUTOR=False,
                            dorun=False)
                        out_dict = a.generate_arff_line_for_vosourcexml(
                            num=str(num), xml_fpath=xml_fpath)
                    except Exception:
                        print "Failed to generate .arff line for xml:", xml_fpath


            if 1:
                exec_str = """out_dict = a.generate_arff_line_for_vosourcexml(num="%s", xml_fpath="%s")
                """ % (str(num), xml_fpath)
                taskid = None  # so the except clause below can reference it
                try:
                    taskid = self.tc.run(client.StringTask(exec_str,
                                                           pull='out_dict', retries=3))
                    self.task_id_list.append(taskid)
                except Exception:
                    print "EXCEPT!: taskid=", taskid, exec_str