def write_arff_using_Maker(self, master_features, all_class_list,
                           master_classes, master_list,
                           out_arff_fpath='',
                           n_sources_needed_for_class_inclusion=10):
    """ Use the arffify.py Maker to write a .arff file.
    """
    a = arffify.Maker(search=[], skip_class=False, local_xmls=True,
                      convert_class_abrvs_to_names=False,
                      flag_retrieve_class_abrvs_from_TUTOR=False,
                      dorun=False, add_srcid_to_arff=True)
    a.master_features = master_features
    a.all_class_list = all_class_list
    a.master_classes = master_classes
    a.master_list = master_list
    a.write_arff(outfile=out_arff_fpath,
                 remove_sparse_classes=True,
                 n_sources_needed_for_class_inclusion=n_sources_needed_for_class_inclusion)
    # (alternative kwargs once used here: classes_arff_str='', remove_sparse_classes=False)
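
# Hedged usage sketch (not part of the original module): how write_arff_using_Maker() might be
# driven once the master_* structures have been accumulated elsewhere.  `maker_owner` stands in
# for whatever object defines the method above; the feature tuples and class names are invented.
def _example_write_arff_using_maker(maker_owner):
    master_features = [('amplitude', 'float'), ('freq1_harmonics_freq_0', 'float')]  # assumed feature tuples
    all_class_list = ['RR Lyrae', 'W Ursae Majoris']
    master_classes = ['RR Lyrae', 'W Ursae Majoris']
    master_list = []  # per-source dicts, e.g. from Maker.generate_arff_line_for_vosourcexml()
    maker_owner.write_arff_using_Maker(master_features, all_class_list,
                                       master_classes, master_list,
                                       out_arff_fpath='/tmp/training.arff',
                                       n_sources_needed_for_class_inclusion=10)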
def __init__(self, class_schema_definition_dicts={}, class_abrv_lookup={},
             use_weka_jvm=True, training_arff_features_list=[]):
    self.class_schema_definition_dicts = class_schema_definition_dicts
    self.training_arff_features_list = training_arff_features_list
    self.arffmaker = arffify.Maker(search=[],
                                   skip_class=True, local_xmls=True, dorun=False,
                                   class_abrv_lookup=class_abrv_lookup)
    if use_weka_jvm:
        # TODO/NOTE: I think a WekaClassifier() class needs to be
        #     instantiated for each WEKA classification instance
        #     which uses a different .model and/or training .arff
        # We initialize a Java virtual machine for Weka classifications
        #try:
        if not jpype.isJVMStarted():
            # TODO / DEBUG: disable the next line for speed-ups once stable?
            _jvmArgs = ["-ea"]  # enable assertions
            _jvmArgs.append("-Djava.class.path=" + os.environ["CLASSPATH"])
            ###20091905 dstarr comments out:
            #_jvmArgs.append("-Xmx1000m")
            _jvmArgs.append("-Xmx12000m")  # 4000 & 5000m works, 3500m doesnt for some WEKA .models
            jpype.startJVM(jpype.getDefaultJVMPath(), *_jvmArgs)
        class_schema_name_list = self.class_schema_definition_dicts.keys()
        class_schema_name_list.remove('mlens3 MicroLens')
        class_schema_name_list.remove('Dovi SN')
        class_schema_name_list.remove('General')
        self.wc = {}
        for class_schema_name in class_schema_name_list:
            class_schema_dict = self.class_schema_definition_dicts[class_schema_name]
            weka_training_model_fpath = class_schema_dict['weka_training_model_fpath']
            weka_training_arff_fpath = class_schema_dict['weka_training_arff_fpath']
            self.wc[class_schema_name] = weka_classifier.WekaClassifier(
                weka_training_model_fpath, weka_training_arff_fpath)
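
# Hedged sketch (not in the original source): iterating over the per-schema classifiers built in
# __init__ above and shutting down the Weka JVM once classification work is finished.
# `classifier_collection` stands in for an instance of the owning class; no WekaClassifier
# method calls are shown because that wrapper's classify API is not documented here.
def _example_inspect_classifiers(classifier_collection):
    for schema_name, weka_clfr in classifier_collection.wc.items():
        print schema_name, weka_clfr   # one weka_classifier.WekaClassifier per science-class schema
    if jpype.isJVMStarted():
        jpype.shutdownJVM()            # release the JVM started in __init__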
if __name__ == '__main__':
    ### NOTE: this __main__ section will only be executed when the python script is called like:
    ###     python generate_arff_using_xml.py
    ### or:
    ###     ./generate_arff_using_xml.py

    xml_dirpath = "/home/dstarr/scratch/xml_list"
    out_arff_filepath = "/tmp/gen.arff"  # to be written

    filepaths = glob.glob("%s/*xml" % (xml_dirpath))

    vosource_list = []
    for num, fpath in enumerate(filepaths):
        ### NOTE: 'num' is used as a pseudo source-id for identification in the .arff file.
        vosource_list.append((str(num), fpath))  # NOTE: a tuple of this form is needed.

    a = arffify.Maker(search=[], skip_class=False, local_xmls=True,
                      convert_class_abrvs_to_names=False,
                      flag_retrieve_class_abrvs_from_TUTOR=False,
                      dorun=False, add_srcid_to_arff=True)
    # NOTE: an empty skip list means that all sources are added to the .arff,
    #       regardless of how ambiguous their classification is.
    a.pars['skip_sci_class_list'] = []
    a.populate_features_and_classes_using_local_xmls(srcid_xml_tuple_list=vosource_list)
    a.write_arff(outfile=out_arff_filepath,
                 remove_sparse_classes=False)
    # Alternatively: remove_sparse_classes=True, n_sources_needed_for_class_inclusion=10
    #     excludes a science class from the .arff unless it has at least that many example sources.
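
    # Hedged follow-up (not in the original script): a quick sanity check of the freshly written
    # .arff, counting attribute declarations and data rows.  It only assumes the standard ARFF
    # layout of '@ATTRIBUTE' lines followed by an '@DATA' section, with '%' marking comments.
    arff_lines = [l.strip() for l in open(out_arff_filepath)]
    n_attribs = len([l for l in arff_lines if l.lower().startswith('@attribute')])
    data_inds = [i for i, l in enumerate(arff_lines) if l.lower().startswith('@data')]
    if data_inds:
        n_rows = len([l for l in arff_lines[data_inds[0] + 1:] if l and not l.startswith('%')])
        print "Wrote %s: %d attributes, %d data rows" % (out_arff_filepath, n_attribs, n_rows)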
def generate_arff_using_raw_xml(xml_str):
    """ Generate an ARFF string containing features, given a raw VOSource XML string.
    """
    master_list = []
    master_features_dict = {}
    all_class_list = []
    master_classes_dict = {}
    new_srcid = 1
    include_arff_header = True

    ### Generate the features (stdout is silenced while the generators run):
    tmp_stdout = sys.stdout
    sys.stdout = open(os.devnull, 'w')
    signals_list = []
    gen = generators_importers.from_xml(signals_list)
    gen.generate(xml_handle=xml_str)
    gen.sig.add_features_to_xml_string(signals_list)
    gen.sig.x_sdict['src_id'] = new_srcid
    dbi_src = db_importer.Source(make_dict_if_given_xml=False)
    dbi_src.source_dict_to_xml(gen.sig.x_sdict)
    sys.stdout.close()
    sys.stdout = tmp_stdout

    xml_fpath = dbi_src.xml_string

    a = arffify.Maker(search=[], skip_class=False, local_xmls=True,
                      convert_class_abrvs_to_names=False,
                      flag_retrieve_class_abrvs_from_TUTOR=False, dorun=False)
    out_dict = a.generate_arff_line_for_vosourcexml(num=new_srcid, xml_fpath=xml_fpath)

    master_list.append(out_dict)
    all_class_list.append(out_dict['class'])
    master_classes_dict[out_dict['class']] = 0
    for feat_tup in out_dict['features']:
        master_features_dict[feat_tup] = 0  # just ensure the key exists; 0 is filler

    master_features = master_features_dict.keys()
    master_classes = master_classes_dict.keys()

    a = arffify.Maker(search=[], skip_class=True, local_xmls=True,
                      convert_class_abrvs_to_names=False,
                      flag_retrieve_class_abrvs_from_TUTOR=False,
                      dorun=False, add_srcid_to_arff=True)
    a.master_features = master_features
    a.all_class_list = all_class_list
    a.master_classes = master_classes
    a.master_list = master_list

    fp_out = cStringIO.StringIO()
    a.write_arff(outfile=fp_out,
                 remove_sparse_classes=True,
                 n_sources_needed_for_class_inclusion=1,
                 include_header=include_arff_header,
                 use_str_srcid=True)  #, classes_arff_str='', remove_sparse_classes=False
    arff_str = fp_out.getvalue()
    return arff_str
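
# Hedged usage sketch (not in the original module): feed a VOSource XML string read from disk
# through generate_arff_using_raw_xml() and write out the resulting single-source ARFF.
# The file paths used here are invented for illustration.
def _example_generate_arff_from_xml_file(xml_fpath='/tmp/100001234.xml',
                                         out_fpath='/tmp/single_source.arff'):
    xml_str = open(xml_fpath).read()
    arff_str = generate_arff_using_raw_xml(xml_str)
    fp = open(out_fpath, 'w')
    fp.write(arff_str)
    fp.close()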
def generate_arff_using_asasdat(self, data_fpaths=[], include_arff_header=False, arff_output_fp=None):
    """ Given a list of LINEAR data file filepaths, for each source/file:
    - choose the optimal aperture, depending upon median magnitude   <--- only for ASAS!!!
    - exclude bad/flagged epochs
    - generate features from the timeseries (placing them in an intermediate XML-string format)
    - collect the resulting features for all given sources and place them in an ARFF-style file,
      which will later be read by the ML training/classification code.

    Partially adapted from:
        TCP/Software/citris33/arff_generation_master_using_generic_ts_data.py:get_dat_arffstrs()
    """
    import tutor_database_project_insert
    adt = tutor_database_project_insert.ASAS_Data_Tools(pars=pars)
    adt.frame_limitmags = self.retrieve_limitmags_from_pkl()

    sys.path.append(os.environ.get('TCP_DIR') + '/Software/feature_extract/MLData')
    #sys.path.append(os.path.abspath(os.environ.get("TCP_DIR") + '/Software/feature_extract/Code/extractors'))
    #print os.environ.get("TCP_DIR")
    import arffify
    sys.path.append(os.path.abspath(os.environ.get("TCP_DIR") +
                                    'Software/feature_extract/Code'))
    import db_importer
    from data_cleaning import sigmaclip_sdict_ts
    sys.path.append(os.path.abspath(os.environ.get("TCP_DIR") +
                                    'Software/feature_extract'))
    from Code import generators_importers

    master_list = []
    master_features_dict = {}
    all_class_list = []
    master_classes_dict = {}
    for dat_fpath in data_fpaths:
        new_srcid = dat_fpath[dat_fpath.rfind('/') + 1:dat_fpath.rfind('.dat')]
        ts_str = open(dat_fpath).read()
        source_intermed_dict = adt.parse_asas_ts_data_str(ts_str)
        """mag_data_dict = adt.filter_best_ts_aperture(source_intermed_dict)
        """
        # Need to have a function like the (disabled) aperture filter above for LINEAR data.
        # NOTE: until it exists, mag_data_dict is undefined at this point.
        xml_str = self.form_xml_string(mag_data_dict)

        ### Generate the features:
        signals_list = []
        gen = generators_importers.from_xml(signals_list)
        gen.generate(xml_handle=xml_str)
        gen.sig.add_features_to_xml_string(signals_list)
        gen.sig.x_sdict['src_id'] = new_srcid
        dbi_src = db_importer.Source(make_dict_if_given_xml=False)
        dbi_src.source_dict_to_xml(gen.sig.x_sdict)
        xml_fpath = dbi_src.xml_string

        a = arffify.Maker(search=[], skip_class=False, local_xmls=True,
                          convert_class_abrvs_to_names=False,
                          flag_retrieve_class_abrvs_from_TUTOR=False, dorun=False)
        out_dict = a.generate_arff_line_for_vosourcexml(num=new_srcid, xml_fpath=xml_fpath)

        master_list.append(out_dict)
        all_class_list.append(out_dict['class'])
        master_classes_dict[out_dict['class']] = 0
        for feat_tup in out_dict['features']:
            master_features_dict[feat_tup] = 0  # just ensure the key exists; 0 is filler

    master_features = master_features_dict.keys()
    master_classes = master_classes_dict.keys()

    a = arffify.Maker(search=[], skip_class=False, local_xmls=True,
                      convert_class_abrvs_to_names=False,
                      flag_retrieve_class_abrvs_from_TUTOR=False,
                      dorun=False, add_srcid_to_arff=True)
    a.master_features = master_features
    a.all_class_list = all_class_list
    a.master_classes = master_classes
    a.master_list = master_list
    a.write_arff(outfile=arff_output_fp,
                 remove_sparse_classes=True,
                 n_sources_needed_for_class_inclusion=1,
                 include_header=include_arff_header,
                 use_str_srcid=True)  #, classes_arff_str='', remove_sparse_classes=False
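
# Hedged usage sketch (not in the original module): collect .dat filepaths and write a combined
# ARFF via the method above.  `feature_gen` stands in for an instance of the owning class; the
# directory and output paths are invented.
def _example_generate_arff_from_dat_dir(feature_gen, dat_dirpath='/tmp/linear_dat'):
    import glob
    data_fpaths = glob.glob('%s/*.dat' % (dat_dirpath))
    arff_fp = open('/tmp/linear_features.arff', 'w')
    feature_gen.generate_arff_using_asasdat(data_fpaths=data_fpaths,
                                            include_arff_header=True,
                                            arff_output_fp=arff_fp)
    arff_fp.close()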
class StarVars_LINEAR_Feature_Generation:
    """ Generate TCP features for StarVars / LINEAR timeseries data and collect them
    into .arff format.
    """
    def __init__(self, pars={}):
        self.head_str = """<?xml version="1.0"?>
<VOSOURCE version="0.04">
  <COOSYS ID="J2000" equinox="J2000." epoch="J2000." system="eq_FK5"/>
  <history>
    <created datetime="2009-12-02 20:56:18.880560" codebase="db_importer.pyc" codebase_version="9-Aug-2007"/>
  </history>
  <ID>6930531</ID>
  <WhereWhen>
    <Description>Best positional information of the source</Description>
    <Position2D unit="deg">
      <Value2>
        <c1>323.47114731</c1>
        <c2>-0.79916734036</c2>
      </Value2>
      <Error2>
        <c1>0.000277777777778</c1>
        <c2>0.000277777777778</c2>
      </Error2>
    </Position2D>
  </WhereWhen>
  <VOTimeseries version="0.04">
    <TIMESYS>
      <TimeType ucd="frame.time.system?">MJD</TimeType>
      <TimeZero ucd="frame.time.zero">0.0 </TimeZero>
      <TimeSystem ucd="frame.time.scale">UTC</TimeSystem>
      <TimeRefPos ucd="pos;frame.time">TOPOCENTER</TimeRefPos>
    </TIMESYS>
    <Resource name="db photometry">
      <TABLE name="v">
        <FIELD name="t" ID="col1" system="TIMESYS" datatype="float" unit="day"/>
        <FIELD name="m" ID="col2" ucd="phot.mag;em.opt.v" datatype="float" unit="mag"/>
        <FIELD name="m_err" ID="col3" ucd="stat.error;phot.mag;em.opt.v" datatype="float" unit="mag"/>
        <DATA>
          <TABLEDATA>
"""
        self.tail_str = """          </TABLEDATA>
        </DATA>
      </TABLE>
    </Resource>
  </VOTimeseries>
</VOSOURCE>"""
        self.pars = pars

    def write_limitmags_into_pkl(self, frame_limitmags):
        """ Write the adt.frame_limitmags dictionary into a gzipped Pickle file.
        The dictionary was originally retrieved from MySQL via
        adt.retrieve_fullcat_frame_limitmags().
        """
        import cPickle
        import gzip
        ### This is just for writing the pickle file:
        fp = gzip.open(self.pars['limitmags_pkl_gz_fpath'], 'w')
        cPickle.dump(frame_limitmags, fp, 1)  # 1 means binary pkl used
        fp.close()

    def retrieve_limitmags_from_pkl(self):
        """ Parse the adt.frame_limitmags dictionary out of the gzipped Pickle file,
        which was originally retrieved from MySQL via
        adt.retrieve_fullcat_frame_limitmags().
        """
        import cPickle
        import gzip
        fp = gzip.open(self.pars['limitmags_pkl_gz_fpath'], 'rb')
        frame_limitmags = cPickle.load(fp)
        fp.close()
        return frame_limitmags

    def form_xml_string(self, mag_data_dict):
        """ Take timeseries dict data and place it into the VOSource XML format,
        which the TCP feature generation code expects.

        Adapted from: TCP/Software/feature_extract/format_csv_getfeats.py
        """
        data_str_list = []
        for i, t in enumerate(mag_data_dict['t']):
            m = mag_data_dict['m'][i]
            m_err = mag_data_dict['merr'][i]
            data_str = '          <TR row="%d"><TD>%lf</TD><TD>%lf</TD><TD>%lf</TD></TR>' % \
                       (i, t, m, m_err)
            data_str_list.append(data_str)
        all_data_str = '\n'.join(data_str_list)
        out_xml = self.head_str + all_data_str + self.tail_str
        return out_xml

    def generate_arff_using_asasdat(self, xml_data=[], include_arff_header=False, arff_output_fp=None):
        """ Given a list of XML timeseries entries for LINEAR sources, for each source:
        - choose the optimal aperture, depending upon median magnitude   <--- only for ASAS!!!
        - exclude bad/flagged epochs
        - generate features from the timeseries (placing them in an intermediate XML-string format)
        - collect the resulting features for all given sources and place them in an ARFF-style
          file, which will later be read by the ML training/classification code.

        Partially adapted from:
            TCP/Software/citris33/arff_generation_master_using_generic_ts_data.py:get_dat_arffstrs()
        """
        import tutor_database_project_insert
        adt = tutor_database_project_insert.ASAS_Data_Tools(pars=pars)
        # adt.frame_limitmags = self.retrieve_limitmags_from_pkl()

        sys.path.append(os.environ.get('TCP_DIR') + '/Software/feature_extract/MLData')
        #sys.path.append(os.path.abspath(os.environ.get("TCP_DIR") + '/Software/feature_extract/Code/extractors'))
        #print os.environ.get("TCP_DIR")
        import arffify
        sys.path.append(os.path.abspath(os.environ.get("TCP_DIR") +
                                        'Software/feature_extract/Code'))
        import db_importer
        from data_cleaning import sigmaclip_sdict_ts
        sys.path.append(os.path.abspath(os.environ.get("TCP_DIR") +
                                        'Software/feature_extract'))
        from Code import generators_importers

        master_list = []
        master_features_dict = {}
        all_class_list = []
        master_classes_dict = {}
        for xml_str in xml_data:
            # NOTE: each element of xml_data is expected to carry an 'ID' and to be usable
            #       as an XML handle by generators_importers.from_xml().
            new_srcid = xml_str['ID']
            # ts_str = open(dat_fpath).read()
            # source_intermed_dict = adt.parse_asas_ts_data_str(ts_str)
            # """mag_data_dict = adt.filter_best_ts_aperture(source_intermed_dict)
            # """
            # Need to have a function like this for LINEAR data:
            # xml_str = self.form_xml_string(mag_data_dict)

            ### Generate the features:
            signals_list = []
            gen = generators_importers.from_xml(signals_list)
            gen.generate(xml_handle=xml_str)
            gen.sig.add_features_to_xml_string(signals_list)
            gen.sig.x_sdict['src_id'] = new_srcid
            dbi_src = db_importer.Source(make_dict_if_given_xml=False)
            dbi_src.source_dict_to_xml(gen.sig.x_sdict)
            xml_fpath = dbi_src.xml_string

            a = arffify.Maker(search=[], skip_class=False, local_xmls=True,
                              convert_class_abrvs_to_names=False,
                              flag_retrieve_class_abrvs_from_TUTOR=False, dorun=False)
            out_dict = a.generate_arff_line_for_vosourcexml(num=new_srcid, xml_fpath=xml_fpath)

            master_list.append(out_dict)
            all_class_list.append(out_dict['class'])
            master_classes_dict[out_dict['class']] = 0
            for feat_tup in out_dict['features']:
                master_features_dict[feat_tup] = 0  # just ensure the key exists; 0 is filler

        master_features = master_features_dict.keys()
        master_classes = master_classes_dict.keys()

        a = arffify.Maker(search=[], skip_class=False, local_xmls=True,
                          convert_class_abrvs_to_names=False,
                          flag_retrieve_class_abrvs_from_TUTOR=False,
                          dorun=False, add_srcid_to_arff=True)
        a.master_features = master_features
        a.all_class_list = all_class_list
        a.master_classes = master_classes
        a.master_list = master_list
        a.write_arff(outfile=arff_output_fp,
                     remove_sparse_classes=True,
                     n_sources_needed_for_class_inclusion=1,
                     include_header=include_arff_header,
                     use_str_srcid=True)  #, classes_arff_str='', remove_sparse_classes=False
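
# Hedged usage sketch (not in the original module): build a VOSource XML string from a small
# timeseries dict via form_xml_string().  The pars value and the magnitudes are invented;
# 'limitmags_pkl_gz_fpath' only matters if the pickle helpers above are used.
def _example_form_xml_string():
    sv = StarVars_LINEAR_Feature_Generation(pars={'limitmags_pkl_gz_fpath': '/tmp/limitmags.pkl.gz'})
    mag_data_dict = {'t': [53000.1, 53000.2, 53000.3],
                     'm': [14.21, 14.35, 14.19],
                     'merr': [0.02, 0.03, 0.02]}
    return sv.form_xml_string(mag_data_dict)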
def spawn_off_arff_line_tasks(self, vosource_xml_dirpath):
    """ Spawn off ipython task-client tasks which take vosource.xml fpaths and generate the
    feature/class structures used to create .arff lines.  The task results should be 'pulled'
    and then inserted into a final Weka .arff file.
    """
    ##### For testing:
    skipped_deb_srcids = [
        '12645', '12646', '12649', '12653', '12655', '12656', '12658', '12660', '12670', '12675',
        '12700', '12745', '12766', '12797', '12798', '12806', '12841', '12847', '12849', '12850',
        '12851', '12852', '12853', '12854', '12856', '12858', '12861', '12864', '12868', '12869',
        '12870', '12875', '12879', '12882', '12885', '12886', '12888', '12890', '12891', '12893',
        '12895', '12901', '12904', '12907', '12909', '12914', '12915', '12921', '12923', '12924',
        '12928', '12930', '12932', '12933', '12934', '12936', '12941', '12948', '12950', '12957',
        '12958', '12960', '12961', '12970', '13007', '13024', '13034', '13059', '13076', '13078',
        '13091', '13094', '13119', '13122', '13128', '13156', '13170', '13172', '13239', '13242',
        '13246', '13247', '13261', '13268', '13280', '13324', '13333', '13354', '13360', '13362',
        '13369', '13374', '13402', '13418', '13420', '13421', '13423', '13424', '13425', '13427',
        '13429', '13432', '13433', '13439', '13440', '13442', '13443', '13444', '13448', '13458',
        '13462', '13465', '13466', '13469', '13471', '13476', '13477', '13478', '13480', '13481',
        '13483', '13484', '13491', '13493', '13495', '13500', '13502', '13505', '13511', '13519',
        '13520', '13521', '13530', '13535', '13543', '13544', '13552', '13553', '13560', '13561',
        '13564', '13565', '13571', '13573', '13577', '13580', '13582', '13591', '13594', '13596',
        '13602', '13607', '13608', '13616', '13618', '13622', '13623', '13625', '13630', '13632',
        '13638', '13642', '13646', '13647', '13650', '13656', '13657', '13668', '13676', '13678',
        '13680', '13686', '13687', '13689', '13690', '13692', '13694', '13695', '13698', '13701',
        '13703', '13704', '13708', '13712', '13716', '13717', '13718', '13719', '13722', '13723',
        '13731', '13733', '13739', '13740', '13743', '13744', '13747', '13748', '13750', '13760',
        '13763', '13774', '13776', '13777', '13780', '13782', '13783', '13784', '13786', '13788',
        '13793', '13800', '13804', '13806', '13810', '13814', '13815', '13819', '13824', '13826',
        '13832', '13833', '13838', '13843', '13847', '13851', '13854', '13858', '13860', '13869',
        '13873', '13881', '13882', '13885', '13888', '13889', '13890', '13892', '13893', '13894',
        '13896', '13898', '13900', '13906', '13911', '13922', '13927', '13928', '13929', '13936',
        '13938', '13942', '13944', '13951', '13955', '13957', '13958', '13959', '13962', '13965',
        '13972', '13974', '13988', '13989', '13996', '13997', '13998', '14004', '14006', '14009',
        '14010', '14017', '14018', '14024', '14025', '14028', '14029', '14032', '14035', '14043',
        '14047', '14048', '14051', '14055', '14056', '14065', '14066', '14070', '14071', '14072',
        '14087', '14088', '14089', '14093', '14095', '14104', '14108', '14109', '14113', '14117',
        '14120', '14122', '14125', '14129', '14133', '14136', '14137', '14151', '14155', '14157',
        '14163', '14166', '14167', '14168', '14174', '14175', '14181', '14182', '14186', '14191',
        '14194', '14198', '14205', '14206', '14216', '14218', '14219', '14225', '14226', '14234',
        '14239', '14243', '14244', '14246', '14247', '14248', '14250', '14251', '14255', '14256',
        '14263', '14269', '14275', '14280', '14282']

    import dotastro_sciclass_tools
    dst = dotastro_sciclass_tools.Dotastro_Sciclass_Tools()
    dst.make_tutor_db_connection()
    #####

    xml_fpath_list = glob.glob(vosource_xml_dirpath + '/*xml')

    # KLUDGE: This can potentially load a lot of xml-strings into memory:
    for xml_fpath in xml_fpath_list:
        fname = xml_fpath[xml_fpath.rfind('/') + 1:xml_fpath.rfind('.')]
        num = fname  # Seems OK: can the filename be used here rather than the source-id?
        # xml_fname[:xml_fname.rfind('.')]
        #srcid_xml_tuple_list.append((num, xml_fpath))
        #task_str = """cat = os.getpid()"""
        #taskid = self.tc.run(client.StringTask(task_str, pull="cat"))
        #time.sleep(1)
        #print self.tc.get_task_result(taskid, block=False).results
        #print 'yo'

        ##### For testing:
        #if "100017522.xml" in xml_fpath:
        #    print "yo"
        if 0:
            import pdb
            pdb.set_trace()
            print
        num_orig_str = str(int(num) - 100000000)
        if num_orig_str in skipped_deb_srcids:
            #print num_orig_str
            select_str = ("select sources.source_id, sources.project_id, sources.source_name, "
                          "sources.class_id, sources.pclass_id, project_classes.pclass_name, "
                          "project_classes.pclass_short_name "
                          "from Sources join project_classes using (pclass_id) "
                          "where source_id = %s" % (num_orig_str))
            dst.cursor.execute(select_str)
            results = dst.cursor.fetchall()
            a = arffify.Maker(search=[], skip_class=False, local_xmls=True,
                              convert_class_abrvs_to_names=False,
                              flag_retrieve_class_abrvs_from_TUTOR=False, dorun=False)
            out_dict = a.generate_arff_line_for_vosourcexml(num=str(num), xml_fpath=xml_fpath)
            print '!!!', results[0]
        else:
            try:
                a = arffify.Maker(search=[], skip_class=False, local_xmls=True,
                                  convert_class_abrvs_to_names=False,
                                  flag_retrieve_class_abrvs_from_TUTOR=False, dorun=False)
                out_dict = a.generate_arff_line_for_vosourcexml(num=str(num), xml_fpath=xml_fpath)
            except:
                print "barf on some xml:", xml_fpath
                #print xml_fpath
                #continue
        #####
        if 1:
            exec_str = """out_dict = a.generate_arff_line_for_vosourcexml(num="%s", xml_fpath="%s")
""" % (str(num), xml_fpath)
            #print exec_str
            try:
                taskid = self.tc.run(client.StringTask(exec_str,
                                                       pull='out_dict', retries=3))
                self.task_id_list.append(taskid)
            except:
                print "EXCEPT!: taskid=", taskid, exec_str
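
# Hedged follow-up sketch (not in the original method): pull the queued task results back and
# accumulate the out_dicts for the final .arff, mirroring the commented-out get_task_result()
# call above.  `self_obj` stands in for the instance that ran spawn_off_arff_line_tasks(); it is
# assumed that the returned TaskResult's .results behaves like a dict keyed by the pulled names.
def _example_collect_arff_line_results(self_obj):
    master_list = []
    for taskid in self_obj.task_id_list:
        task_result = self_obj.tc.get_task_result(taskid, block=True)  # blocking pull (assumption)
        out_dict = task_result.results.get('out_dict', None)
        if out_dict is not None:
            master_list.append(out_dict)
    return master_list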