class ExampleInstanceExamer(object):
    """Sanity-checks an ExampleInstance against a FeatureSet.

    ``exam()`` returns the string ``"Good"`` when every factor is
    consistent, otherwise an error string describing the first problem
    found.  Missing words/groundings only emit warnings to stdout.
    """

    def __init__(self):
        # Both are populated by load().
        self.example_instance = None
        self.feature_set = None

    def load(self, example_file, feature_set_file):
        """Parse the example instance and the feature set from XML files.

        Args:
            example_file: path to the ExampleInstance XML document.
            feature_set_file: path to the FeatureSet XML document.
        """
        self.example_instance = ExampleInstance()
        self.example_instance.from_xml_file(example_file)
        self.feature_set = FeatureSet()
        self.feature_set.from_xml_file(feature_set_file)

    def exam(self):
        """Validate every factor of the loaded example instance.

        Returns:
            "Good" when all factors pass, otherwise an error string from
            ErrorTypes describing the first failing factor/grounding.
        """
        factor_list = self.example_instance.get_factors()
        for factor in factor_list:
            # Check words in factor: every word must have a word feature.
            if not factor.words:
                print(WarningTypes["NO_WORD"] + "\n " + str(factor) + "\n")
            for w in factor.words:
                if not self.feature_set.has_word_feature(w):
                    return ErrorTypes["NO_WORD_FEATURE"] + "\n " + str(factor) + "\n " + str(w) + "\n"
            # Check groundings in factor: referenced objects must exist in the world.
            if not factor.groundings.groundings:
                print(WarningTypes["NO_GROUNDING"] + "\n " + str(factor) + "\n")
            for g in factor.groundings.groundings:
                if g.grounding_type == "object":
                    if not self.example_instance.world.has_object(g.type, g.name):
                        return ErrorTypes["NO_OBJECT_IN_WORLD"] + "\n " + str(factor) + "\n " + str(g) + "\n"
                    # Object-type feature check intentionally disabled in the original:
                    # elif not self.feature_set.has_object_type_feature(g): ...
                elif g.grounding_type == "constraint":
                    # A constraint needs both endpoints, and both must exist in the world.
                    # (The original repeated the parent/child checks twice; the second
                    # copies were unreachable dead code and are removed here.)
                    if g.parent is None or g.child is None:
                        return ErrorTypes["NO_OBJECT"] + "\n " + str(factor) + "\n " + str(g) + "\n"
                    elif not self.example_instance.world.has_object(g.parent.type, g.parent.name):
                        return ErrorTypes["NO_OBJECT_IN_WORLD"] + "\n [parent] " + str(factor) + "\n " + str(g) + "\n"
                    elif not self.example_instance.world.has_object(g.child.type, g.child.name):
                        return ErrorTypes["NO_OBJECT_IN_WORLD"] + "\n [child] " + str(factor) + "\n " + str(g) + "\n"
                elif g.grounding_type == "func_kernel":
                    if g.object is None:
                        return ErrorTypes["NO_OBJECT"] + "\n " + str(factor) + "\n " + str(g) + "\n"
                    # NOTE(review): the message labels this "[parent]" although the
                    # failing object is g.object — looks like a copy-paste; message
                    # kept byte-identical to the original.
                    elif not self.example_instance.world.has_object(g.object.type, g.object.name):
                        return ErrorTypes["NO_OBJECT_IN_WORLD"] + "\n [parent] " + str(factor) + "\n " + str(g) + "\n"
        return "Good"
def runFull():
    """Fit a model on the query point's nearest neighbours and print predictions.

    Searches for the 2000 nearest users of a fixed query point, extracts a
    training feature set from them, fits a Model, then runs it on the test
    data and prints the result.
    """
    # 2000 nearest neighbours of the fixed query point in feature space.
    nbrUserids = KNNSearch.Search([1.0, -1.2, 1.0, 7.79], 2000)
    split = FeatureSet.featureExtract(nbrUserids)

    featureSet = split[0][0]  # training feature rows
    interested = split[0][1]  # training targets
    # split[0][2] ("not interested") is unused by this training path.

    n_features = len(featureSet[0])
    # presumably per-feature masks consumed by Model — TODO confirm semantics
    z = [True] * n_features
    w = [True] * n_features
    C = 0.03  # regularisation strength (0.3 was also tried in the original)

    model = Model(compress=z, has_none=w, C=C)
    model.fit(featureSet, interested)

    testData = getTestData()
    result = runModel(model, testData)
    print(result)
def getTestData():
    """Build per-user test feature matrices from ../Input/train2.csv.

    Returns:
        dict mapping user id -> {'X': list of feature rows (one per event,
        in event order), 'events': list of {'eid', 'invited'} dicts}.
    """
    test = pd.read_csv("../Input/train2.csv")

    # Group the (event, invited) records by user.
    testDict = {}
    for _, record in test.iterrows():
        uid = record['user']
        testDict.setdefault(uid, []).append({
            'eid': record['event'],
            'invited': record['invited'],
        })

    testData = {}
    # .items() instead of the Python-2-only .iteritems().
    for uid, events in testDict.items():
        eDict = {e['eid']: e['invited'] for e in events}
        features_dict = FeatureSet.process(uid, eDict)
        # Keep feature rows aligned with the order of `events`.
        X = [features_dict[e['eid']] for e in events]
        testData[uid] = {
            'X': X,
            'events': events
        }
    return testData
def Features(self, mode=None, filterExpression=None):
    """Return an iterator (FeatureSet) over this feature class's features.

    Args:
        mode: how to open the cursor.
            OpenMode.ReadOnly  -> pGP.SearchCursor (default when None)
            OpenMode.WriteOnly -> pGP.InsertCursor
            OpenMode.ReadWrite -> pGP.UpdateCursor
        filterExpression: optional record filter; see ESRI's WhereClause
            documentation for the syntax.

    Returns:
        The new current FeatureSet; an Invalid FeatureSet when this
        feature class is not valid.
    """
    if not self.isValid_:
        return FeatureSet(None, OpenMode.Invalid, self)
    if self.currentFeatureSet_:
        # Only one live cursor at a time: invalidate the previous one.
        self.currentFeatureSet_.SetInvalid()
    if mode is None:
        mode = OpenMode.ReadOnly

    # Single dispatch point instead of three duplicated branches.
    if mode == OpenMode.ReadOnly:
        cursor_factory = pGP.SearchCursor
    elif mode == OpenMode.WriteOnly:
        cursor_factory = pGP.InsertCursor
    elif mode == OpenMode.ReadWrite:
        cursor_factory = pGP.UpdateCursor
    else:
        cursor_factory = None  # unknown mode: leave current set untouched

    if cursor_factory is not None:
        if filterExpression:
            rows = cursor_factory(self.path_, filterExpression)
        else:
            rows = cursor_factory(self.path_)
        self.currentFeatureSet_ = FeatureSet(rows, mode, self)
    return self.currentFeatureSet_
class FeatureClass:
    """Thin wrapper around a geodatabase feature class accessed via `pGP`.

    Caches descriptive properties (id/shape field names, shape type,
    spatial reference, record count) at construction time and hands out
    FeatureSet cursors over the underlying rows.
    """

    def __init__(self, featclsName):
        self._Reset()
        self.path_ = featclsName
        if pGP.Exists(featclsName):
            desc = pGP.Describe(featclsName)
            fcprops = get_featureclass_properties(featclsName)
            # Only genuine feature classes are considered valid.
            if desc.DatasetType.lower() != "featureclass":
                return
            self.isValid_ = True
            self.idFieldName_ = fcprops.sFID
            self.shapeFieldName_ = fcprops.sShapeField
            self.shapeType_ = fcprops.sShapeType
            self.spatialRef_ = fcprops.pSpatialRef
            self.dir_ = os.path.split(self.path_)[0]
            self.baseName_ = os.path.split(self.path_)[1]
            self.count_ = pGP.GetCount_management(self.path_)
            self.desc_ = desc

    def _Reset(self):
        """Clear all cached state and mark the instance invalid."""
        self.isValid_ = False
        self.dir_ = None
        self.baseName_ = None
        self.count_ = None
        self.idFieldName_ = None
        self.shapeFieldName_ = None
        self.shapeType_ = None
        self.spatialRef_ = None
        self.desc_ = None
        self.currentFeatureSet_ = None

    def Directory(self):
        """Directory part of the feature class path."""
        return self.dir_

    def BaseName(self):
        """File/base-name part of the feature class path."""
        return self.baseName_

    def ShapeTypeName(self):
        """Geometry type name cached at construction."""
        return self.shapeType_

    def SetShapeTypeName(self, typeName):
        """Set the shape type; refused (returns False) on a valid instance."""
        if self.isValid_:
            return False
        self.shapeType_ = typeName
        return True

    def ShapeFieldName(self):
        """Name of the geometry field."""
        return self.shapeFieldName_

    def IDFieldName(self):
        """Name of the object-ID field."""
        return self.idFieldName_

    def SpatialRef(self):
        """Cached spatial reference object."""
        return self.spatialRef_

    def SetSpatialRef(self, spatialRef):
        self.spatialRef_ = spatialRef

    def ArcGISDescription(self):
        """Raw pGP.Describe result captured at construction."""
        return self.desc_

    def AddField(self, field_name, data_type, *args):
        """Add a field; returns the validated field name actually used."""
        validFieldName = pGP.ValidateFieldName(field_name)
        pGP.AddField_management(
            self.path_, validFieldName, data_type, *args
        )
        return validFieldName

    def Features(self, mode=None, filterExpression=None):
        """Return an iterator (FeatureSet) over this feature class's features.

        Args:
            mode: how to open the cursor.
                OpenMode.ReadOnly  -> pGP.SearchCursor (default when None)
                OpenMode.WriteOnly -> pGP.InsertCursor
                OpenMode.ReadWrite -> pGP.UpdateCursor
            filterExpression: optional record filter; see ESRI's WhereClause
                documentation for the syntax.

        Returns:
            The new current FeatureSet; an Invalid FeatureSet when this
            feature class is not valid.
        """
        if not self.isValid_:
            return FeatureSet(None, OpenMode.Invalid, self)
        if self.currentFeatureSet_:
            # Only one live cursor at a time: invalidate the previous one.
            self.currentFeatureSet_.SetInvalid()
        if mode is None:
            mode = OpenMode.ReadOnly

        # Single dispatch point instead of three duplicated branches.
        if mode == OpenMode.ReadOnly:
            cursor_factory = pGP.SearchCursor
        elif mode == OpenMode.WriteOnly:
            cursor_factory = pGP.InsertCursor
        elif mode == OpenMode.ReadWrite:
            cursor_factory = pGP.UpdateCursor
        else:
            cursor_factory = None  # unknown mode: leave current set untouched

        if cursor_factory is not None:
            if filterExpression:
                rows = cursor_factory(self.path_, filterExpression)
            else:
                rows = cursor_factory(self.path_)
            self.currentFeatureSet_ = FeatureSet(rows, mode, self)
        return self.currentFeatureSet_

    def Flush(self):
        """Flush the current FeatureSet, if any."""
        if self.currentFeatureSet_:
            self.currentFeatureSet_.Flush()

    def Count(self):
        """Cached record count; None when the instance is invalid."""
        if self.isValid_:
            return self.count_
'''
Created on Oct 9, 2015

@author: ydq
'''
from FeatureSet import *

if __name__ == '__main__':
    # Smoke test: parse a feature set from XML and print it.
    FILE = "feature_set.xml"
    feature_set = FeatureSet()
    feature_set.from_xml_file(FILE)
    print(feature_set)
def generate_synthetic_data(data_path, seq_lengths, samples_per_class,
                            num_files, sequences_type="default",
                            features_set="default", kwargs="default"):
    """Generate synthetic time series data with our simple, unique method.

    Parameters:
        data_path : str
            Path where the generated data will be saved.
        seq_lengths : list
            All time series sequence lengths to generate, e.g. [50, 100]
            generates data with 50 and 100 sequence length.
        samples_per_class : int
            How many instances to generate per class.
        num_files : int
            How many files to create. E.g. with 10 sequences, 10 samples
            per class and 2 files we get 2 * (10 * 10) synthetic samples.
            Splitting into files avoids memory issues; each file holds
            ~500K samples.
        sequences_type : list or str, default = "default"
            Which sequence types to create, e.g. ["Up", "HighPeak",
            "SinWave"]. "default" takes all possible sequences (recommended).
        features_set : list or str, default = "default"
            Same as sequences_type but for which features are used as the
            training target (the "y"), e.g. ["Max", "Min", "Peaks"].
            "default" takes all possible features (recommended).
        kwargs : unused
            NOTE(review): this parameter is immediately shadowed by
            get_kwargs() inside the loop; kept only for interface
            compatibility.

    Returns
    -------
    None, cause all files will be written to disk.
    """
    path = data_path
    num_samples = samples_per_class
    num_times = num_files
    all_sequences_type = sequences.get_all_sequences(
    ) if sequences_type == "default" else sequences_type
    feature_set = fs.get_all_features(
    ) if features_set == "default" else features_set
    for i in range(num_times):
        for seq in seq_lengths:
            # Ensure <path>/<seq>/<num_samples>/ exists.
            if not os.path.exists(path + str(seq)):
                os.makedirs(path + str(seq))
            if not os.path.exists(path + "{}/{}".format(seq, num_samples)):
                os.makedirs(path + "{}/{}".format(seq, num_samples))
            y = []
            for sequence_type in all_sequences_type:
                kwargs = get_kwargs(sequence_type, num_samples, seq)
                # getattr dispatch instead of eval() on a format-built
                # string — same behavior, no code-injection surface.
                generator = getattr(sequences, sequence_type)(**kwargs)
                y.append(generator.generate_data())
            # Stack all per-type sample blocks into one 2-D array.
            y = np.array([np.array(yi) for yi in y])
            y = y.reshape(len(y) * y[0].shape[0], y[0].shape[1])
            file_name = path + "{}/{}/{}_part{}_x_test.gz".format(
                seq, num_samples, seq, i)
            dump(y, file_name)
            y = fs.create_regression_tastks_no_multi(y, feature_set)
            file_name = path + "{}/{}/{}_part{}_y_test.gz".format(
                seq, num_samples, seq, i)
            dump(y, file_name)
            y = None  # release the big array before the next iteration
def load(self, example_file, feature_set_file):
    """Load and parse the example instance and the feature set.

    Args:
        example_file: path to the ExampleInstance XML document.
        feature_set_file: path to the FeatureSet XML document.
    """
    # Attach each freshly-built object to self before parsing, matching
    # the original assignment order exactly.
    instance = ExampleInstance()
    self.example_instance = instance
    instance.from_xml_file(example_file)

    features = FeatureSet()
    self.feature_set = features
    features.from_xml_file(feature_set_file)