def _check_entry_product(product, product_type):
    """Instantiate the right product class for a raw product dict."""
    if product_type == 'specialities':
        return Speciality(**product)
    elif product_type == 'substances':
        return Substance(**product)
    elif product_type == 'associations':
        return Association(**product)
    else:
        raise ValueError("Unknown product type: %r" % product_type)
def regles_asso(self):
    liste_regles = []
    #if the itemset has size 1, return an empty list
    if len(self) != 1:
        for item in self:
            antecedent = Itemset(self - Itemset([item]))
            consequent = Itemset([item])
            asso = Association(antecedent, consequent)
            liste_regles.append(asso)
    return liste_regles
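# A minimal, self-contained sketch of what regles_asso produces: one rule per
# item, with that item as the consequent and the rest of the itemset as the
# antecedent. The Itemset and Association classes below are assumed stand-ins
# for illustration only, not the project's real implementations.
from collections import namedtuple

Association = namedtuple('Association', ['antecedent', 'consequent'])

class Itemset(frozenset):
    """Toy itemset: a frozenset that can enumerate its one-consequent rules."""
    def regles_asso(self):
        liste_regles = []
        if len(self) != 1:
            for item in self:
                antecedent = Itemset(self - Itemset([item]))
                consequent = Itemset([item])
                liste_regles.append(Association(antecedent, consequent))
        return liste_regles

# e.g. {A, B, C} yields {B, C} -> {A}, {A, C} -> {B}, {A, B} -> {C}
for rule in Itemset(['A', 'B', 'C']).regles_asso():
    print(sorted(rule.antecedent), '->', sorted(rule.consequent))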
class Journal(object):
    """
    Main Moments module for collecting Moment entries in one place

    *2011.06.26 13:50:20
    not sure that it even makes sense to base this on a standard list
    should use a list internally
    but we really don't use any of the methods for a list
    to interact with a Journal object

    so we could have
    self._entries
    self._tags
    self._dates
    to store everything internally
    and then use custom methods for interacting with the Journal

    these methods should be the same whether the Journal is a local,
    native instance, or if it is remote.

    i.e. using journal object attributes directly in code is discouraged
    to ensure that local and remote journal objects work identically
    """
    def __init__(self, path=None, items=[], title=None, debug=False):
        self._entries = []

        #keys are compact date string
        self._dates = Association()

        #keys are tag name
        self._tags = Association()

        # renamed self.name to self.path
        # then use name as a general name for the journal, if displayed
        # *2009.11.07 10:28:44
        # using title instead of name, then if anything else is still
        # using name the old way, it will flag a more recognizable error
        self.title = title

        self.debug = debug

        #*2011.06.26 13:45:20
        #really no such thing as a default path (self.path)
        # used for default file path:
        # convert to string just in case a Path object is sent
        #self.path = str(path)
        #if we want to store it to a path, then should specify that
        #in a to_file / save call

        #otherwise should keep track of all paths loaded
        #so that we can reload them later.
        self.loaded = []

        #or ...
        #could index based on original source too
        #this would allow them to be re-saved to their original source
        self._sources = Association()

        if path:
            self.load(path)
        if items:
            self.update_many(items)

    #*2011.06.26 13:53:03
    #should there be a to_file
    #and also a to_original_files (or something like that)?
    #could loop through all of self.loaded and store changes to those
    #entries affected in those sources

    #aka to_file
    def save(self, filename=None, order='original', include_path=False):
        """
        >>> from entry import Entry
        >>> j = Journal("sample_log.txt")
        >>> e = Entry("test entry")
        >>> j.update(e)
        >>> j.save("sample_log2.txt")
        >>> k = Journal()
        >>> k.load("sample_log2.txt")
        2
        >>> len(k.entries())
        2
        """
        if filename:
            self.path = str(filename)
        if hasattr(self, "path") and self.path:
            l = Log(self.path)
        else:
            print "No name to save Journal to"
            exit()
        #l.from_journal(self, holder, entry)
        l.from_entries(self.sort(order=order), include_path=include_path)
        l.to_file()
        l.close()

    def save_originals(self):
        """
        loop through all self._sources or self.loaded
        and save the corresponding entries back
        (only if there is an actual change???)

        might want to return any entries that don't have a destination
        or would it be better to return an error?
        or not save if entries don't have a destination
        """
        pass

    def save_instance(self, instance_file):
        """
        save the currently loaded sources to an instance file
        """
        pass

    def load_instance(self, instance_name, instance_file):
        """
        load the first entry tagged instance_name from the instance file
        """
        pass

    #aka open, etc
    #formerly: from_file, add_log_to_journal, add_file
    def load(self, log_name, add_tags=[]):
        """
        adds a log file to the journal object currently in memory

        this can be called multiple times with different filenames
        to merge those files/entries into the journal

        >>> from journal import *
        >>> j = Journal()
        >>> j.load("sample_log.txt")
        1
        >>> len(j.entries())
        1

        returns the number of entries found,
        0 if the file was not a journal/Log file
        or was already loaded
        """
        found_entries = 0
        if not str(log_name) in self.loaded:
            self.loaded.append(str(log_name))

            #TODO:
            #should also handle adding entry to self._sources??
            #or will that happen in update_many
            #would it be better for sources and loaded to be associated
            #on an update?
            #that way if entries are added outside of a load
            #the sources would still get updated.

            l = Log()
            l.from_file(str(log_name))
            entries = l.to_entries(add_tags)
            #print "%s entries loaded from file" % len(entries)
            #print "%s entries in self before merging in entries" % len(self)
            self.update_many(entries)
            #print "%s entries in self after merging in entries" % len(self)
            #if l.has_entries:
            found_entries = len(entries)
            l.close()

        return found_entries

    def reload(self):
        """
        create a new instance of a journal based on the paths we have
        previously loaded (self.loaded)

        load everything that was previously loaded
        then swap out the contents of the old journal for the new one
        """
        #use this to load new _entries, etc
        new_j = Journal()
        for item in self.loaded:
            new_j.load(item)
            #temp_j = load_journal(item)
            #new_j.from_entries(temp_j.entries())
            #del temp_j

        old_entries = self._entries
        old_tags = self._tags
        old_dates = self._dates
        old_sources = self._sources

        self._entries = new_j._entries
        self._tags = new_j._tags
        self._dates = new_j._dates
        self._sources = new_j._sources

        del old_entries
        del old_tags
        del old_dates
        del old_sources

    def _add(self, entry, position=None):
        """
        this is the base case for adding an entry

        blindly adds the entry object to the journal's list of entries
        no checks are performed
        will add multiple copies of the same entry to the journal

        use update to avoid duplicates
        """
        if position is None:
            #cannot assume insert here...
            #insert(0, entry) reverses the list order on log read
            self._entries.append(entry)
        else:
            self._entries.insert(position, entry)

        if hasattr(entry, "created") and entry.created:
            entry_time = entry.created.compact()
            self._dates.associate(entry, entry_time)
        else:
            self._dates.associate(entry, None)

        for t in entry.tags:
            self._tags.associate(entry, t)

        #TODO:
        #integrate source

    def update(self, entry, position=None, source=None):
        """
        checks if an entry already exists in the journal
        if other entries with that time stamp are similar,
        see if they can be merged easily (i.e. only tags differ)
        otherwise just add it as a separate entry

        no longer attempting to choose which one to keep here
        since a journal can hold multiple entries with the same timestamp
        can merge later as needed using a dedicated script for that purpose
        """
        if not hasattr(entry, "created") or not entry.created:
            if entry not in self._entries:
                self._add(entry, position)
                if self.debug:
                    print "Entry has no time associated, and no other entry found. added"
        else:
            #this makes entry_time available in the event the entry already
            #is in the journal:
            #print entry.created
            entry_time = entry.created.compact()

            if entry not in self._entries:
                if not self._dates.has_key(entry_time):
                    self._add(entry, position)
                    if self.debug:
                        print "No other entry found with time: %s. added" % entry_time
                elif self._dates.has_key(entry_time):
                    #it must have *something* in that time slot
                    #check for duplicates
                    if self.debug:
                        print "Other entries found with time: %s. checking all.." % entry_time
                    options = self._dates[entry_time]
                    found_match = False
                    for existing in options:
                        if existing.is_equal(entry, debug=self.debug):
                            #print "DUPE, but tags and data are same... skipping"
                            found_match = True
                            if self.debug:
                                print "Equal entry found. Skipping"
                        #only want to merge if we have data
                        #otherwise blank entries can end up grouped together
                        elif entry.data and (existing.data == entry.data):
                            #tags must differ... those are easy to merge:
                            print "from: %s, %s" % (existing.path, existing.created)
                            print "and: %s, %s" % (entry.path, entry.created)
                            print "only TAGS differ"
                            print "original: %s" % existing.tags
                            print "new: %s" % entry.tags
                            existing.tags.union(entry.tags)
                            print "merged: %s" % existing.tags
                            found_match = True
                        else:
                            #this one didn't match
                            #but we won't add the entry until we've checked them all
                            pass

                    if not found_match:
                        #2009.12.04 16:03:15
                        #this information doesn't help much anymore:
                        #print "MULTIPLE ENTRIES EXISTS AT: %s" % (entry_time)
                        #print "but none matched this one. Adding now"
                        self._add(entry, position)
                        if self.debug:
                            print "No equivalent entries found. adding"
            else:
                if self.debug:
                    print "Entry (%s) already exists in journal" % entry_time

    #aka create, new
    def make(self, data, tags=[], created=None, source='', position=0):
        """
        helper for making a new entry right in a journal object
        this way should not need to import moments.entry.Entry elsewhere
        """
        if not created:
            created = datetime.now()
        entry = Moment(data, tags, created, path=source)
        #print "Journal.make.position: %s" % position
        self.update(entry, position=position)
        return entry

    #AKA DELETE
    def remove(self, entry):
        """
        remove associations from self._dates and self._tags
        then remove the entry from the journal.
        """
        #text_time = str(entry.created)
        #text_time = e.created.strftime(time_format)
        self._tags.remove(entry)
        self._dates.remove(entry)
        #remove from the list of entries
        self._entries.remove(entry)

    #*2011.07.09 10:32:42
    #is this ever used?
    #seems dangerous to remove everything at a given timestamp
    #more likely to add as a separate one
    #or remove explicitly and then add/update/make
##     def replace(self, entry):
##         """
##         remove all entries from the journal with the same timestamp as entry
##         then add the new entry to the journal
##
##         i.e.
##         accepts a new entry
##         and uses it to find and then remove the original one(s)
##         add the new one to the journal
##         thereby replacing the original(s)
##         """
##         entry_time = entry.created.compact()
##         if self._dates.has_key(entry_time):
##             options = self._dates[entry_time]
##         else:
##             options = []
##         for existing in options:
##             self.remove(existing)
##         self._add(entry)

    #aka from_entries
    #aka add_entries
    #aka update_entries
    def update_many(self, entries, source=None):
        """
        loop over a list of entries to add/update each one to the journal
        """
        for e in entries:
            self.update(e, source=source)

    #aka remove_entries
    def remove_many(self, entries):
        """
        take a list of entry objects, remove each one
        """
        for e in entries:
            self.remove(e)

    #Following are all different ways to READ
    #they are also closely related to the hidden properties:
    #_tags, _dates, _entries

    #*2011.07.05 21:14:47
    #thinking that it makes sense to have two separate calls
    #could combine tag and tags (etc)
    #by returning the plural version (dict) when no tag specified
    #but the function name is unclear in that case
    def tag(self, tag_key=None):
        """
        lookup tag_key in self._tags

        should only return a list of entries associated with that tag
        not a dict with the tag name
        server can do that
        but server needs to be a little different
        """
        #print self._tags.keys()
        if tag_key and self._tags.has_key(tag_key):
            #print self._tags[tag_key]
##             moments = []
##             for m in self._tags[tag_key]:
##                 #instead of rendering a string:
##                 #moments.append(m.render())
##                 #supply a dictionary of the moment item
##                 moments.append(m.as_dict())
            #return { tag_key:self._tags[tag_key] }
            return self._tags[tag_key]
##         elif tag_key:
##             #must not have any content associated with this tag
##             return { tag_key:[] }
        else:
            #could also return self.tags()
            #return self.tags()
            #return { 'tags': self._tags.keys() }
            #return { tag_key:[] }
            return []

    def tags(self, tags=[]):
        """
        return a dictionary with:
        all tags as keys, and number of entries for each tag as values

        *2011.07.10 10:38:07
        also could use mindstream.entries_tagged to accept a list of tags
        and combine all of those entries into a single list and return that
        """
        if tags:
            #*2011.11.09 11:42:38
            #if there is only one tag
            #should we just call self.tag()???
            if not isinstance(tags, list):
                tags = [tags]
            found_entries = Journal()
            for t in tags:
                if self._tags.has_key(t):
                    #print len(self._tags[t])
                    found_entries.update_many(self._tags[t])
            found_entries = found_entries.sort("reverse-chronological")
            #return found_entries._entries
            return found_entries
        else:
            tdict = {}
            for tag in self._tags.keys():
                tdict[tag] = len(self._tags[tag])
            return tdict

    def date(self, date_key=None):
        """
        lookup date_key in self._dates
        date_key should be a compact stamp
        """
        if date_key:
            if isinstance(date_key, Timestamp):
                ts = date_key
            else:
                ts = Timestamp(compact=date_key)
            #print ts, type(ts)
            #print ts.accuracy
            if ts.accuracy and ts.accuracy != "second":
                rr = Timerange(ts)
                #get the timerange
                tr = rr.default()
                #print tr
                #print tr.start.datetime
                #print tr.end.datetime
                entries = self.range(tr.start, tr.end)
                #return {ts.compact():entries}
                return entries
            elif self._dates.has_key(ts.compact()):
                entries = self._dates[ts.compact()]
                #print "LEN ENTRIES: %s" % len(entries)
                #print entries
                #return { ts.compact():entries }
                return entries
            else:
                #return { ts.compact():[] }
                return []
        else:
            #could also return self.dates()
            #return self.dates()
            #return { date_key:[] }
            return []

    def dates(self):
        """
        return a dictionary with:
        all dates as keys, and number of entries for each date as values
        """
        ddict = {}
        for key in self._dates.keys():
            #print "KEY:", key
            #key might be blank here (i.e. no timestamp)
            if key:
                ts = Timestamp(compact=key)
                #print ts
                ddict[ts.compact()] = len(self._dates[key])
            else:
                ddict[key] = len(self._dates[key])
        return ddict

    #aka item???
    def entry(self, index=None):
        """
        return the item at index point in list
        is this already defined on a list object? should be consistent
        """
        if index is not None and len(self._entries) > index:
            return self._entries[index]
        else:
            return None

    def entries(self):
        """
        return a list of entries

        using a function call rather than an attribute
        to stay consistent between local and remote calls
        """
        return self._entries

    def related(self, key):
        """
        look for tags
        if no matching tags
        see if it is a date string (get range, find entries there)

        either way
        return tags that are related
        (maybe as a dictionary {'tag':number_of_items} ... same as self._tags)
        """
        #make sure we have it, otherwise nothing relates
        if not self._tags.has_key(key):
            return []

        entries = self._tags[key]
        related = []
        for e in entries:
            #todo:
            #could also generate a cloud
            #ranking most common related higher
            for t in e.tags:
                if t not in related:
                    related.append(t)
        return related

    def search(self, look_for, data=False, limit=0):
        """
        scan tags for tags matching (searching) look_for
        if data is True, look in entry.data too
        """
        tags = self._tags.keys()
        found = []
        # in this case, we'll return the whole entry
        if data:
            for e in self._entries:
                if re.search(look_for, e.data):
                    found.append(e)
                else:
                    for t in e.tags:
                        if re.search(look_for, t):
                            found.append(e)
        # in this case we'll only return matching tags
        else:
            results = []
            for t in tags:
                if re.search(look_for, t):
                    results.append(t)

##             #now look for the results that start with "look_for"
##             matches = []
##             for r in results:
##                 if re.match(look_for, r):
##                     matches.append(r)
##                     results.remove(r)

            # sort tags by the number of entries they have
            priority = []
            for tag in results:
                priority.append((len(self._tags[tag]), tag))
            priority.sort()
            priority.reverse()
            #print "Priority: %s" % priority
            for p in priority:
                found.append(p[1])

        if limit:
            found = found[:int(limit)]
        return found

    def sort(self, order='original'):
        """
        Sorts the items in our Journal's ._entries list
        returns a list of the rearranged order of the entries in the journal

        can specify order:
        'original' to keep the original order that the entries were
                   added to the journal
        'reverse'
        'chronological' or 'oldest to newest'
                   oldest entries first in the list
        'reverse-chronological' or 'newest to oldest'

        if not all entries are wanted, see self.range()
        """
        #print order
        if order == "original":
            return self._entries
        elif order == "reverse":
            self._entries.reverse()
            return self._entries
        else:
            entry_times = self._dates.keys()
            if order == "reverse-chronological" or order == 'newest to oldest':
                entry_times.sort()
                entry_times.reverse()
            elif order == "chronological" or order == 'oldest to newest':
                if self.debug:
                    print "oldest to newest"
                entry_times.sort()
                if self.debug:
                    print entry_times
            else:
                raise ValueError("Unknown sort option supplied: %s" % order)

            entries = []
            for et in entry_times:
                elist = self._dates[et]
                for entry in elist:
                    entries.append(entry)
            assert len(entries) == len(self._entries)
            del self._entries
            self._entries = entries
            return entries

    #aka limit, timerange, mindstream.time_range
    def range(self, start=None, end=None):
        """
        if no start *and* end specified
            return the time range for the entries in the currently
            loaded journal
        if only start
            return the entries in range for the accuracy of the start
            (e.g. 1 day)
        if start and end
            return all entries in the journal that fall in that range

        should accept a string, a datetime object, or a Timestamp object
        """
        if start is None and end is None:
            dates = self._dates.keys()
            dates.sort()
            start = dates[0]
            end = dates[-1]
            #might have entries with no timestamp first:
            if start is None:
                start = dates[1]
            print start, end
            return Timerange(start=start, end=end)
        else:
            start = Timestamp(start)
            if end:
                end = Timestamp(end)
            else:
                relative = Timerange(start)
                end = relative.end

            times = self._dates.keys()
            times.sort()
            matches = []
            for t in times:
                #not sure why we're using just time here
                #seems like we would want to use the date too?
                #pytime = Timestamp(t).time
                #sometimes t is None... those don't fit in a range.
                if t:
                    pytime = Timestamp(t).datetime
                    if (pytime >= start.datetime) and (pytime <= end.datetime):
                        matches.extend(self._dates[t])
            return matches

    def clear(self):
        """
        clear mind
        start fresh

        in practice it's probably easier to just create a new journal
        but reload might need this
        """
        del self._entries
        self._entries = []
        del self._tags
        self._tags = Association()
        del self._dates
        self._dates = Association()
        self.loaded = []
        self._sources = Association()
        #todo
        #way to see how much memory is consumed by current process?
        #should show before and after if so
        return True

    def associate_data(self):
        """
        add a new property to the Journal: datas
        at one point this was generated automatically,
        but that slowed things down
        this allows it to be generated when it is needed

        which so far is only when the node.directory object
        is looking for a default image
        """
        self._datas = Association()
        for entry in self._entries:
            self._datas.associate(entry, entry.data)

    def associate_files(self):
        """
        add a new property to the Journal: files
        similar to associate_data
        but checks each entry's data for path information

        if a path is found, just take the filename portion
        and associate the entry with that portion
        otherwise associate each line of the entry's data (as is)
        """
        self._files = Association()
        for entry in self._entries:
            lines = entry.data.splitlines()
            for line in lines:
                if re.search('/', line):
                    name = os.path.basename(line)
                    self._files.associate(entry, name)
                elif line.strip():
                    self._files.associate(entry, line.strip())
                else:
                    #must be a blank line
                    pass
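# The Journal above leans on an Association index that is not shown in this
# module. The class below is only a minimal sketch of the interface the code
# above assumes (associate, remove, has_key, keys, and item lookup) -- a
# two-way, many-to-many map between entries and keys -- not the real
# moments Association implementation.
class MinimalAssociation(object):
    def __init__(self):
        self._by_key = {}    # key -> list of items
        self._by_item = {}   # item -> list of keys it was filed under

    def associate(self, item, key):
        #file the item under key, and remember the key for later removal
        self._by_key.setdefault(key, []).append(item)
        self._by_item.setdefault(item, []).append(key)

    def remove(self, item):
        #drop the item from every key it was associated with
        for key in self._by_item.pop(item, []):
            self._by_key[key].remove(item)
            if not self._by_key[key]:
                del self._by_key[key]

    def has_key(self, key):
        return key in self._by_key

    def keys(self):
        return list(self._by_key.keys())

    def __getitem__(self, key):
        return self._by_key[key]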
def autofis_onecv(file_zip, file_train, file_test, parameters):
    """Run one cross-validation fold of AutoFIS: fuzzification, formulation,
    association, aggregation, decision and evaluation."""
    # General parameters
    t_norm = parameters[3]
    max_size_of_premise = parameters[5]
    association_method = parameters[11]
    aggregation_method = parameters[12]

    # Gathering parameters
    # Formulation parameters:
    par_area, par_over, par_pcd = toolfis.get_formulation_parameters(parameters)

    # 1. Reading & Fuzzification
    out1 = toolfis.lecture_fuz_one_cv(file_zip, file_train, file_test, parameters)
    ux_train, cbin_train = out1[0]
    ux_test, cbin_test = out1[1]
    (num_premises_by_attribute, premises_by_attribute,
     ref_attributes, premises_contain_negation) = out1[2]
    freq_classes = out1[3]

    report = []  # To save our results

    try:
        # 3. Formulation
        f2 = Formulation(ux_train, cbin_train, ref_attributes,
                         premises_by_attribute, num_premises_by_attribute,
                         premises_contain_negation)
        # Inputs given by user
        arbol = f2.gen_ARB(max_size_of_premise, t_norm, par_area, par_over, par_pcd)

        status = [0 if not i[0] else 1 for i in arbol]
        sum_status = sum(status)
        if sum_status != len(arbol):
            if sum_status == 0:
                raise ValueError("Error in Formulation Module. No premise survived. "
                                 "Sorry, you cannot continue to the next stage."
                                 "\nTry to change the configuration")
            else:
                # keep only the premise groups that survived
                arbol = [i for i in arbol if i[0]]

        number_classes = cbin_train.shape[1]

        report.append("\nFormulation:\n-----------------")
        report.append("Elements according to tree depth " + str(len(arbol)))
        for i in range(len(arbol)):
            report.append('Depth ' + str(i + 1) + ': ' + str(arbol[i][1].shape))
            # print 'Depth ' + str(i + 1) + ': ' + str(arbol[i][1].shape)

        # 4. Association: ex-Division
        f3 = Association(arbol, cbin_train)
        premises_ux_by_class = f3.division(association_method)

        status = [0 if not i[0] else 1 for i in premises_ux_by_class]
        if sum(status) != number_classes:
            raise ValueError("Error in Division Module. Some classes did not get premises. "
                             "Sorry, you cannot continue to the next stage."
                             "\nTry to change the configuration")

        # 5. Aggregation:
        f4 = Aggregation(premises_ux_by_class, cbin_train)
        output_aggregation = f4.aggregation(aggregation_method)
        premises_weights_names = output_aggregation[0]
        estimation_classes = output_aggregation[1]

        status = [0 if not i[0] else 1 for i in premises_weights_names]
        if sum(status) != number_classes:
            raise ValueError("Error in Aggregation Module. Some classes did not get premises. "
                             "Sorry, you cannot continue to the next stage."
                             "\nTry to change the configuration")

        final_premises_classes = []
        report.append("\n\nPremises:\n=========")
        for i in range(len(premises_weights_names)):
            report.append("Premises of Class " + str(i) + ": " + str(premises_weights_names[i][0]))
            final_premises_classes.append(premises_weights_names[i][0])
            report.append("weights_" + str(i) + ": " + str(premises_weights_names[i][1].T))

        # 6. Decision:
        f5 = Decisions(estimation_classes, freq_classes)
        train_bin_prediction = f5.dec_max_pert()

        # 7. Evaluation
        f6 = Evaluation(premises_weights_names, final_premises_classes, freq_classes)
        metrics_train = f6.eval_train(cbin_train, train_bin_prediction)
        metrics_test = f6.eval_test(cbin_test, ux_test, t_norm)

        report.append("\nEvaluation Training:\n---------------------------")
        report.append("Accuracy on train dataset: " + str(metrics_train[0]))
        report.append("AUC in train dataset: " + str(metrics_train[1]))
        report.append("Recall: " + str(metrics_train[3]))
        report.append('Confusion matrix:\n' + str(metrics_train[2]))

        report.append("\nEvaluation Testing:\n---------------------------")
        report.append("Accuracy on test dataset: " + str(metrics_test[0]))
        report.append("AUC in test dataset: " + str(metrics_test[1]))
        report.append("Recall: " + str(metrics_test[3]))
        report.append("Confusion matrix:\n" + str(metrics_test[2]))

        # Metrics to eval: accuracy_test, auc_test,
        # [num_regras, total_rule_length, tamano_medio_das_regras]
        metricas = [1, [metrics_train[0], metrics_test[0], metrics_train[1],
                        metrics_test[1], metrics_test[4]]]

    except ValueError as e:
        print e
        report = e  # .append("\n" + str(e))
        metricas = [0, "The process did not finish; it stopped at some stage"]

    return report, metricas
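# A small helper sketch (not part of AutoFIS itself) showing how the
# `metricas` values returned above -- [1, [metric, ...]] on success,
# [0, message] on failure -- could be aggregated over several CV folds.
# The `folds`, `zipf` and `params` names in the usage comment are hypothetical.
def summarize_folds(fold_results):
    """Average the metric vectors of the folds that finished successfully."""
    ok = [metrics for flag, metrics in fold_results if flag == 1]
    if not ok:
        return None
    n = float(len(ok))
    return [sum(values) / n for values in zip(*ok)]

# usage (hypothetical fold list):
# summarize_folds([autofis_onecv(zipf, tr, te, params)[1] for tr, te in folds])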
def process_nom_features(self):
    """
    Process a batch of nominal features: bin the numeric columns, build one
    transaction of (feature_name, value) pairs per record (missing values
    are skipped), then mine frequent itemsets and association rules from
    them with Apriori, writing the results to freq_set.json and rules.json.
    """
    out_path = self.result_path
    association = Association()
    filename = self.dataset_path
    columns = []

    dataload = pd.read_csv(filename)
    dataload['price'] = pd.cut(
        dataload['price'],
        [0, 8, 12, 16, 20, 24, 28, 32, 36, 60, 100, 3300])
    dataload['points'] = pd.cut(dataload['points'], 20)
    dataload = dataload[[
        'country', 'points', 'price', 'province', 'region_1', 'variety'
    ]]

    for feature_name in dataload.keys():
        print("Dealing with feature: {}".format(feature_name))
        columns.append(list(dataload[feature_name]))

    rows = list(zip(*columns))
    dataset = []
    feature_names = list(dataload.keys())
    for data_line in rows:
        data_set = []
        for i, value in enumerate(data_line):
            if value == value:  # skip missing values (NaN != NaN)
                data_set.append((feature_names[i], value))
        if data_set:
            dataset.append(data_set)

    freq_set, support_data = association.apriori(dataset)
    support_data_out = sorted(support_data.items(), key=lambda d: d[1], reverse=True)
    #print(support_data_out)
    big_rules_list = association.generate_rules(freq_set, support_data)
    big_rules_list = sorted(big_rules_list, key=lambda x: x[3], reverse=True)
    big_rules_list = sorted(big_rules_list, key=lambda x: x[4], reverse=True)
    #print(big_rules_list)

    freq_set_file = open('freq_set.json', 'w', encoding='utf-8')
    for (key, value) in support_data_out:
        result_dict = {'set': list(key), 'sup': value}
        json_str = json.dumps(result_dict, cls=MyEncoder)
        freq_set_file.write(json_str + '\n')
    freq_set_file.close()

    rules_file = open('rules.json', 'w', encoding='utf-8')
    for result in big_rules_list:
        X_set, Y_set, sup, conf, lift, cosine = result
        result_dict = {
            'X_set': list(X_set),
            'Y_set': list(Y_set),
            'sup': sup,
            'conf': conf,
            'lift': lift,
            'cosine': cosine
        }
        json_str = json.dumps(result_dict, cls=MyEncoder, ensure_ascii=False)
        rules_file.write(json_str + '\n')
    rules_file.close()
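# A toy illustration (small made-up data, not the wine dataset above) of the
# transaction format built by process_nom_features: one (feature_name, value)
# pair per non-missing cell, one transaction per row.
import pandas as pd

toy = pd.DataFrame({
    'country': ['US', float('nan'), 'Italy'],
    'variety': ['Pinot Noir', 'Riesling', float('nan')],
})
dataset = []
for _, row in toy.iterrows():
    items = [(col, val) for col, val in row.items() if val == val]  # NaN != NaN
    if items:
        dataset.append(items)
# dataset is now:
# [[('country', 'US'), ('variety', 'Pinot Noir')],
#  [('variety', 'Riesling')],
#  [('country', 'Italy')]]
# which is the shape that association.apriori(dataset) consumes above.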